Updates
This commit is contained in:
@@ -0,0 +1,10 @@
|
||||
# Put the top of the source tree on the include path, so sources can name
# project headers by their path from there (e.g. "dft/dft.h").
AM_CPPFLAGS = -I $(top_srcdir)

# Subdirectories that make recurses into from this directory.
SUBDIRS = scalar simd

# libdft.la is a libtool library that is built here but not installed.
noinst_LTLIBRARIES = libdft.la

# Files that make up libdft.la, one per line: the C sources first, then the
# three headers.
libdft_la_SOURCES = \
	bluestein.c \
	buffered.c \
	conf.c \
	ct.c \
	dftw-direct.c \
	dftw-directsq.c \
	dftw-generic.c \
	dftw-genericbuf.c \
	direct.c \
	generic.c \
	indirect.c \
	indirect-transpose.c \
	kdft-dif.c \
	kdft-difsq.c \
	kdft-dit.c \
	kdft.c \
	nop.c \
	plan.c \
	problem.c \
	rader.c \
	rank-geq2.c \
	solve.c \
	vrank-geq1.c \
	zero.c \
	codelet-dft.h \
	ct.h \
	dft.h
|
||||
@@ -0,0 +1,844 @@
|
||||
# Makefile.in generated by automake 1.16.3 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
|
||||
VPATH = @srcdir@
|
||||
# Shell compound command, used as the condition of an `if' inside
# am__make_running_with_option.  It fails when $(MAKELEVEL) expands to
# nothing; otherwise it succeeds when $(MAKE_HOST) is non-empty, or when
# both $(MAKE_VERSION) and $(CURDIR) are non-empty, and fails in every
# other case.  The variables are expanded by make before the shell runs.
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
subdir = dft
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
|
||||
$(top_srcdir)/m4/acx_pthread.m4 \
|
||||
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
|
||||
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
|
||||
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_version.m4 \
|
||||
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
|
||||
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
|
||||
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
LTLIBRARIES = $(noinst_LTLIBRARIES)
|
||||
libdft_la_LIBADD =
|
||||
am_libdft_la_OBJECTS = bluestein.lo buffered.lo conf.lo ct.lo \
|
||||
dftw-direct.lo dftw-directsq.lo dftw-generic.lo \
|
||||
dftw-genericbuf.lo direct.lo generic.lo indirect.lo \
|
||||
indirect-transpose.lo kdft-dif.lo kdft-difsq.lo kdft-dit.lo \
|
||||
kdft.lo nop.lo plan.lo problem.lo rader.lo rank-geq2.lo \
|
||||
solve.lo vrank-geq1.lo zero.lo
|
||||
libdft_la_OBJECTS = $(am_libdft_la_OBJECTS)
|
||||
AM_V_lt = $(am__v_lt_@AM_V@)
|
||||
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
|
||||
am__v_lt_0 = --silent
|
||||
am__v_lt_1 =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
|
||||
depcomp = $(SHELL) $(top_srcdir)/depcomp
|
||||
am__maybe_remake_depfiles = depfiles
|
||||
am__depfiles_remade = ./$(DEPDIR)/bluestein.Plo \
|
||||
./$(DEPDIR)/buffered.Plo ./$(DEPDIR)/conf.Plo \
|
||||
./$(DEPDIR)/ct.Plo ./$(DEPDIR)/dftw-direct.Plo \
|
||||
./$(DEPDIR)/dftw-directsq.Plo ./$(DEPDIR)/dftw-generic.Plo \
|
||||
./$(DEPDIR)/dftw-genericbuf.Plo ./$(DEPDIR)/direct.Plo \
|
||||
./$(DEPDIR)/generic.Plo ./$(DEPDIR)/indirect-transpose.Plo \
|
||||
./$(DEPDIR)/indirect.Plo ./$(DEPDIR)/kdft-dif.Plo \
|
||||
./$(DEPDIR)/kdft-difsq.Plo ./$(DEPDIR)/kdft-dit.Plo \
|
||||
./$(DEPDIR)/kdft.Plo ./$(DEPDIR)/nop.Plo ./$(DEPDIR)/plan.Plo \
|
||||
./$(DEPDIR)/problem.Plo ./$(DEPDIR)/rader.Plo \
|
||||
./$(DEPDIR)/rank-geq2.Plo ./$(DEPDIR)/solve.Plo \
|
||||
./$(DEPDIR)/vrank-geq1.Plo ./$(DEPDIR)/zero.Plo
|
||||
am__mv = mv -f
|
||||
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
|
||||
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
|
||||
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
|
||||
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
|
||||
$(AM_CFLAGS) $(CFLAGS)
|
||||
AM_V_CC = $(am__v_CC_@AM_V@)
|
||||
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
|
||||
am__v_CC_0 = @echo " CC " $@;
|
||||
am__v_CC_1 =
|
||||
CCLD = $(CC)
|
||||
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
$(AM_LDFLAGS) $(LDFLAGS) -o $@
|
||||
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
|
||||
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
|
||||
am__v_CCLD_0 = @echo " CCLD " $@;
|
||||
am__v_CCLD_1 =
|
||||
SOURCES = $(libdft_la_SOURCES)
|
||||
DIST_SOURCES = $(libdft_la_SOURCES)
|
||||
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
|
||||
ctags-recursive dvi-recursive html-recursive info-recursive \
|
||||
install-data-recursive install-dvi-recursive \
|
||||
install-exec-recursive install-html-recursive \
|
||||
install-info-recursive install-pdf-recursive \
|
||||
install-ps-recursive install-recursive installcheck-recursive \
|
||||
installdirs-recursive pdf-recursive ps-recursive \
|
||||
tags-recursive uninstall-recursive
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
|
||||
distclean-recursive maintainer-clean-recursive
|
||||
am__recursive_targets = \
|
||||
$(RECURSIVE_TARGETS) \
|
||||
$(RECURSIVE_CLEAN_TARGETS) \
|
||||
$(am__extra_recursive_targets)
|
||||
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
|
||||
distdir distdir-am
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
# Read a list of newline-separated strings from the standard input,
|
||||
# and print each of them once, without duplicates. Input order is
|
||||
# *not* preserved.
|
||||
am__uniquify_input = $(AWK) '\
|
||||
BEGIN { nonempty = 0; } \
|
||||
{ items[$$0] = 1; nonempty = 1; } \
|
||||
END { if (nonempty) { for (i in items) print i; }; } \
|
||||
'
|
||||
# Make sure the list of sources is unique. This is necessary because,
|
||||
# e.g., the same source file might be shared among _SOURCES variables
|
||||
# for different programs/libraries.
|
||||
am__define_uniq_tagged_files = \
|
||||
list='$(am__tagged_files)'; \
|
||||
unique=`for i in $$list; do \
|
||||
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
|
||||
done | $(am__uniquify_input)`
|
||||
ETAGS = etags
|
||||
CTAGS = ctags
|
||||
DIST_SUBDIRS = $(SUBDIRS)
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
am__relativize = \
|
||||
dir0=`pwd`; \
|
||||
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
|
||||
sed_rest='s,^[^/]*/*,,'; \
|
||||
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
|
||||
sed_butlast='s,/*[^/]*$$,,'; \
|
||||
while test -n "$$dir1"; do \
|
||||
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
|
||||
if test "$$first" != "."; then \
|
||||
if test "$$first" = ".."; then \
|
||||
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
|
||||
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
|
||||
else \
|
||||
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
|
||||
if test "$$first2" = "$$first"; then \
|
||||
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
|
||||
else \
|
||||
dir2="../$$dir2"; \
|
||||
fi; \
|
||||
dir0="$$dir0"/"$$first"; \
|
||||
fi; \
|
||||
fi; \
|
||||
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
|
||||
done; \
|
||||
reldir="$$dir2"
|
||||
ACLOCAL = @ACLOCAL@
|
||||
ALLOCA = @ALLOCA@
|
||||
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AS = @AS@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AVX2_CFLAGS = @AVX2_CFLAGS@
|
||||
AVX512_CFLAGS = @AVX512_CFLAGS@
|
||||
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
|
||||
AVX_CFLAGS = @AVX_CFLAGS@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CHECK_PL_OPTS = @CHECK_PL_OPTS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
|
||||
C_MPI_FINT = @C_MPI_FINT@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
F77 = @F77@
|
||||
FFLAGS = @FFLAGS@
|
||||
FGREP = @FGREP@
|
||||
FLIBS = @FLIBS@
|
||||
GREP = @GREP@
|
||||
INDENT = @INDENT@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
KCVI_CFLAGS = @KCVI_CFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBQUADMATH = @LIBQUADMATH@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
|
||||
MAINT = @MAINT@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
MPICC = @MPICC@
|
||||
MPILIBS = @MPILIBS@
|
||||
MPIRUN = @MPIRUN@
|
||||
NEON_CFLAGS = @NEON_CFLAGS@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OCAMLBUILD = @OCAMLBUILD@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
POW_LIB = @POW_LIB@
|
||||
PRECISION = @PRECISION@
|
||||
PREC_SUFFIX = @PREC_SUFFIX@
|
||||
PTHREAD_CC = @PTHREAD_CC@
|
||||
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
|
||||
PTHREAD_LIBS = @PTHREAD_LIBS@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
|
||||
SHELL = @SHELL@
|
||||
SSE2_CFLAGS = @SSE2_CFLAGS@
|
||||
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
|
||||
STRIP = @STRIP@
|
||||
THREADLIBS = @THREADLIBS@
|
||||
VERSION = @VERSION@
|
||||
VSX_CFLAGS = @VSX_CFLAGS@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
ac_ct_F77 = @ac_ct_F77@
|
||||
acx_pthread_config = @acx_pthread_config@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
runstatedir = @runstatedir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
SUBDIRS = scalar simd
|
||||
noinst_LTLIBRARIES = libdft.la
|
||||
libdft_la_SOURCES = bluestein.c buffered.c conf.c ct.c dftw-direct.c \
|
||||
dftw-directsq.c dftw-generic.c dftw-genericbuf.c direct.c generic.c \
|
||||
indirect.c indirect-transpose.c kdft-dif.c kdft-difsq.c kdft-dit.c \
|
||||
kdft.c nop.c plan.c problem.c rader.c rank-geq2.c solve.c vrank-geq1.c \
|
||||
zero.c codelet-dft.h ct.h dft.h
|
||||
|
||||
all: all-recursive
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .lo .o .obj
|
||||
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu dft/Makefile
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
|
||||
esac;
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
# Remove the libraries named in noinst_LTLIBRARIES, then remove a
# `so_locations' file from each directory that holds one of them.  The sed
# script turns every library name into its directory ("." for a name
# without a slash) followed by "/so_locations"; sort -u drops duplicates.
clean-noinstLTLIBRARIES:
|
||||
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
|
||||
@list='$(noinst_LTLIBRARIES)'; \
|
||||
locs=`for p in $$list; do echo $$p; done | \
|
||||
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
|
||||
sort -u`; \
|
||||
test -z "$$locs" || { \
|
||||
echo rm -f $${locs}; \
|
||||
rm -f $${locs}; \
|
||||
}
|
||||
|
||||
# Link the objects of libdft.la, plus $(libdft_la_LIBADD) and $(LIBS), using
# $(LINK), which runs libtool in --mode=link and writes its output to $@.
libdft.la: $(libdft_la_OBJECTS) $(libdft_la_DEPENDENCIES) $(EXTRA_libdft_la_DEPENDENCIES)
|
||||
$(AM_V_CCLD)$(LINK) $(libdft_la_OBJECTS) $(libdft_la_LIBADD) $(LIBS)
|
||||
|
||||
mostlyclean-compile:
|
||||
-rm -f *.$(OBJEXT)
|
||||
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bluestein.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/conf.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-direct.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-directsq.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-generic.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-genericbuf.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/generic.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect-transpose.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-dif.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-difsq.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-dit.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rader.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/zero.Plo@am__quote@ # am--include-marker
|
||||
|
||||
# Create a placeholder for each dependency file: make its directory, write
# the single line "# dummy" to a temporary $@-t, then move that over $@.
$(am__depfiles_remade):
|
||||
@$(MKDIR_P) $(@D)
|
||||
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
|
||||
|
||||
am--depfiles: $(am__depfiles_remade)
|
||||
|
||||
.c.o:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
|
||||
|
||||
.c.obj:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
|
||||
.c.lo:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
|
||||
|
||||
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
|
||||
# This directory's subdirectories are mostly independent; you can cd
|
||||
# into them and run 'make' without going through this Makefile.
|
||||
# To change the values of 'make' variables: instead of editing Makefiles,
|
||||
# (1) if the variable is set in 'config.status', edit 'config.status'
|
||||
# (which will cause the Makefiles to be regenerated when you run 'make');
|
||||
# (2) otherwise, pass the desired values on the 'make' command line.
|
||||
# Recipe shared by every *-recursive target.  It derives the plain target
# name by stripping "-recursive" from $@, then runs that target with $(MAKE)
# in each listed subdirectory: $(DIST_SUBDIRS) for distclean-* and
# maintainer-clean-*, $(SUBDIRS) for everything else.  A "." entry stands
# for this directory and is built as <target>-am; if "." is not listed,
# <target>-am is run here after the loop.  When $(am__make_keepgoing)
# succeeds, a failing subdirectory only sets fail=yes and the loop goes on;
# otherwise the first failure exits with status 1.  The recipe's final
# status is success only if fail is still empty.
$(am__recursive_targets):
|
||||
@fail=; \
|
||||
if $(am__make_keepgoing); then \
|
||||
failcom='fail=yes'; \
|
||||
else \
|
||||
failcom='exit 1'; \
|
||||
fi; \
|
||||
dot_seen=no; \
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
case "$@" in \
|
||||
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
|
||||
*) list='$(SUBDIRS)' ;; \
|
||||
esac; \
|
||||
for subdir in $$list; do \
|
||||
echo "Making $$target in $$subdir"; \
|
||||
if test "$$subdir" = "."; then \
|
||||
dot_seen=yes; \
|
||||
local_target="$$target-am"; \
|
||||
else \
|
||||
local_target="$$target"; \
|
||||
fi; \
|
||||
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|
||||
|| eval $$failcom; \
|
||||
done; \
|
||||
if test "$$dot_seen" = "no"; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
|
||||
fi; test -z "$$fail"
|
||||
|
||||
ID: $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); mkid -fID $$unique
|
||||
tags: tags-recursive
|
||||
TAGS: tags
|
||||
|
||||
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
set x; \
|
||||
here=`pwd`; \
|
||||
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
|
||||
include_option=--etags-include; \
|
||||
empty_fix=.; \
|
||||
else \
|
||||
include_option=--include; \
|
||||
empty_fix=; \
|
||||
fi; \
|
||||
list='$(SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
test ! -f $$subdir/TAGS || \
|
||||
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
|
||||
fi; \
|
||||
done; \
|
||||
$(am__define_uniq_tagged_files); \
|
||||
shift; \
|
||||
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
|
||||
test -n "$$unique" || unique=$$empty_fix; \
|
||||
if test $$# -gt 0; then \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
"$$@" $$unique; \
|
||||
else \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
$$unique; \
|
||||
fi; \
|
||||
fi
|
||||
ctags: ctags-recursive
|
||||
|
||||
CTAGS: ctags
|
||||
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); \
|
||||
test -z "$(CTAGS_ARGS)$$unique" \
|
||||
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
|
||||
$$unique
|
||||
|
||||
GTAGS:
|
||||
here=`$(am__cd) $(top_builddir) && pwd` \
|
||||
&& $(am__cd) $(top_srcdir) \
|
||||
&& gtags -i $(GTAGS_ARGS) "$$here"
|
||||
cscopelist: cscopelist-recursive
|
||||
|
||||
cscopelist-am: $(am__tagged_files)
|
||||
list='$(am__tagged_files)'; \
|
||||
case "$(srcdir)" in \
|
||||
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
|
||||
*) sdir=$(subdir)/$(srcdir) ;; \
|
||||
esac; \
|
||||
for i in $$list; do \
|
||||
if test -f "$$i"; then \
|
||||
echo "$(subdir)/$$i"; \
|
||||
else \
|
||||
echo "$$sdir/$$i"; \
|
||||
fi; \
|
||||
done >> $(top_builddir)/cscope.files
|
||||
|
||||
distclean-tags:
|
||||
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
|
||||
|
||||
distdir: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) distdir-am
|
||||
|
||||
distdir-am: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
$(am__make_dryrun) \
|
||||
|| test -d "$(distdir)/$$subdir" \
|
||||
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|
||||
|| exit 1; \
|
||||
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
|
||||
$(am__relativize); \
|
||||
new_distdir=$$reldir; \
|
||||
dir1=$$subdir; dir2="$(top_distdir)"; \
|
||||
$(am__relativize); \
|
||||
new_top_distdir=$$reldir; \
|
||||
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
|
||||
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
|
||||
($(am__cd) $$subdir && \
|
||||
$(MAKE) $(AM_MAKEFLAGS) \
|
||||
top_distdir="$$new_top_distdir" \
|
||||
distdir="$$new_distdir" \
|
||||
am__remove_distdir=: \
|
||||
am__skip_length_check=: \
|
||||
am__skip_mode_fix=: \
|
||||
distdir) \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: check-recursive
|
||||
all-am: Makefile $(LTLIBRARIES)
|
||||
installdirs: installdirs-recursive
|
||||
installdirs-am:
|
||||
install: install-recursive
|
||||
install-exec: install-exec-recursive
|
||||
install-data: install-data-recursive
|
||||
uninstall: uninstall-recursive
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-recursive
|
||||
# Re-run `install' with INSTALL_PROGRAM and install_sh_PROGRAM overridden to
# $(INSTALL_STRIP_PROGRAM) and with INSTALL_STRIP_FLAG=-s.  When $(STRIP) is
# non-empty it is additionally handed to the sub-make as
# INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'.
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
clean: clean-recursive
|
||||
|
||||
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
|
||||
mostlyclean-am
|
||||
|
||||
distclean: distclean-recursive
|
||||
-rm -f ./$(DEPDIR)/bluestein.Plo
|
||||
-rm -f ./$(DEPDIR)/buffered.Plo
|
||||
-rm -f ./$(DEPDIR)/conf.Plo
|
||||
-rm -f ./$(DEPDIR)/ct.Plo
|
||||
-rm -f ./$(DEPDIR)/dftw-direct.Plo
|
||||
-rm -f ./$(DEPDIR)/dftw-directsq.Plo
|
||||
-rm -f ./$(DEPDIR)/dftw-generic.Plo
|
||||
-rm -f ./$(DEPDIR)/dftw-genericbuf.Plo
|
||||
-rm -f ./$(DEPDIR)/direct.Plo
|
||||
-rm -f ./$(DEPDIR)/generic.Plo
|
||||
-rm -f ./$(DEPDIR)/indirect-transpose.Plo
|
||||
-rm -f ./$(DEPDIR)/indirect.Plo
|
||||
-rm -f ./$(DEPDIR)/kdft-dif.Plo
|
||||
-rm -f ./$(DEPDIR)/kdft-difsq.Plo
|
||||
-rm -f ./$(DEPDIR)/kdft-dit.Plo
|
||||
-rm -f ./$(DEPDIR)/kdft.Plo
|
||||
-rm -f ./$(DEPDIR)/nop.Plo
|
||||
-rm -f ./$(DEPDIR)/plan.Plo
|
||||
-rm -f ./$(DEPDIR)/problem.Plo
|
||||
-rm -f ./$(DEPDIR)/rader.Plo
|
||||
-rm -f ./$(DEPDIR)/rank-geq2.Plo
|
||||
-rm -f ./$(DEPDIR)/solve.Plo
|
||||
-rm -f ./$(DEPDIR)/vrank-geq1.Plo
|
||||
-rm -f ./$(DEPDIR)/zero.Plo
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-compile distclean-generic \
|
||||
distclean-tags
|
||||
|
||||
dvi: dvi-recursive
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-recursive
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-recursive
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am:
|
||||
|
||||
install-dvi: install-dvi-recursive
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am:
|
||||
|
||||
install-html: install-html-recursive
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-recursive
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-recursive
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-recursive
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-recursive
|
||||
-rm -f ./$(DEPDIR)/bluestein.Plo
|
||||
-rm -f ./$(DEPDIR)/buffered.Plo
|
||||
-rm -f ./$(DEPDIR)/conf.Plo
|
||||
-rm -f ./$(DEPDIR)/ct.Plo
|
||||
-rm -f ./$(DEPDIR)/dftw-direct.Plo
|
||||
-rm -f ./$(DEPDIR)/dftw-directsq.Plo
|
||||
-rm -f ./$(DEPDIR)/dftw-generic.Plo
|
||||
-rm -f ./$(DEPDIR)/dftw-genericbuf.Plo
|
||||
-rm -f ./$(DEPDIR)/direct.Plo
|
||||
-rm -f ./$(DEPDIR)/generic.Plo
|
||||
-rm -f ./$(DEPDIR)/indirect-transpose.Plo
|
||||
-rm -f ./$(DEPDIR)/indirect.Plo
|
||||
-rm -f ./$(DEPDIR)/kdft-dif.Plo
|
||||
-rm -f ./$(DEPDIR)/kdft-difsq.Plo
|
||||
-rm -f ./$(DEPDIR)/kdft-dit.Plo
|
||||
-rm -f ./$(DEPDIR)/kdft.Plo
|
||||
-rm -f ./$(DEPDIR)/nop.Plo
|
||||
-rm -f ./$(DEPDIR)/plan.Plo
|
||||
-rm -f ./$(DEPDIR)/problem.Plo
|
||||
-rm -f ./$(DEPDIR)/rader.Plo
|
||||
-rm -f ./$(DEPDIR)/rank-geq2.Plo
|
||||
-rm -f ./$(DEPDIR)/solve.Plo
|
||||
-rm -f ./$(DEPDIR)/vrank-geq1.Plo
|
||||
-rm -f ./$(DEPDIR)/zero.Plo
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic
|
||||
|
||||
mostlyclean: mostlyclean-recursive
|
||||
|
||||
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool
|
||||
|
||||
pdf: pdf-recursive
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-recursive
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am:
|
||||
|
||||
.MAKE: $(am__recursive_targets) install-am install-strip
|
||||
|
||||
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
|
||||
am--depfiles check check-am clean clean-generic clean-libtool \
|
||||
clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
|
||||
distclean-compile distclean-generic distclean-libtool \
|
||||
distclean-tags distdir dvi dvi-am html html-am info info-am \
|
||||
install install-am install-data install-data-am install-dvi \
|
||||
install-dvi-am install-exec install-exec-am install-html \
|
||||
install-html-am install-info install-info-am install-man \
|
||||
install-pdf install-pdf-am install-ps install-ps-am \
|
||||
install-strip installcheck installcheck-am installdirs \
|
||||
installdirs-am maintainer-clean maintainer-clean-generic \
|
||||
mostlyclean mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
|
||||
uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
@@ -0,0 +1,250 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
INT n; /* problem size */
|
||||
INT nb; /* size of convolution */
|
||||
R *w; /* lambda k . exp(2*pi*i*k^2/(2*n)) */
|
||||
R *W; /* DFT(w) */
|
||||
plan *cldf;
|
||||
INT is, os;
|
||||
} P;
|
||||
|
||||
/*
 * Fill w[0 .. 2n-1] with n interleaved (real, imaginary) pairs.
 * Entry k is produced by the cexp() method of a trig generator
 * created for period 2n, evaluated at index k^2 reduced by
 * multiples of 2n.
 */
static void bluestein_sequence(enum wakefulness wakefulness, INT n, R *w)
{
     const INT period = 2 * n;
     triggen *gen = X(mktriggen)(wakefulness, period);
     INT idx;
     INT sq = 0;   /* running value of idx^2, kept <= period */

     for (idx = 0; idx < n; ++idx) {
          gen->cexp(gen, sq, w + 2 * idx);

          /* careful with overflow: advance to (idx+1)^2 incrementally
             and reduce right away instead of squaring idx */
          sq += 2 * idx + 1;
          while (sq > period)
               sq -= period;
     }

     X(triggen_destroy)(gen);
}
|
||||
|
||||
/*
 * Allocate and compute the two tables used by apply():
 *   p->w  the bluestein sequence of length n, and
 *   p->W  the child transform of that sequence, scaled by 1/nb and
 *         laid out symmetrically in a zero-padded array of length nb.
 * The child plan p->cldf must already be awake when this is called.
 */
static void mktwiddle(enum wakefulness wakefulness, P *p)
{
     const INT n = p->n;
     const INT nb = p->nb;
     const E scale = (E)nb;
     R *seq = (R *) MALLOC(2 * n * sizeof(R), TWIDDLES);
     R *spec = (R *) MALLOC(2 * nb * sizeof(R), TWIDDLES);
     plan_dft *cldf = (plan_dft *)p->cldf;
     INT k;

     p->w = seq;
     p->W = spec;

     bluestein_sequence(wakefulness, n, seq);

     /* start from an all-zero array of nb complex values */
     for (k = 0; k < 2 * nb; ++k)
          spec[k] = K(0.0);

     /* entry 0 has no mirror image */
     spec[0] = seq[0] / scale;
     spec[1] = seq[1] / scale;

     /* entries 1..n-1 are stored at k and mirrored at nb-k */
     for (k = 1; k < n; ++k) {
          const R re = seq[2 * k] / scale;
          const R im = seq[2 * k + 1] / scale;

          spec[2 * (nb - k)] = re;
          spec[2 * k] = re;
          spec[2 * (nb - k) + 1] = im;
          spec[2 * k + 1] = im;
     }

     /* cldf must be awake */
     cldf->apply(p->cldf, spec, spec + 1, spec, spec + 1);
}
|
||||
|
||||
/*
 * Execute the plan.  The n strided inputs (ri, ii) are multiplied by
 * the conjugate bluestein sequence, zero-padded to nb, convolved with
 * the precomputed table W by means of two applications of the child
 * plan, and multiplied by the conjugate sequence once more while
 * being stored to the strided outputs (ro, io).
 *
 * A scratch buffer of nb complex values is allocated on entry and
 * released before returning.
 */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const INT n = ego->n, nb = ego->nb;
     const INT is = ego->is, os = ego->os;
     const R *w = ego->w;
     const R *W = ego->W;
     plan_dft *cldf = (plan_dft *) ego->cldf;
     R *b = (R *) MALLOC(2 * nb * sizeof(R), BUFFERS);
     INT i;

     /* multiply input by conjugate bluestein sequence */
     for (i = 0; i < n; ++i) {
          E xr = ri[i * is], xi = ii[i * is];
          E wr = w[2 * i], wi = w[2 * i + 1];

          b[2 * i] = xr * wr + xi * wi;
          b[2 * i + 1] = xi * wr - xr * wi;
     }

     /* zero-pad the remainder of the buffer */
     for (i = n; i < nb; ++i) {
          b[2 * i + 1] = K(0.0);
          b[2 * i] = K(0.0);
     }

     /* convolution: FFT */
     cldf->apply(ego->cldf, b, b + 1, b, b + 1);

     /* convolution: pointwise multiplication; note that the product
        is stored with its real and imaginary parts exchanged */
     for (i = 0; i < nb; ++i) {
          E xr = b[2 * i], xi = b[2 * i + 1];
          E wr = W[2 * i], wi = W[2 * i + 1];

          b[2 * i] = xi * wr + xr * wi;
          b[2 * i + 1] = xr * wr - xi * wi;
     }

     /* convolution: IFFT by FFT with real/imag input/output swapped */
     cldf->apply(ego->cldf, b, b + 1, b, b + 1);

     /* multiply output by conjugate bluestein sequence; the parts are
        still exchanged in the buffer, hence xi is read first */
     for (i = 0; i < n; ++i) {
          E xi = b[2 * i], xr = b[2 * i + 1];
          E wr = w[2 * i], wi = w[2 * i + 1];

          ro[i * os] = xr * wr + xi * wi;
          io[i * os] = xi * wr - xr * wi;
     }

     X(ifree)(b);
}
|
||||
|
||||
/*
 * Change the wakefulness of the plan.  The child plan is switched
 * first, because mktwiddle() applies it.  Going to SLEEPY releases
 * both tables; any other state (re)builds them.
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cldf, wakefulness);

     if (wakefulness == SLEEPY) {
          X(ifree0)(self->w);
          self->w = 0;
          X(ifree0)(self->W);
          self->W = 0;
     } else {
          /* the tables must not exist yet */
          A(!self->w);
          mktwiddle(wakefulness, self);
     }
}
|
||||
|
||||
static int applicable(const solver *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
UNUSED(ego);
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk == 0
|
||||
/* FIXME: allow other sizes */
|
||||
&& X(is_prime)(p->sz->dims[0].n)
|
||||
|
||||
/* FIXME: avoid infinite recursion of bluestein with itself.
|
||||
This works because all factors in child problems are 2, 3, 5 */
|
||||
&& p->sz->dims[0].n > 16
|
||||
|
||||
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > BLUESTEIN_MAX_SLOW)
|
||||
);
|
||||
}
|
||||
|
||||
/* Destroy the child plan owned by this plan.  The tables w and W are
   not touched here; they are released in awake() on SLEEPY. */
static void destroy(plan *ego_)
{
     X(plan_destroy_internal)(((P *) ego_)->cldf);
}
|
||||
|
||||
/* Print the plan as (dft-bluestein-<n>/<nb> <child plan>). */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *)ego_;
     const INT n = self->n;
     const INT nb = self->nb;

     p->print(p, "(dft-bluestein-%D/%D%(%p%))", n, nb, self->cldf);
}
|
||||
|
||||
/* Return the smallest size >= minsz that is accepted by
   X(factors_into_small_primes). */
static INT choose_transform_size(INT minsz)
{
     INT sz;

     for (sz = minsz; !X(factors_into_small_primes)(sz); ++sz)
          ;

     return sz;
}
|
||||
|
||||
/*
 * Create a bluestein plan for problem p_, or return a null plan if
 * the solver is not applicable or no child plan can be found.
 *
 * The child is an in-place transform of nb interleaved complex
 * values, where nb is the first size >= 2n-1 that is accepted by
 * choose_transform_size().
 */
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const problem_dft *p = (const problem_dft *) p_;
     P *pln;
     plan *cldf = 0;
     R *buf = (R *) 0;
     INT n, nb;

     if (!applicable(ego, p_, plnr))
          return (plan *) 0;

     n = p->sz->dims[0].n;
     nb = choose_transform_size(2 * n - 1);

     /* temporary array that gives the child problem its addresses;
        it is released again below */
     buf = (R *) MALLOC(2 * nb * sizeof(R), BUFFERS);

     cldf = X(mkplan_f_d)(plnr,
                          X(mkproblem_dft_d)(X(mktensor_1d)(nb, 2, 2),
                                             X(mktensor_1d)(1, 0, 0),
                                             buf, buf+1,
                                             buf, buf+1),
                          NO_SLOW, 0, 0);
     if (!cldf) {
          /* failure: release whatever was acquired so far */
          X(ifree0)(buf);
          X(plan_destroy_internal)(cldf);
          return (plan *)0;
     }

     X(ifree)(buf);

     pln = MKPLAN_DFT(P, &padt, apply);

     pln->cldf = cldf;
     pln->n = n;
     pln->nb = nb;
     pln->is = p->sz->dims[0].is;
     pln->os = p->sz->dims[0].os;

     /* the tables are created later, by awake() */
     pln->w = 0;
     pln->W = 0;

     /* the child plan is applied twice per call of apply() ... */
     X(ops_add)(&cldf->ops, &cldf->ops, &pln->super.super.ops);
     /* ... plus the cost of the three multiplication loops */
     pln->super.super.ops.add += 4 * n + 2 * nb;
     pln->super.super.ops.mul += 8 * n + 4 * nb;
     pln->super.super.ops.other += 6 * (n + nb);

     return &(pln->super.super);
}
|
||||
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register one bluestein solver with planner p. */
void X(dft_bluestein_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
|
||||
@@ -0,0 +1,284 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
size_t maxnbuf_ndx;
|
||||
} S;
|
||||
|
||||
/* One buffered solver is registered per entry of this table; the entry
   selected by S.maxnbuf_ndx is passed to X(nbuf) as its last argument
   (presumably an upper bound on the number of buffers -- confirm
   against X(nbuf)). */
static const INT maxnbufs[] = { 8, 256 };
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
|
||||
plan *cld, *cldcpy, *cldrest;
|
||||
INT n, vl, nbuf, bufdist;
|
||||
INT ivs_by_nbuf, ovs_by_nbuf;
|
||||
INT roffset, ioffset;
|
||||
} P;
|
||||
|
||||
/* transform a vector input with the help of bufs */
|
||||
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
INT nbuf = ego->nbuf;
|
||||
R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist * 2, BUFFERS);
|
||||
|
||||
plan_dft *cld = (plan_dft *) ego->cld;
|
||||
plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
|
||||
plan_dft *cldrest;
|
||||
INT i, vl = ego->vl;
|
||||
INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
|
||||
INT roffset = ego->roffset, ioffset = ego->ioffset;
|
||||
|
||||
for (i = nbuf; i <= vl; i += nbuf) {
|
||||
/* transform to bufs: */
|
||||
cld->apply((plan *) cld, ri, ii, bufs + roffset, bufs + ioffset);
|
||||
ri += ivs_by_nbuf; ii += ivs_by_nbuf;
|
||||
|
||||
/* copy back */
|
||||
cldcpy->apply((plan *) cldcpy, bufs+roffset, bufs+ioffset, ro, io);
|
||||
ro += ovs_by_nbuf; io += ovs_by_nbuf;
|
||||
}
|
||||
|
||||
X(ifree)(bufs);
|
||||
|
||||
/* Do the remaining transforms, if any: */
|
||||
cldrest = (plan_dft *) ego->cldrest;
|
||||
cldrest->apply((plan *) cldrest, ri, ii, ro, io);
|
||||
}
|
||||
|
||||
|
||||
/* Forward the wakefulness change to the three child plans, in the
   order cld, cldcpy, cldrest. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *children[3];
     int k;

     children[0] = self->cld;
     children[1] = self->cldcpy;
     children[2] = self->cldrest;

     for (k = 0; k < 3; ++k)
          X(plan_awake)(children[k], wakefulness);
}
|
||||
|
||||
/* Destroy the three child plans, in the order cldrest, cldcpy, cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *children[3];
     int k;

     children[0] = self->cldrest;
     children[1] = self->cldcpy;
     children[2] = self->cld;

     for (k = 0; k < 3; ++k)
          X(plan_destroy_internal)(children[k]);
}
|
||||
|
||||
/* Print the plan: size, number of buffers, vector length, the buffer
   distance reduced modulo the size, and the three child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const INT padding = self->bufdist % self->n;

     p->print(p, "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
              self->n, self->nbuf,
              self->vl, padding,
              self->cld, self->cldcpy, self->cldrest);
}
|
||||
|
||||
static int applicable0(const S *ego, const problem *p_, const planner *plnr)
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
const iodim *d = p->sz->dims;
|
||||
|
||||
if (1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->rnk == 1
|
||||
) {
|
||||
INT vl, ivs, ovs;
|
||||
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
|
||||
|
||||
if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
|
||||
return 0;
|
||||
|
||||
/* if this solver is redundant, in the sense that a solver
|
||||
of lower index generates the same plan, then prune this
|
||||
solver */
|
||||
if (X(nbuf_redundant)(d[0].n, vl,
|
||||
ego->maxnbuf_ndx,
|
||||
maxnbufs, NELEM(maxnbufs)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
In principle, the buffered transforms might be useful
|
||||
when working out of place. However, in order to
|
||||
prevent infinite loops in the planner, we require
|
||||
that the output stride of the buffered transforms be
|
||||
greater than 2.
|
||||
*/
|
||||
if (p->ri != p->ro)
|
||||
return (d[0].os > 2);
|
||||
|
||||
/*
|
||||
* If the problem is in place, the input/output strides must
|
||||
* be the same or the whole thing must fit in the buffer.
|
||||
*/
|
||||
if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
|
||||
return 1;
|
||||
|
||||
if (/* fits into buffer: */
|
||||
((p->vecsz->rnk == 0)
|
||||
||
|
||||
(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
|
||||
maxnbufs[ego->maxnbuf_ndx])
|
||||
== p->vecsz->dims[0].n)))
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_, const planner *plnr)
|
||||
{
|
||||
if (NO_BUFFERINGP(plnr)) return 0;
|
||||
if (!applicable0(ego, p_, plnr)) return 0;
|
||||
|
||||
if (NO_UGLYP(plnr)) {
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
if (p->ri != p->ro) return 0;
|
||||
if (X(toobig)(p->sz->dims[0].n)) return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Create a buffered plan for problem p_, or return a null plan if the
 * solver is not applicable or one of the three child plans cannot be
 * found.  The children are:
 *   cld      transforms nbuf inputs into a buffer,
 *   cldcpy   copies the buffer to the output (a rank-0 transform),
 *   cldrest  handles the vl % nbuf transforms that do not fill a
 *            whole batch, without buffering.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const S *ego = (const S *)ego_;
     const problem_dft *p = (const problem_dft *) p_;
     P *pln;
     plan *cld = (plan *) 0;
     plan *cldcpy = (plan *) 0;
     plan *cldrest = (plan *) 0;
     R *bufs = (R *) 0;
     INT nbuf = 0, bufdist, n, vl;
     INT ivs, ovs, roffset, ioffset;

     if (!applicable(ego, p_, plnr))
          goto nada;

     n = X(tensor_sz)(p->sz);
     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);

     nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
     bufdist = X(bufdist)(n, vl);
     A(nbuf > 0);

     /* attempt to keep real and imaginary part in the same order,
        so as to allow optimizations in the copy plan */
     roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0;
     ioffset = 1 - roffset;

     /* initial allocation for the purpose of planning */
     bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist * 2, BUFFERS);

     /* allow destruction of input if problem is in place */
     cld = X(mkplan_f_d)(plnr,
                         X(mkproblem_dft_d)(
                              X(mktensor_1d)(n, p->sz->dims[0].is, 2),
                              X(mktensor_1d)(nbuf, ivs, bufdist * 2),
                              TAINT(p->ri, ivs * nbuf),
                              TAINT(p->ii, ivs * nbuf),
                              bufs + roffset,
                              bufs + ioffset),
                         0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0);
     if (!cld)
          goto nada;

     /* copying back from the buffer is a rank-0 transform: */
     cldcpy = X(mkplan_d)(plnr,
                          X(mkproblem_dft_d)(
                               X(mktensor_0d)(),
                               X(mktensor_2d)(nbuf, bufdist * 2, ovs,
                                              n, 2, p->sz->dims[0].os),
                               bufs + roffset,
                               bufs + ioffset,
                               TAINT(p->ro, ovs * nbuf),
                               TAINT(p->io, ovs * nbuf)));
     if (!cldcpy)
          goto nada;

     /* deallocate buffers, let apply() allocate them for real */
     X(ifree)(bufs);
     bufs = 0;

     /* plan the leftover transforms (cldrest): */
     {
          /* number of transforms covered by whole batches */
          INT batched = nbuf * (vl / nbuf);
          INT id = ivs * batched;
          INT od = ovs * batched;

          cldrest = X(mkplan_d)(plnr,
                                X(mkproblem_dft_d)(
                                     X(tensor_copy)(p->sz),
                                     X(mktensor_1d)(vl % nbuf, ivs, ovs),
                                     p->ri+id, p->ii+id, p->ro+od, p->io+od));
     }
     if (!cldrest)
          goto nada;

     pln = MKPLAN_DFT(P, &padt, apply);

     pln->cld = cld;
     pln->cldcpy = cldcpy;
     pln->cldrest = cldrest;

     pln->n = n;
     pln->vl = vl;
     pln->nbuf = nbuf;
     pln->bufdist = bufdist;

     pln->ivs_by_nbuf = ivs * nbuf;
     pln->ovs_by_nbuf = ovs * nbuf;

     pln->roffset = roffset;
     pln->ioffset = ioffset;

     /* cost: (cld + cldcpy) once per batch, plus cldrest once */
     {
          opcnt batch;
          X(ops_add)(&cld->ops, &cldcpy->ops, &batch);
          X(ops_madd)(vl / nbuf, &batch, &cldrest->ops,
                      &pln->super.super.ops);
     }

     return &(pln->super.super);

 nada:
     X(ifree0)(bufs);
     X(plan_destroy_internal)(cldrest);
     X(plan_destroy_internal)(cldcpy);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
|
||||
|
||||
/* Allocate a buffered solver that uses entry maxnbuf_ndx of
   maxnbufs[], and return its generic solver header. */
static solver *mksolver(size_t maxnbuf_ndx)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->maxnbuf_ndx = maxnbuf_ndx;

     return &(self->super);
}
|
||||
|
||||
/* Register one buffered solver per entry of maxnbufs[] with
   planner p. */
void X(dft_buffered_register)(planner *p)
{
     size_t ndx;

     for (ndx = 0; ndx < NELEM(maxnbufs); ++ndx) {
          REGISTER_SOLVER(p, mksolver(ndx));
     }
}
|
||||
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* This header file must include every file or define every
|
||||
* type or macro which is required to compile a codelet.
|
||||
*/
|
||||
|
||||
#ifndef __DFT_CODELET_H__
|
||||
#define __DFT_CODELET_H__
|
||||
|
||||
#include "kernel/ifftw.h"
|
||||
|
||||
/**************************************************************
|
||||
* types of codelets
|
||||
**************************************************************/
|
||||
|
||||
/* DFT codelets */
|
||||
typedef struct kdft_desc_s kdft_desc;
|
||||
|
||||
typedef struct {
|
||||
int (*okp)(
|
||||
const kdft_desc *desc,
|
||||
const R *ri, const R *ii, const R *ro, const R *io,
|
||||
INT is, INT os, INT vl, INT ivs, INT ovs,
|
||||
const planner *plnr);
|
||||
INT vl;
|
||||
} kdft_genus;
|
||||
|
||||
struct kdft_desc_s {
|
||||
INT sz; /* size of transform computed */
|
||||
const char *nam;
|
||||
opcnt ops;
|
||||
const kdft_genus *genus;
|
||||
INT is;
|
||||
INT os;
|
||||
INT ivs;
|
||||
INT ovs;
|
||||
};
|
||||
|
||||
typedef void (*kdft) (const R *ri, const R *ii, R *ro, R *io,
|
||||
stride is, stride os, INT vl, INT ivs, INT ovs);
|
||||
void X(kdft_register)(planner *p, kdft codelet, const kdft_desc *desc);
|
||||
|
||||
|
||||
typedef struct ct_desc_s ct_desc;
|
||||
|
||||
typedef struct {
|
||||
int (*okp)(
|
||||
const struct ct_desc_s *desc,
|
||||
const R *rio, const R *iio,
|
||||
INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
||||
const planner *plnr);
|
||||
INT vl;
|
||||
} ct_genus;
|
||||
|
||||
struct ct_desc_s {
|
||||
INT radix;
|
||||
const char *nam;
|
||||
const tw_instr *tw;
|
||||
const ct_genus *genus;
|
||||
opcnt ops;
|
||||
INT rs;
|
||||
INT vs;
|
||||
INT ms;
|
||||
};
|
||||
|
||||
typedef void (*kdftw) (R *rioarray, R *iioarray, const R *W,
|
||||
stride ios, INT mb, INT me, INT ms);
|
||||
void X(kdft_dit_register)(planner *p, kdftw codelet, const ct_desc *desc);
|
||||
void X(kdft_dif_register)(planner *p, kdftw codelet, const ct_desc *desc);
|
||||
|
||||
|
||||
typedef void (*kdftwsq) (R *rioarray, R *iioarray,
|
||||
const R *W, stride is, stride vs,
|
||||
INT mb, INT me, INT ms);
|
||||
void X(kdft_difsq_register)(planner *p, kdftwsq codelet, const ct_desc *desc);
|
||||
|
||||
|
||||
/* Solver tables, defined elsewhere: the standard table plus one table
   per SIMD flavor. */

/* scalar */
extern const solvtab X(solvtab_dft_standard);

/* x86 */
extern const solvtab X(solvtab_dft_sse2);
extern const solvtab X(solvtab_dft_avx);
extern const solvtab X(solvtab_dft_avx_128_fma);
extern const solvtab X(solvtab_dft_avx2);
extern const solvtab X(solvtab_dft_avx2_128);
extern const solvtab X(solvtab_dft_avx512);
extern const solvtab X(solvtab_dft_kcvi);

/* other architectures */
extern const solvtab X(solvtab_dft_altivec);
extern const solvtab X(solvtab_dft_vsx);
extern const solvtab X(solvtab_dft_neon);

/* generic SIMD */
extern const solvtab X(solvtab_dft_generic_simd128);
extern const solvtab X(solvtab_dft_generic_simd256);
|
||||
|
||||
#endif /* __DFT_CODELET_H__ */
|
||||
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
static const solvtab s =
|
||||
{
|
||||
SOLVTAB(X(dft_indirect_register)),
|
||||
SOLVTAB(X(dft_indirect_transpose_register)),
|
||||
SOLVTAB(X(dft_rank_geq2_register)),
|
||||
SOLVTAB(X(dft_vrank_geq1_register)),
|
||||
SOLVTAB(X(dft_buffered_register)),
|
||||
SOLVTAB(X(dft_generic_register)),
|
||||
SOLVTAB(X(dft_rader_register)),
|
||||
SOLVTAB(X(dft_bluestein_register)),
|
||||
SOLVTAB(X(dft_nop_register)),
|
||||
SOLVTAB(X(ct_generic_register)),
|
||||
SOLVTAB(X(ct_genericbuf_register)),
|
||||
SOLVTAB_END
|
||||
};
|
||||
|
||||
/*
 * Register the standard set of DFT solvers with planner p: first the
 * local table s, then the standard codelet table, then the table of
 * each SIMD flavor that was compiled in.  Except for the generic SIMD
 * tables, a SIMD table is executed only if the matching
 * X(have_simd_...) test succeeds.
 */
void X(dft_conf_standard)(planner *p)
{
     X(solvtab_exec)(s, p);
     X(solvtab_exec)(X(solvtab_dft_standard), p);

#if HAVE_SSE2
     if (X(have_simd_sse2)()) {
          X(solvtab_exec)(X(solvtab_dft_sse2), p);
     }
#endif

#if HAVE_AVX
     if (X(have_simd_avx)()) {
          X(solvtab_exec)(X(solvtab_dft_avx), p);
     }
#endif

#if HAVE_AVX_128_FMA
     if (X(have_simd_avx_128_fma)()) {
          X(solvtab_exec)(X(solvtab_dft_avx_128_fma), p);
     }
#endif

#if HAVE_AVX2
     /* the avx2 build provides two tables, tested independently */
     if (X(have_simd_avx2)()) {
          X(solvtab_exec)(X(solvtab_dft_avx2), p);
     }
     if (X(have_simd_avx2_128)()) {
          X(solvtab_exec)(X(solvtab_dft_avx2_128), p);
     }
#endif

#if HAVE_AVX512
     if (X(have_simd_avx512)()) {
          X(solvtab_exec)(X(solvtab_dft_avx512), p);
     }
#endif

#if HAVE_KCVI
     if (X(have_simd_kcvi)()) {
          X(solvtab_exec)(X(solvtab_dft_kcvi), p);
     }
#endif

#if HAVE_ALTIVEC
     if (X(have_simd_altivec)()) {
          X(solvtab_exec)(X(solvtab_dft_altivec), p);
     }
#endif

#if HAVE_VSX
     if (X(have_simd_vsx)()) {
          X(solvtab_exec)(X(solvtab_dft_vsx), p);
     }
#endif

#if HAVE_NEON
     if (X(have_simd_neon)()) {
          X(solvtab_exec)(X(solvtab_dft_neon), p);
     }
#endif

     /* the generic SIMD tables are executed without a run-time test */
#if HAVE_GENERIC_SIMD128
     X(solvtab_exec)(X(solvtab_dft_generic_simd128), p);
#endif

#if HAVE_GENERIC_SIMD256
     X(solvtab_exec)(X(solvtab_dft_generic_simd256), p);
#endif
}
|
||||
@@ -0,0 +1,255 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/ct.h"
|
||||
|
||||
/* Optional hook with the same signature as X(mksolver_ct); it is a
   null pointer unless set elsewhere. */
ct_solver *(*X(mksolver_ct_hook))(size_t,
                                  INT,
                                  int,
                                  ct_mkinferior,
                                  ct_force_vrecursion) = 0;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
plan *cld;
|
||||
plan *cldw;
|
||||
INT r;
|
||||
} P;
|
||||
|
||||
/* DIT execution: run the child DFT from input to output first, then
   the twiddle plan in place on the output arrays. */
static void apply_dit(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     plan_dft *dft = (plan_dft *) self->cld;
     plan_dftw *twiddle = (plan_dftw *) self->cldw;

     dft->apply(self->cld, ri, ii, ro, io);
     twiddle->apply(self->cldw, ro, io);
}
|
||||
|
||||
/* DIF execution: run the twiddle plan in place on the input arrays
   first (which overwrites the input), then the child DFT from input
   to output. */
static void apply_dif(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     plan_dftw *twiddle = (plan_dftw *) self->cldw;
     plan_dft *dft = (plan_dft *) self->cld;

     twiddle->apply(self->cldw, ri, ii);
     dft->apply(self->cld, ri, ii, ro, io);
}
|
||||
|
||||
/* Forward the wakefulness change to both children, cld first. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *dft = self->cld;
     plan *twiddle = self->cldw;

     X(plan_awake)(dft, wakefulness);
     X(plan_awake)(twiddle, wakefulness);
}
|
||||
|
||||
/* Destroy both children, cldw first. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *twiddle = self->cldw;
     plan *dft = self->cld;

     X(plan_destroy_internal)(twiddle);
     X(plan_destroy_internal)(dft);
}
|
||||
|
||||
/* Print the plan as (dft-ct-<dit|dif>/<radix> <cldw> <cld>); the
   variant is derived from the apply function stored in the plan. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const char *variant;

     if (self->super.apply == apply_dit)
          variant = "dit";
     else
          variant = "dif";

     p->print(p, "(dft-ct-%s/%D%(%p%)%(%p%))",
              variant, self->r, self->cldw, self->cld);
}
|
||||
|
||||
/*
 * Basic applicability test: a rank-1 transform with at most one
 * vector loop, for which X(choose_radix) yields a radix r with
 * 1 < r < n.  A DIF solver is accepted only if the problem is in
 * place or the planner permits destroying the input.
 */
static int applicable0(const ct_solver *ego, const problem *p_, planner *plnr)
{
     const problem_dft *p = (const problem_dft *) p_;
     INT r;

     if (p->sz->rnk != 1)
          return 0;
     if (p->vecsz->rnk > 1)
          return 0;

     /* DIF destroys the input and we don't like it */
     if (ego->dec != DECDIT
         && p->ri != p->ro
         && NO_DESTROY_INPUTP(plnr))
          return 0;

     r = X(choose_radix)(ego->r, p->sz->dims[0].n);
     if (!(r > 1))
          return 0;
     if (!(p->sz->dims[0].n > r))
          return 0;

     return 1;
}
|
||||
|
||||
|
||||
/*
 * Applicability test of the Cooley-Tukey solvers.  On top of
 * applicable0(), at least one of the following must hold: the solver
 * is DIF+TRANSPOSE, the problem has no vector loop, the planner does
 * not forbid vector recursion, or the solver's force_vrecursionp
 * callback (if any) returns nonzero for the problem.
 */
int X(ct_applicable)(const ct_solver *ego, const problem *p_, planner *plnr)
{
     const problem_dft *p;

     if (!applicable0(ego, p_, plnr))
          return 0;

     p = (const problem_dft *) p_;

     if (ego->dec == DECDIF+TRANSPOSE)
          return 1;
     if (p->vecsz->rnk == 0)
          return 1;
     if (!NO_VRECURSEP(plnr))
          return 1;
     if (ego->force_vrecursionp && ego->force_vrecursionp(ego, p))
          return 1;

     return 0;
}
|
||||
|
||||
|
||||
/*
 * Create a Cooley-Tukey plan for problem p_, or return a null plan.
 *
 * The size n is split as n = r * m, with the radix r chosen by
 * X(choose_radix).  The twiddle child (cldw) is created through the
 * solver's mkcldw callback, the DFT child (cld) through the planner.
 * For DIT the twiddle child works on the output arrays, for DIF and
 * DIF+TRANSPOSE on the input arrays.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const ct_solver *ego = (const ct_solver *) ego_;
     const problem_dft *p;
     P *pln = 0;
     plan *cld = 0, *cldw = 0;
     INT n, r, m, v, ivs, ovs;
     iodim *d;

     if ((NO_NONTHREADEDP(plnr)) || !X(ct_applicable)(ego, p_, plnr))
          return (plan *) 0;

     p = (const problem_dft *) p_;
     d = p->sz->dims;
     n = d[0].n;
     r = X(choose_radix)(ego->r, n);
     m = n / r;

     X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);

     switch (ego->dec) {
         case DECDIT:
         {
              /* twiddle step, in place on the output */
              cldw = ego->mkcldw(ego,
                                 r, m * d[0].os, m * d[0].os,
                                 m, d[0].os,
                                 v, ovs, ovs,
                                 0, m,
                                 p->ro, p->io, plnr);
              if (!cldw) goto nada;

              /* r transforms of size m from input to output */
              cld = X(mkplan_d)(plnr,
                                X(mkproblem_dft_d)(
                                     X(mktensor_1d)(m, r * d[0].is, d[0].os),
                                     X(mktensor_2d)(r, d[0].is, m * d[0].os,
                                                    v, ivs, ovs),
                                     p->ri, p->ii, p->ro, p->io)
                   );
              if (!cld) goto nada;

              pln = MKPLAN_DFT(P, &padt, apply_dit);
              break;
         }

         case DECDIF:
         case DECDIF+TRANSPOSE:
         {
              INT cors, covs; /* cldw ors, ovs */

              if (ego->dec != DECDIF+TRANSPOSE) {
                   cors = m * d[0].is;
                   covs = ivs;
              } else {
                   cors = ivs;
                   covs = m * d[0].is;

                   /* ensure that we generate well-formed dftw subproblems */
                   /* FIXME: too conservative */
                   if (r != v || d[0].is != r * cors)
                        goto nada;

                   /* FIXME: allow in-place only for now, like in
                      fftw-3.[01] */
                   if (p->ri != p->ro
                       || d[0].is != r * d[0].os
                       || cors != d[0].os
                       || covs != ovs)
                        goto nada;
              }

              /* twiddle step, in place on the input */
              cldw = ego->mkcldw(ego,
                                 r, m * d[0].is, cors,
                                 m, d[0].is,
                                 v, ivs, covs,
                                 0, m,
                                 p->ri, p->ii, plnr);
              if (!cldw) goto nada;

              /* r transforms of size m from input to output */
              cld = X(mkplan_d)(plnr,
                                X(mkproblem_dft_d)(
                                     X(mktensor_1d)(m, d[0].is, r * d[0].os),
                                     X(mktensor_2d)(r, cors, d[0].os,
                                                    v, covs, ovs),
                                     p->ri, p->ii, p->ro, p->io)
                   );
              if (!cld) goto nada;

              pln = MKPLAN_DFT(P, &padt, apply_dif);
              break;
         }

         default: A(0);

     }

     pln->r = r;
     pln->cldw = cldw;
     pln->cld = cld;
     X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);

     /* inherit could_prune_now_p attribute from cldw */
     pln->super.super.could_prune_now_p = cldw->could_prune_now_p;
     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cldw);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
|
||||
|
||||
ct_solver *X(mksolver_ct)(size_t size, INT r, int dec,
|
||||
ct_mkinferior mkcldw,
|
||||
ct_force_vrecursion force_vrecursionp)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
ct_solver *slv = (ct_solver *)X(mksolver)(size, &sadt);
|
||||
slv->r = r;
|
||||
slv->dec = dec;
|
||||
slv->mkcldw = mkcldw;
|
||||
slv->force_vrecursionp = force_vrecursionp;
|
||||
return slv;
|
||||
}
|
||||
|
||||
/* Allocate a plan of `size' bytes through X(mkplan), store the dftw
   apply function in it, and return its generic plan header. */
plan *X(mkplan_dftw)(size_t size, const plan_adt *adt, dftwapply apply)
{
     plan_dftw *self = (plan_dftw *) X(mkplan)(size, adt);

     self->apply = apply;

     return &(self->super);
}
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/*
 * Declarations shared by the Cooley-Tukey DFT solvers.
 *
 * The include guard makes a repeated inclusion harmless; without it,
 * struct ct_solver_s and the typedefs below would be redefined.
 */
#ifndef __DFT_CT_H__
#define __DFT_CT_H__

#include "dft/dft.h"

/* apply function of a twiddle (dftw) plan: works on the arrays
   (rio, iio) */
typedef void (*dftwapply)(const plan *ego, R *rio, R *iio);

typedef struct ct_solver_s ct_solver;

/* callback that creates the twiddle child plan of a Cooley-Tukey
   step; returns a plan, or a null pointer on failure (mkplan in ct.c
   checks for the latter) */
typedef plan *(*ct_mkinferior)(const ct_solver *ego,
                               INT r, INT irs, INT ors,
                               INT m, INT ms,
                               INT v, INT ivs, INT ovs,
                               INT mstart, INT mcount,
                               R *rio, R *iio, planner *plnr);

/* optional predicate; a nonzero result makes X(ct_applicable) accept
   a problem even if the planner forbids vector recursion */
typedef int (*ct_force_vrecursion)(const ct_solver *ego,
                                   const problem_dft *p);

/* plan of a twiddle step */
typedef struct {
     plan super;
     dftwapply apply;
} plan_dftw;

extern plan *X(mkplan_dftw)(size_t size, const plan_adt *adt, dftwapply apply);

/* allocate a plan of the given struct type whose first member is a
   plan_dftw */
#define MKPLAN_DFTW(type, adt, apply) \
  (type *)X(mkplan_dftw)(sizeof(type), adt, apply)

struct ct_solver_s {
     solver super;
     INT r;     /* radix parameter, passed to X(choose_radix) */
     int dec;   /* DECDIF, DECDIT, or DECDIF+TRANSPOSE */
#    define DECDIF 0
#    define DECDIT 1
#    define TRANSPOSE 2
     ct_mkinferior mkcldw;
     ct_force_vrecursion force_vrecursionp;   /* may be null */
};

int X(ct_applicable)(const ct_solver *, const problem *, planner *);
ct_solver *X(mksolver_ct)(size_t size, INT r, int dec,
                          ct_mkinferior mkcldw,
                          ct_force_vrecursion force_vrecursionp);
extern ct_solver *(*X(mksolver_ct_hook))(size_t, INT, int,
                                         ct_mkinferior, ct_force_vrecursion);

void X(regsolver_ct_directw)(planner *plnr,
     kdftw codelet, const ct_desc *desc, int dec);
void X(regsolver_ct_directwbuf)(planner *plnr,
     kdftw codelet, const ct_desc *desc, int dec);
solver *X(mksolver_ctsq)(kdftwsq codelet, const ct_desc *desc, int dec);
void X(regsolver_ct_directwsq)(planner *plnr, kdftwsq codelet,
                               const ct_desc *desc, int dec);

#endif /* __DFT_CT_H__ */
|
||||
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __DFT_H__
#define __DFT_H__

#include "kernel/ifftw.h"
#include "dft/codelet-dft.h"

#ifdef __cplusplus
extern "C"
{
#endif /* __cplusplus */

/* problem.c: */

/* A DFT problem: two tensors plus the real/imaginary input and output
   pointers. */
typedef struct {
     problem super;
     tensor *sz;
     tensor *vecsz;
     R *ri;
     R *ii;
     R *ro;
     R *io;
} problem_dft;

void X(dft_zerotens)(tensor *sz, R *ri, R *ii);
problem *X(mkproblem_dft)(const tensor *sz, const tensor *vecsz,
                          R *ri, R *ii, R *ro, R *io);
problem *X(mkproblem_dft_d)(tensor *sz, tensor *vecsz,
                            R *ri, R *ii, R *ro, R *io);

/* solve.c: */
void X(dft_solve)(const plan *ego_, const problem *p_);

/* plan.c: */

/* apply routine of a DFT plan: separate input and output pointer pairs */
typedef void (*dftapply)(const plan *ego, R *ri, R *ii, R *ro, R *io);

/* A DFT plan: a generic plan followed by its apply routine. */
typedef struct {
     plan super;
     dftapply apply;
} plan_dft;

plan *X(mkplan_dft)(size_t size, const plan_adt *adt, dftapply apply);

/* Allocate a plan of the given concrete TYPE and cast the result to it. */
#define MKPLAN_DFT(type, adt, apply) \
  (type *)X(mkplan_dft)(sizeof(type), adt, apply)

/* various solvers */
solver *X(mksolver_dft_direct)(kdft k, const kdft_desc *desc);
solver *X(mksolver_dft_directbuf)(kdft k, const kdft_desc *desc);

/* registration entry points, one per solver family */
void X(dft_rank0_register)(planner *p);
void X(dft_rank_geq2_register)(planner *p);
void X(dft_indirect_register)(planner *p);
void X(dft_indirect_transpose_register)(planner *p);
void X(dft_vrank_geq1_register)(planner *p);
void X(dft_vrank2_transpose_register)(planner *p);
void X(dft_vrank3_transpose_register)(planner *p);
void X(dft_buffered_register)(planner *p);
void X(dft_generic_register)(planner *p);
void X(dft_rader_register)(planner *p);
void X(dft_bluestein_register)(planner *p);
void X(dft_nop_register)(planner *p);
void X(ct_generic_register)(planner *p);
void X(ct_genericbuf_register)(planner *p);

/* configurations */
void X(dft_conf_standard)(planner *p);

#ifdef __cplusplus
}  /* extern "C" */
#endif /* __cplusplus */

#endif /* __DFT_H__ */
|
||||
@@ -0,0 +1,332 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/ct.h"
|
||||
|
||||
typedef struct {
|
||||
ct_solver super;
|
||||
const ct_desc *desc;
|
||||
int bufferedp;
|
||||
kdftw k;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dftw super;
|
||||
kdftw k;
|
||||
INT r;
|
||||
stride rs;
|
||||
INT m, ms, v, vs, mb, me, extra_iter;
|
||||
stride brs;
|
||||
twid *td;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
|
||||
/*************************************************************
|
||||
Nonbuffered code
|
||||
*************************************************************/
|
||||
/* Unbuffered path: run the codelet once per vector element over the
   plan's [mb, me) range, directly on the caller's arrays. */
static void apply(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     INT iv;
     ASSERT_ALIGNED_DOUBLE;

     for (iv = 0; iv < pln->v; ++iv) {
          const INT first = pln->mb;
          const INT mstride = pln->ms;
          const INT off = first * mstride;

          pln->k(rio + off, iio + off, pln->td->W,
                 pln->rs, first, pln->me, mstride);

          rio += pln->vs;
          iio += pln->vs;
     }
}
|
||||
|
||||
/* Unbuffered path used when the plan was created with extra_iter set:
   the codelet covers [mb, me-1) with stride ms, then is invoked a second
   time starting at me-1 with range [me-1, me+1) and m-stride 0. */
static void apply_extra_iter(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     const INT nvec = pln->v, vstride = pln->vs;
     const INT first = pln->mb, mstride = pln->ms;
     const INT last = pln->me - 1;
     INT iv = 0;
     ASSERT_ALIGNED_DOUBLE;

     while (iv < nvec) {
          /* main range */
          pln->k(rio + first * mstride, iio + first * mstride, pln->td->W,
                 pln->rs, first, last, mstride);
          /* trailing element, presented as a two-wide range with stride 0 */
          pln->k(rio + last * mstride, iio + last * mstride, pln->td->W,
                 pln->rs, last, last + 2, 0);

          ++iv;
          rio += vstride;
          iio += vstride;
     }
}
|
||||
|
||||
/*************************************************************
|
||||
Buffered code
|
||||
*************************************************************/
|
||||
/* Process the m indices [mb, me) through the buffer: copy the data in
   (interleaved, real at buf and imaginary at buf + 1), run the codelet
   on the buffer, and copy the result back to the same place. */
static void dobatch(const P *ego, R *rA, R *iA, INT mb, INT me, R *buf)
{
     const INT bufrs = WS(ego->brs, 1);
     const INT datars = WS(ego->rs, 1);
     const INT mstride = ego->ms;
     const INT count = me - mb;
     R *const re = rA + mb * mstride;
     R *const im = iA + mb * mstride;

     X(cpy2d_pair_ci)(re, im, buf, buf + 1,
                      ego->r, datars, bufrs,
                      count, mstride, 2);

     ego->k(buf, buf + 1, ego->td->W, ego->brs, mb, me, 2);

     X(cpy2d_pair_co)(buf, buf + 1, re, im,
                      ego->r, bufrs, datars,
                      count, 2, mstride);
}
|
||||
|
||||
/* must be even for SIMD alignment; should not be 2^k to avoid
|
||||
associativity conflicts */
|
||||
static INT compute_batchsize(INT radix)
|
||||
{
|
||||
/* round up to multiple of 4 */
|
||||
radix += 3;
|
||||
radix &= -4;
|
||||
|
||||
return (radix + 2);
|
||||
}
|
||||
|
||||
/* Buffered path: for every vector element, walk [mb, me) in chunks of
   compute_batchsize(r) via dobatch(); the final call handles whatever
   is left (a full or partial batch). */
static void apply_buf(const plan *ego_, R *rio, R *iio)
{
     const P *ego = (const P *) ego_;
     INT r = ego->r;
     INT nvec = ego->v;
     INT batchsz = compute_batchsize(r);
     INT mb = ego->mb, me = ego->me;
     INT iv, lo;
     R *buf;
     size_t bufsz = r * batchsz * 2 * sizeof(R);

     BUF_ALLOC(R *, buf, bufsz);

     for (iv = 0; iv < nvec; ++iv) {
          lo = mb;
          while (lo + batchsz < me) {
               dobatch(ego, rio, iio, lo, lo + batchsz, buf);
               lo += batchsz;
          }

          /* remainder */
          dobatch(ego, rio, iio, lo, me, buf);

          rio += ego->vs;
          iio += ego->vs;
     }

     BUF_FREE(buf, bufsz);
}
|
||||
|
||||
/*************************************************************
|
||||
common code
|
||||
*************************************************************/
|
||||
/* Forward the wakefulness change to X(twiddle_awake) for this plan's
   twiddle table; the last argument is m plus the extra_iter flag. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;
     const INT radix = pln->r;
     const INT n = radix * pln->m;
     const INT mtw = pln->m + pln->extra_iter;

     X(twiddle_awake)(wakefulness, &pln->td, pln->slv->desc->tw,
                      n, radix, mtw);
}
|
||||
|
||||
/* Release the two stride objects created in mkcldw (buffer stride
   first, then data stride). */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(stride_destroy)(pln->brs);
     X(stride_destroy)(pln->rs);
}
|
||||
|
||||
/* Print the plan; the buffered variant additionally reports its batch
   size. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const ct_desc *d = pln->slv->desc;

     if (!pln->slv->bufferedp) {
          p->print(p, "(dftw-direct-%D/%D%v \"%s\")",
                   pln->r, X(twiddle_length)(pln->r, d->tw),
                   pln->v, d->nam);
          return;
     }

     p->print(p, "(dftw-directbuf/%D-%D/%D%v \"%s\")",
              compute_batchsize(pln->r), pln->r,
              X(twiddle_length)(pln->r, d->tw), pln->v, d->nam);
}
|
||||
|
||||
static int applicable0(const S *ego,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT ms,
|
||||
INT v, INT ivs, INT ovs,
|
||||
INT mb, INT me,
|
||||
R *rio, R *iio,
|
||||
const planner *plnr, INT *extra_iter)
|
||||
{
|
||||
const ct_desc *e = ego->desc;
|
||||
UNUSED(v);
|
||||
|
||||
return (
|
||||
1
|
||||
&& r == e->radix
|
||||
&& irs == ors /* in-place along R */
|
||||
&& ivs == ovs /* in-place along V */
|
||||
|
||||
/* check for alignment/vector length restrictions */
|
||||
&& ((*extra_iter = 0,
|
||||
e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr))
|
||||
||
|
||||
(*extra_iter = 1,
|
||||
(1
|
||||
/* FIXME: require full array, otherwise some threads
|
||||
may be extra_iter and other threads won't be.
|
||||
Generating the proper twiddle factors is a pain in
|
||||
this case */
|
||||
&& mb == 0 && me == m
|
||||
&& e->genus->okp(e, rio, iio, irs, ivs,
|
||||
m, mb, me - 1, ms, plnr)
|
||||
&& e->genus->okp(e, rio, iio, irs, ivs,
|
||||
m, me - 1, me + 1, ms, plnr))))
|
||||
|
||||
&& (e->genus->okp(e, rio + ivs, iio + ivs, irs, ivs,
|
||||
m, mb, me - *extra_iter, ms, plnr))
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable0_buf(const S *ego,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT ms,
|
||||
INT v, INT ivs, INT ovs,
|
||||
INT mb, INT me,
|
||||
R *rio, R *iio,
|
||||
const planner *plnr)
|
||||
{
|
||||
const ct_desc *e = ego->desc;
|
||||
INT batchsz;
|
||||
UNUSED(v); UNUSED(ms); UNUSED(rio); UNUSED(iio);
|
||||
|
||||
return (
|
||||
1
|
||||
&& r == e->radix
|
||||
&& irs == ors /* in-place along R */
|
||||
&& ivs == ovs /* in-place along V */
|
||||
|
||||
/* check for alignment/vector length restrictions, both for
|
||||
batchsize and for the remainder */
|
||||
&& (batchsz = compute_batchsize(r), 1)
|
||||
&& (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
|
||||
m, mb, mb + batchsz, 2, plnr))
|
||||
&& (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
|
||||
m, mb, me, 2, plnr))
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT ms,
|
||||
INT v, INT ivs, INT ovs,
|
||||
INT mb, INT me,
|
||||
R *rio, R *iio,
|
||||
const planner *plnr, INT *extra_iter)
|
||||
{
|
||||
if (ego->bufferedp) {
|
||||
*extra_iter = 0;
|
||||
if (!applicable0_buf(ego,
|
||||
r, irs, ors, m, ms, v, ivs, ovs, mb, me,
|
||||
rio, iio, plnr))
|
||||
return 0;
|
||||
} else {
|
||||
if (!applicable0(ego,
|
||||
r, irs, ors, m, ms, v, ivs, ovs, mb, me,
|
||||
rio, iio, plnr, extra_iter))
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
|
||||
v, m * r, r))
|
||||
return 0;
|
||||
|
||||
if (m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static plan *mkcldw(const ct_solver *ego_,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT ms,
|
||||
INT v, INT ivs, INT ovs,
|
||||
INT mstart, INT mcount,
|
||||
R *rio, R *iio,
|
||||
planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
P *pln;
|
||||
const ct_desc *e = ego->desc;
|
||||
INT extra_iter;
|
||||
|
||||
static const plan_adt padt = {
|
||||
0, awake, print, destroy
|
||||
};
|
||||
|
||||
A(mstart >= 0 && mstart + mcount <= m);
|
||||
if (!applicable(ego,
|
||||
r, irs, ors, m, ms, v, ivs, ovs, mstart, mstart + mcount,
|
||||
rio, iio, plnr, &extra_iter))
|
||||
return (plan *)0;
|
||||
|
||||
if (ego->bufferedp) {
|
||||
pln = MKPLAN_DFTW(P, &padt, apply_buf);
|
||||
} else {
|
||||
pln = MKPLAN_DFTW(P, &padt, extra_iter ? apply_extra_iter : apply);
|
||||
}
|
||||
|
||||
pln->k = ego->k;
|
||||
pln->rs = X(mkstride)(r, irs);
|
||||
pln->td = 0;
|
||||
pln->r = r;
|
||||
pln->m = m;
|
||||
pln->ms = ms;
|
||||
pln->v = v;
|
||||
pln->vs = ivs;
|
||||
pln->mb = mstart;
|
||||
pln->me = mstart + mcount;
|
||||
pln->slv = ego;
|
||||
pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
|
||||
pln->extra_iter = extra_iter;
|
||||
|
||||
X(ops_zero)(&pln->super.super.ops);
|
||||
X(ops_madd2)(v * (mcount/e->genus->vl), &e->ops, &pln->super.super.ops);
|
||||
|
||||
if (ego->bufferedp) {
|
||||
/* 8 load/stores * N * V */
|
||||
pln->super.super.ops.other += 8 * r * mcount * v;
|
||||
}
|
||||
|
||||
pln->super.super.could_prune_now_p =
|
||||
(!ego->bufferedp && r >= 5 && r < 64 && m >= r);
|
||||
return &(pln->super.super);
|
||||
}
|
||||
|
||||
static void regone(planner *plnr, kdftw codelet,
|
||||
const ct_desc *desc, int dec, int bufferedp)
|
||||
{
|
||||
S *slv = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0);
|
||||
slv->k = codelet;
|
||||
slv->desc = desc;
|
||||
slv->bufferedp = bufferedp;
|
||||
REGISTER_SOLVER(plnr, &(slv->super.super));
|
||||
if (X(mksolver_ct_hook)) {
|
||||
slv = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix,
|
||||
dec, mkcldw, 0);
|
||||
slv->k = codelet;
|
||||
slv->desc = desc;
|
||||
slv->bufferedp = bufferedp;
|
||||
REGISTER_SOLVER(plnr, &(slv->super.super));
|
||||
}
|
||||
}
|
||||
|
||||
void X(regsolver_ct_directw)(planner *plnr, kdftw codelet,
|
||||
const ct_desc *desc, int dec)
|
||||
{
|
||||
regone(plnr, codelet, desc, dec, /* bufferedp */ 0);
|
||||
regone(plnr, codelet, desc, dec, /* bufferedp */ 1);
|
||||
}
|
||||
@@ -0,0 +1,162 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/ct.h"
|
||||
|
||||
typedef struct {
|
||||
ct_solver super;
|
||||
const ct_desc *desc;
|
||||
kdftwsq k;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dftw super;
|
||||
kdftwsq k;
|
||||
INT r;
|
||||
stride rs, vs;
|
||||
INT m, ms, v, mb, me;
|
||||
twid *td;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
|
||||
/* Invoke the codelet once over [mb, me); both the r and v strides are
   passed to it, so there is no loop over v here. */
static void apply(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     const INT first = pln->mb;
     const INT mstride = pln->ms;
     const INT off = first * mstride;

     pln->k(rio + off, iio + off, pln->td->W, pln->rs, pln->vs,
            first, pln->me, mstride);
}
|
||||
|
||||
/* Forward the wakefulness change to X(twiddle_awake) for this plan's
   twiddle table. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;
     const INT radix = pln->r;
     const INT mlen = pln->m;

     X(twiddle_awake)(wakefulness, &pln->td, pln->slv->desc->tw,
                      radix * mlen, radix, mlen);
}
|
||||
|
||||
/* Release the two stride objects created in mkcldw. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(stride_destroy)(pln->rs);
     X(stride_destroy)(pln->vs);
}
|
||||
|
||||
/* Print the plan: radix, twiddle length, vector length, codelet name. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const ct_desc *d = pln->slv->desc;

     p->print(p, "(dftw-directsq-%D/%D%v \"%s\")",
              pln->r, X(twiddle_length)(pln->r, d->tw), pln->v, d->nam);
}
|
||||
|
||||
static int applicable(const S *ego,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT ms,
|
||||
INT v, INT ivs, INT ovs,
|
||||
INT mb, INT me,
|
||||
R *rio, R *iio,
|
||||
const planner *plnr)
|
||||
{
|
||||
const ct_desc *e = ego->desc;
|
||||
UNUSED(v);
|
||||
|
||||
return (
|
||||
1
|
||||
&& r == e->radix
|
||||
|
||||
/* transpose r, v */
|
||||
&& r == v
|
||||
&& irs == ovs
|
||||
&& ivs == ors
|
||||
|
||||
/* check for alignment/vector length restrictions */
|
||||
&& e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr)
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
static plan *mkcldw(const ct_solver *ego_,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT ms,
|
||||
INT v, INT ivs, INT ovs,
|
||||
INT mstart, INT mcount,
|
||||
R *rio, R *iio,
|
||||
planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
P *pln;
|
||||
const ct_desc *e = ego->desc;
|
||||
|
||||
static const plan_adt padt = {
|
||||
0, awake, print, destroy
|
||||
};
|
||||
|
||||
A(mstart >= 0 && mstart + mcount <= m);
|
||||
if (!applicable(ego,
|
||||
r, irs, ors, m, ms, v, ivs, ovs, mstart, mstart + mcount,
|
||||
rio, iio, plnr))
|
||||
return (plan *)0;
|
||||
|
||||
pln = MKPLAN_DFTW(P, &padt, apply);
|
||||
|
||||
pln->k = ego->k;
|
||||
pln->rs = X(mkstride)(r, irs);
|
||||
pln->vs = X(mkstride)(v, ivs);
|
||||
pln->td = 0;
|
||||
pln->r = r;
|
||||
pln->m = m;
|
||||
pln->ms = ms;
|
||||
pln->v = v;
|
||||
pln->mb = mstart;
|
||||
pln->me = mstart + mcount;
|
||||
pln->slv = ego;
|
||||
|
||||
X(ops_zero)(&pln->super.super.ops);
|
||||
X(ops_madd2)(mcount/e->genus->vl, &e->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
}
|
||||
|
||||
static void regone(planner *plnr, kdftwsq codelet,
|
||||
const ct_desc *desc, int dec)
|
||||
{
|
||||
S *slv = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0);
|
||||
slv->k = codelet;
|
||||
slv->desc = desc;
|
||||
REGISTER_SOLVER(plnr, &(slv->super.super));
|
||||
if (X(mksolver_ct_hook)) {
|
||||
slv = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix, dec,
|
||||
mkcldw, 0);
|
||||
slv->k = codelet;
|
||||
slv->desc = desc;
|
||||
REGISTER_SOLVER(plnr, &(slv->super.super));
|
||||
}
|
||||
}
|
||||
|
||||
void X(regsolver_ct_directwsq)(planner *plnr, kdftwsq codelet,
|
||||
const ct_desc *desc, int dec)
|
||||
{
|
||||
regone(plnr, codelet, desc, dec+TRANSPOSE);
|
||||
}
|
||||
@@ -0,0 +1,204 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* express a twiddle problem in terms of dft + multiplication by
|
||||
twiddle factors */
|
||||
|
||||
#include "dft/ct.h"
|
||||
|
||||
typedef ct_solver S;
|
||||
|
||||
typedef struct {
|
||||
plan_dftw super;
|
||||
|
||||
INT r, rs, m, mb, me, ms, v, vs;
|
||||
|
||||
plan *cld;
|
||||
|
||||
twid *td;
|
||||
|
||||
const S *slv;
|
||||
int dec;
|
||||
} P;
|
||||
|
||||
/* Pass the wakefulness change to X(twiddle_awake) using a TW_FULL
   instruction list for this plan's twiddle table. */
static void mktwiddle(P *ego, enum wakefulness wakefulness)
{
     static const tw_instr tw[] = { { TW_FULL, 0, 0 }, { TW_NEXT, 1, 0 } };
     const INT radix = ego->r;
     const INT mlen = ego->m;

     /* note that R and M are swapped, to allow for sequential
        access both to data and twiddles */
     X(twiddle_awake)(wakefulness, &ego->td, tw,
                      radix * mlen, mlen, radix);
}
|
||||
|
||||
/* Multiply the data in place by the twiddle table: for every vector
   element, every ir in [1, r) and every im in [max(mb, 1), me), the
   element (xr, xi) becomes (xr*wr + xi*wi, xi*wr - xr*wi), with (wr, wi)
   taken from W.  The ir == 0 and im == 0 entries are not touched. */
static void bytwiddle(const P *ego, R *rio, R *iio)
{
     const INT r = ego->r, rs = ego->rs;
     const INT m = ego->m, me = ego->me, ms = ego->ms;
     const INT v = ego->v, vs = ego->vs;
     const R *W = ego->td->W;
     INT mb = ego->mb;
     INT iv, ir, im;

     if (mb == 0)
          mb = 1;               /* skip m=0 iteration */

     for (iv = 0; iv < v; ++iv, rio += vs, iio += vs) {
          for (ir = 1; ir < r; ++ir) {
               /* integer offset of this row's twiddles within W */
               const INT wrow = (2 * (m-1)) * ir;

               for (im = mb; im < me; ++im) {
                    const INT pos = ms * im + rs * ir;
                    R *pr = rio + pos;
                    R *pi = iio + pos;
                    E xr = *pr;
                    E xi = *pi;
                    E wr = W[2 * im + wrow - 2];
                    E wi = W[2 * im + wrow - 1];
                    *pr = xr * wr + xi * wi;
                    *pi = xi * wr - xr * wi;
               }
          }
     }
}
|
||||
|
||||
static int applicable(INT irs, INT ors, INT ivs, INT ovs,
|
||||
const planner *plnr)
|
||||
{
|
||||
return (1
|
||||
&& irs == ors
|
||||
&& ivs == ovs
|
||||
&& !NO_SLOWP(plnr)
|
||||
);
|
||||
}
|
||||
|
||||
/* DIT order: twiddle multiplication first, then the child DFT applied
   in place at offset ms * mb. */
static void apply_dit(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     const INT off = pln->ms * pln->mb;
     plan_dft *child;

     bytwiddle(pln, rio, iio);

     child = (plan_dft *) pln->cld;
     child->apply(pln->cld, rio + off, iio + off, rio + off, iio + off);
}
|
||||
|
||||
/* DIF order: the child DFT applied in place at offset ms * mb first,
   then the twiddle multiplication. */
static void apply_dif(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     const INT off = pln->ms * pln->mb;
     plan_dft *child = (plan_dft *) pln->cld;

     child->apply(pln->cld, rio + off, iio + off, rio + off, iio + off);

     bytwiddle(pln, rio, iio);
}
|
||||
|
||||
/* Propagate the wakefulness change to the child plan, then to the
   twiddle table. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cld, wakefulness);
     mktwiddle(pln, wakefulness);
}
|
||||
|
||||
/* Destroy the child plan owned by this plan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld);
}
|
||||
|
||||
/* Print the plan: decimation kind, r, m, vector length and child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const char *kind;

     if (pln->dec == DECDIT)
          kind = "dit";
     else
          kind = "dif";

     p->print(p, "(dftw-generic-%s-%D-%D%v%(%p%))",
              kind, pln->r, pln->m, pln->v, pln->cld);
}
|
||||
|
||||
static plan *mkcldw(const ct_solver *ego_,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT ms,
|
||||
INT v, INT ivs, INT ovs,
|
||||
INT mstart, INT mcount,
|
||||
R *rio, R *iio,
|
||||
planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *)ego_;
|
||||
P *pln;
|
||||
plan *cld = 0;
|
||||
INT dm = ms * mstart;
|
||||
|
||||
static const plan_adt padt = {
|
||||
0, awake, print, destroy
|
||||
};
|
||||
|
||||
A(mstart >= 0 && mstart + mcount <= m);
|
||||
if (!applicable(irs, ors, ivs, ovs, plnr))
|
||||
return (plan *)0;
|
||||
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(
|
||||
X(mktensor_1d)(r, irs, irs),
|
||||
X(mktensor_2d)(mcount, ms, ms, v, ivs, ivs),
|
||||
rio + dm, iio + dm, rio + dm, iio + dm)
|
||||
);
|
||||
if (!cld) goto nada;
|
||||
|
||||
pln = MKPLAN_DFTW(P, &padt, ego->dec == DECDIT ? apply_dit : apply_dif);
|
||||
pln->slv = ego;
|
||||
pln->cld = cld;
|
||||
pln->r = r;
|
||||
pln->rs = irs;
|
||||
pln->m = m;
|
||||
pln->ms = ms;
|
||||
pln->v = v;
|
||||
pln->vs = ivs;
|
||||
pln->mb = mstart;
|
||||
pln->me = mstart + mcount;
|
||||
pln->dec = ego->dec;
|
||||
pln->td = 0;
|
||||
|
||||
{
|
||||
double n0 = (r - 1) * (mcount - 1) * v;
|
||||
pln->super.super.ops = cld->ops;
|
||||
pln->super.super.ops.mul += 8 * n0;
|
||||
pln->super.super.ops.add += 4 * n0;
|
||||
pln->super.super.ops.other += 8 * n0;
|
||||
}
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
/* Register one solver built with X(mksolver_ct) and, when
   X(mksolver_ct_hook) is set, a second one built with the hook. */
static void regsolver(planner *plnr, INT r, int dec)
{
     S *s = (S *)X(mksolver_ct)(sizeof(S), r, dec, mkcldw, 0);

     REGISTER_SOLVER(plnr, &(s->super));

     if (!X(mksolver_ct_hook))
          return;

     s = (S *)X(mksolver_ct_hook)(sizeof(S), r, dec, mkcldw, 0);
     REGISTER_SOLVER(plnr, &(s->super));
}
|
||||
|
||||
/* Register the generic twiddle solver with radix argument 0, once for
   DIT and once for DIF (in that order). */
void X(ct_generic_register)(planner *p)
{
     static const int decs[] = { DECDIT, DECDIF };
     unsigned i;

     for (i = 0; i < sizeof(decs) / sizeof(decs[0]); ++i)
          regsolver(p, 0, decs[i]);
}
|
||||
@@ -0,0 +1,231 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* express a twiddle problem in terms of dft + multiplication by
|
||||
twiddle factors */
|
||||
|
||||
#include "dft/ct.h"
|
||||
|
||||
typedef struct {
|
||||
ct_solver super;
|
||||
INT batchsz;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dftw super;
|
||||
|
||||
INT r, rs, m, ms, v, vs, mb, me;
|
||||
INT batchsz;
|
||||
plan *cld;
|
||||
|
||||
triggen *t;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
|
||||
#define BATCHDIST(r) ((r) + 16)
|
||||
|
||||
/**************************************************************/
|
||||
/* For every j in [0, r) and k in [mb, me), hand the input element
   (rio, iio)[j*rs + k*ms] to the trig generator's rotate routine with
   index j*k, writing into the buffer slot for row (k - mb), column j. */
static void bytwiddle(const P *ego, INT mb, INT me, R *buf, R *rio, R *iio)
{
     const INT r = ego->r, rs = ego->rs, ms = ego->ms;
     triggen *t = ego->t;
     INT j, k;

     for (j = 0; j < r; ++j) {
          k = mb;
          while (k < me) {
               const INT src = j * rs + k * ms;
               R *dst = &buf[j * 2 + 2 * BATCHDIST(r) * (k - mb) + 0];

               t->rotate(t, j * k, rio[src], iio[src], dst);
               ++k;
          }
     }
}
|
||||
|
||||
/* Structural applicability: single vector, in-place along R, mcount a
   positive multiple of the solver's batch size, r >= 64 and m >= r.
   Returns 1 or 0. */
static int applicable0(const S *ego,
                       INT r, INT irs, INT ors,
                       INT m, INT v,
                       INT mcount)
{
     if (v != 1)
          return 0;
     if (irs != ors)
          return 0;
     if (mcount < ego->batchsz)
          return 0;
     if (mcount % ego->batchsz != 0)
          return 0;
     if (r < 64)
          return 0;
     if (m < r)
          return 0;
     return 1;
}
|
||||
|
||||
static int applicable(const S *ego,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT v,
|
||||
INT mcount,
|
||||
const planner *plnr)
|
||||
{
|
||||
if (!applicable0(ego, r, irs, ors, m, v, mcount))
|
||||
return 0;
|
||||
if (NO_UGLYP(plnr) && m * r < 65536)
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Process the m indices [mb, me): fill the buffer via bytwiddle(), run
   the child plan in place on the buffer, and copy the result out to
   rio/iio at offset ms * mb. */
static void dobatch(const P *ego, INT mb, INT me, R *buf, R *rio, R *iio)
{
     const INT mstride = ego->ms;
     plan_dft *child;

     bytwiddle(ego, mb, me, buf, rio, iio);

     child = (plan_dft *) ego->cld;
     child->apply(ego->cld, buf, buf + 1, buf, buf + 1);

     X(cpy2d_pair_co)(buf, buf + 1,
                      rio + mstride * mb, iio + mstride * mb,
                      me-mb, 2 * BATCHDIST(ego->r), mstride,
                      ego->r, 2, ego->rs);
}
|
||||
|
||||
/* Allocate a scratch buffer, walk [mb, me) in steps of batchsz calling
   dobatch() on each chunk, then free the buffer.  The assertion checks
   that the range was an exact multiple of the batch size. */
static void apply(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     R *buf = (R *) MALLOC(sizeof(R) * 2 * BATCHDIST(pln->r) * pln->batchsz,
                           BUFFERS);
     INT lo = pln->mb;

     while (lo < pln->me) {
          dobatch(pln, lo, lo + pln->batchsz, buf, rio, iio);
          lo += pln->batchsz;
     }

     A(lo == pln->me);

     X(ifree)(buf);
}
|
||||
|
||||
/* Propagate the wakefulness change to the child plan, then destroy the
   trig generator when going SLEEPY or create one (size r * m) for any
   other state. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cld, wakefulness);

     if (wakefulness == SLEEPY) {
          X(triggen_destroy)(pln->t);
          pln->t = 0;
     } else {
          pln->t = X(mktriggen)(AWAKE_SQRTN_TABLE, pln->r * pln->m);
     }
}
|
||||
|
||||
/* Destroy the child plan owned by this plan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld);
}
|
||||
|
||||
/* Print the plan: batch size, r, m and the child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;

     p->print(p, "(dftw-genericbuf/%D-%D-%D%(%p%))",
              pln->batchsz, pln->r, pln->m, pln->cld);
}
|
||||
|
||||
static plan *mkcldw(const ct_solver *ego_,
|
||||
INT r, INT irs, INT ors,
|
||||
INT m, INT ms,
|
||||
INT v, INT ivs, INT ovs,
|
||||
INT mstart, INT mcount,
|
||||
R *rio, R *iio,
|
||||
planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *)ego_;
|
||||
P *pln;
|
||||
plan *cld = 0;
|
||||
R *buf;
|
||||
|
||||
static const plan_adt padt = {
|
||||
0, awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ivs); UNUSED(ovs); UNUSED(rio); UNUSED(iio);
|
||||
|
||||
A(mstart >= 0 && mstart + mcount <= m);
|
||||
if (!applicable(ego, r, irs, ors, m, v, mcount, plnr))
|
||||
return (plan *)0;
|
||||
|
||||
buf = (R *) MALLOC(sizeof(R) * 2 * BATCHDIST(r) * ego->batchsz, BUFFERS);
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(
|
||||
X(mktensor_1d)(r, 2, 2),
|
||||
X(mktensor_1d)(ego->batchsz,
|
||||
2 * BATCHDIST(r),
|
||||
2 * BATCHDIST(r)),
|
||||
buf, buf + 1, buf, buf + 1
|
||||
)
|
||||
);
|
||||
X(ifree)(buf);
|
||||
if (!cld) goto nada;
|
||||
|
||||
pln = MKPLAN_DFTW(P, &padt, apply);
|
||||
pln->slv = ego;
|
||||
pln->cld = cld;
|
||||
pln->r = r;
|
||||
pln->m = m;
|
||||
pln->ms = ms;
|
||||
pln->rs = irs;
|
||||
pln->batchsz = ego->batchsz;
|
||||
pln->mb = mstart;
|
||||
pln->me = mstart + mcount;
|
||||
|
||||
{
|
||||
double n0 = (r - 1) * (mcount - 1);
|
||||
pln->super.super.ops = cld->ops;
|
||||
pln->super.super.ops.mul += 8 * n0;
|
||||
pln->super.super.ops.add += 4 * n0;
|
||||
pln->super.super.ops.other += 8 * n0;
|
||||
}
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
/* Register one DIT solver with the given radix argument and batch size
   built with X(mksolver_ct) and, when X(mksolver_ct_hook) is set, a
   second one built with the hook. */
static void regsolver(planner *plnr, INT r, INT batchsz)
{
     S *s;

     s = (S *)X(mksolver_ct)(sizeof(S), r, DECDIT, mkcldw, 0);
     s->batchsz = batchsz;
     REGISTER_SOLVER(plnr, &(s->super.super));

     if (!X(mksolver_ct_hook))
          return;

     s = (S *)X(mksolver_ct_hook)(sizeof(S), r, DECDIT, mkcldw, 0);
     s->batchsz = batchsz;
     REGISTER_SOLVER(plnr, &(s->super.super));
}
|
||||
|
||||
/* Register a solver for every (radix argument, batch size) combination
   from the two tables below, radix-major. */
void X(ct_genericbuf_register)(planner *p)
{
     static const INT radices[] = { -1, -2, -4, -8, -16, -32, -64 };
     static const INT batchsizes[] = { 4, 8, 16, 32, 64 };
     const unsigned nradices = sizeof(radices) / sizeof(radices[0]);
     const unsigned nbatchsizes = sizeof(batchsizes) / sizeof(batchsizes[0]);
     unsigned ir, ib;

     for (ir = 0; ir < nradices; ++ir) {
          for (ib = 0; ib < nbatchsizes; ++ib) {
               regsolver(p, radices[ir], batchsizes[ib]);
          }
     }
}
|
||||
@@ -0,0 +1,293 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* direct DFT solver, if we have a codelet */
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
const kdft_desc *desc;
|
||||
kdft k;
|
||||
int bufferedp;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
|
||||
stride is, os, bufstride;
|
||||
INT n, vl, ivs, ovs;
|
||||
kdft k;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
/* Process one batch of batchsz transforms through the buffer buf:
   copy the input into buf (real parts at buf, imaginary parts at
   buf + 1), then run the codelet either straight into the output or
   back into buf followed by a copy to the output. */
static void dobatch(const P *ego, R *ri, R *ii, R *ro, R *io,
                    R *buf, INT batchsz)
{
     R *bufr = buf;
     R *bufi = buf + 1;

     X(cpy2d_pair_ci)(ri, ii, bufr, bufi,
                      ego->n, WS(ego->is, 1), WS(ego->bufstride, 1),
                      batchsz, ego->ivs, 2);

     if (IABS(WS(ego->os, 1)) >= IABS(ego->ovs)) {
          /* transform to buffer and copy back */
          ego->k(bufr, bufi, bufr, bufi,
                 ego->bufstride, ego->bufstride, batchsz, 2, 2);
          X(cpy2d_pair_co)(bufr, bufi, ro, io,
                           ego->n, WS(ego->bufstride, 1), WS(ego->os, 1),
                           batchsz, 2, ego->ovs);
          return;
     }

     /* transform directly to output */
     ego->k(bufr, bufi, ro, io,
            ego->bufstride, ego->os, batchsz, 2, ego->ovs);
}
|
||||
|
||||
/* Batch size used for a transform of size n: n rounded up to a
   multiple of 4, plus 2. */
static INT compute_batchsize(INT n)
{
     const INT rounded = (n + 3) & -4; /* round up to multiple of 4 */

     return rounded + 2;
}
|
||||
|
||||
/* Buffered apply: walk the vl transforms in batches of
   compute_batchsize(n), handing each batch to dobatch.  The last
   call receives whatever remains (at most one full batch). */
static void apply_buf(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const INT vl = ego->vl;
     const INT n = ego->n;
     const INT batchsz = compute_batchsize(n);
     const INT istep = batchsz * ego->ivs; /* input advance per batch */
     const INT ostep = batchsz * ego->ovs; /* output advance per batch */
     INT done = 0;
     R *buf;
     size_t bufsz = n * batchsz * 2 * sizeof(R);

     BUF_ALLOC(R *, buf, bufsz);

     while (done < vl - batchsz) {
          dobatch(ego, ri, ii, ro, io, buf, batchsz);
          ri += istep;
          ii += istep;
          ro += ostep;
          io += ostep;
          done += batchsz;
     }
     dobatch(ego, ri, ii, ro, io, buf, vl - done);

     BUF_FREE(buf, bufsz);
}
|
||||
|
||||
/* Unbuffered apply: a single codelet call covers all vl transforms. */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *pln = (const P *) ego_;

     ASSERT_ALIGNED_DOUBLE;

     pln->k(ri, ii, ro, io,
            pln->is, pln->os,
            pln->vl, pln->ivs, pln->ovs);
}
|
||||
|
||||
/* Apply variant chosen by mkplan when the codelet rejects the full
   vector length but accepts vl - 1 (see applicable). */
static void apply_extra_iter(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const INT last = ego->vl - 1;        /* index of the final transform */
     const INT ioff = last * ego->ivs;
     const INT ooff = last * ego->ovs;

     ASSERT_ALIGNED_DOUBLE;

     /* for 4-way SIMD when VL is odd: iterate over an
        even vector length VL, and then execute the last
        iteration as a 2-vector with vector stride 0. */
     ego->k(ri, ii, ro, io, ego->is, ego->os, last, ego->ivs, ego->ovs);

     /* the remaining transform, passed with count 1 and zero vector
        strides */
     ego->k(ri + ioff, ii + ioff, ro + ooff, io + ooff,
            ego->is, ego->os, 1, 0, 0);
}
|
||||
|
||||
/* Release the three stride objects created by mkplan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(stride_destroy)(pln->is);
     X(stride_destroy)(pln->os);
     X(stride_destroy)(pln->bufstride);
}
|
||||
|
||||
/* Print the plan; the buffered variant also reports its batch size. */
static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     const S *slv = ego->slv;
     const kdft_desc *desc = slv->desc;

     if (!slv->bufferedp) {
          p->print(p, "(dft-direct-%D%v \"%s\")",
                   desc->sz, ego->vl, desc->nam);
          return;
     }

     p->print(p, "(dft-directbuf/%D-%D%v \"%s\")",
              compute_batchsize(desc->sz), desc->sz, ego->vl, desc->nam);
}
|
||||
|
||||
static int applicable_buf(const solver *ego_, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
const kdft_desc *d = ego->desc;
|
||||
INT vl;
|
||||
INT ivs, ovs;
|
||||
INT batchsz;
|
||||
|
||||
return (
|
||||
1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk == 1
|
||||
&& p->sz->dims[0].n == d->sz
|
||||
|
||||
/* check strides etc */
|
||||
&& X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
|
||||
|
||||
/* UGLY if IS <= IVS */
|
||||
&& !(NO_UGLYP(plnr) &&
|
||||
X(iabs)(p->sz->dims[0].is) <= X(iabs)(ivs))
|
||||
|
||||
&& (batchsz = compute_batchsize(d->sz), 1)
|
||||
&& (d->genus->okp(d, 0, ((const R *)0) + 1, p->ro, p->io,
|
||||
2 * batchsz, p->sz->dims[0].os,
|
||||
batchsz, 2, ovs, plnr))
|
||||
&& (d->genus->okp(d, 0, ((const R *)0) + 1, p->ro, p->io,
|
||||
2 * batchsz, p->sz->dims[0].os,
|
||||
vl % batchsz, 2, ovs, plnr))
|
||||
|
||||
|
||||
&& (0
|
||||
/* can operate out-of-place */
|
||||
|| p->ri != p->ro
|
||||
|
||||
/* can operate in-place as long as strides are the same */
|
||||
|| X(tensor_inplace_strides2)(p->sz, p->vecsz)
|
||||
|
||||
/* can do it if the problem fits in the buffer, no matter
|
||||
what the strides are */
|
||||
|| vl <= batchsz
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_,
|
||||
const planner *plnr, int *extra_iterp)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
const kdft_desc *d = ego->desc;
|
||||
INT vl;
|
||||
INT ivs, ovs;
|
||||
|
||||
return (
|
||||
1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->dims[0].n == d->sz
|
||||
|
||||
/* check strides etc */
|
||||
&& X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
|
||||
|
||||
&& ((*extra_iterp = 0,
|
||||
(d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
|
||||
p->sz->dims[0].is, p->sz->dims[0].os,
|
||||
vl, ivs, ovs, plnr)))
|
||||
||
|
||||
(*extra_iterp = 1,
|
||||
((d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
|
||||
p->sz->dims[0].is, p->sz->dims[0].os,
|
||||
vl - 1, ivs, ovs, plnr))
|
||||
&&
|
||||
(d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
|
||||
p->sz->dims[0].is, p->sz->dims[0].os,
|
||||
2, 0, 0, plnr)))))
|
||||
|
||||
&& (0
|
||||
/* can operate out-of-place */
|
||||
|| p->ri != p->ro
|
||||
|
||||
/* can always compute one transform */
|
||||
|| vl == 1
|
||||
|
||||
/* can operate in-place as long as strides are the same */
|
||||
|| X(tensor_inplace_strides2)(p->sz, p->vecsz)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
/* Build a plan for problem p_ if this solver applies, else return 0.
   The apply function is picked from the solver's bufferedp flag and,
   for the unbuffered case, from the extra_iterp result of
   applicable(). */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const S *ego = (const S *) ego_;
     const kdft_desc *e = ego->desc;
     const problem_dft *p;
     const iodim *d;
     plan *base;
     P *pln;
     INT n;

     static const plan_adt padt = {
          X(dft_solve), X(null_awake), print, destroy
     };

     UNUSED(plnr);

     if (!ego->bufferedp) {
          int extra_iterp = 0;

          if (!applicable(ego_, p_, plnr, &extra_iterp))
               return (plan *)0;
          pln = MKPLAN_DFT(P, &padt, extra_iterp ? apply_extra_iter : apply);
     } else {
          if (!applicable_buf(ego_, p_, plnr))
               return (plan *)0;
          pln = MKPLAN_DFT(P, &padt, apply_buf);
     }

     p = (const problem_dft *) p_;
     d = p->sz->dims;
     n = d[0].n;

     pln->k = ego->k;
     pln->n = n;
     pln->is = X(mkstride)(n, d[0].is);
     pln->os = X(mkstride)(n, d[0].os);
     pln->bufstride = X(mkstride)(n, 2 * compute_batchsize(n));

     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
     pln->slv = ego;

     /* cost: the codelet's ops, scaled by vl / genus->vl ... */
     base = &(pln->super.super);
     X(ops_zero)(&base->ops);
     X(ops_madd2)(pln->vl / e->genus->vl, &e->ops, &base->ops);

     /* ... plus an extra 4 * n * vl "other" ops when buffered */
     if (ego->bufferedp)
          base->ops.other += 4 * pln->n * pln->vl;

     base->could_prune_now_p = !ego->bufferedp;
     return base;
}
|
||||
|
||||
/* Allocate a solver for codelet k described by desc; bufferedp
   selects the buffered variant. */
static solver *mksolver(kdft k, const kdft_desc *desc, int bufferedp)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *s = MKSOLVER(S, &sadt);

     s->bufferedp = bufferedp;
     s->desc = desc;
     s->k = k;

     return &(s->super);
}
|
||||
|
||||
/* Public constructor: unbuffered direct solver (bufferedp == 0). */
solver *X(mksolver_dft_direct)(kdft k, const kdft_desc *desc)
{
     const int bufferedp = 0;

     return mksolver(k, desc, bufferedp);
}
|
||||
|
||||
/* Public constructor: buffered direct solver (bufferedp == 1). */
solver *X(mksolver_dft_directbuf)(kdft k, const kdft_desc *desc)
{
     const int bufferedp = 1;

     return mksolver(k, desc, bufferedp);
}
|
||||
@@ -0,0 +1,169 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
twid *td;
|
||||
INT n, is, os;
|
||||
} P;
|
||||
|
||||
|
||||
/* Accumulate four sums over the buffer x (laid out as written by
   hartley) weighted by consecutive pairs from w, then combine them
   into two complex outputs:
       (*or0, *oi0) = (rr + ii, ir - ri)
       (*or1, *oi1) = (rr - ii, ir + ri)
   rr and ir start from x[0] and x[1]; ri and ii start from 0. */
static void cdot(INT n, const E *x, const R *w,
                 R *or0, R *oi0, R *or1, R *oi1)
{
     INT i;
     E rr = x[0], ri = 0, ir = x[1], ii = 0;

     for (i = 1; i + i < n; ++i) {
          /* group i occupies x[4i-2 .. 4i+1] and w[2i-2 .. 2i-1] */
          const E *xg = x + (4 * i - 2);
          const R *wg = w + (2 * i - 2);

          rr += xg[0] * wg[0];
          ir += xg[1] * wg[0];
          ri += xg[2] * wg[1];
          ii += xg[3] * wg[1];
     }

     *or0 = rr + ii;
     *oi0 = ir - ri;
     *or1 = rr - ii;
     *oi1 = ir + ri;
}
|
||||
|
||||
/* Fill the buffer o from the strided complex input (xr, xi):
   o[0], o[1] receive element 0; then, for each i with 2i < n, four
   values follow: the sums x[i] + x[n-i] (real, imaginary) and the
   differences x[i] - x[n-i] (real, imaginary).  *pr and *pi receive
   element 0 plus all the stored sums; they are written last, after
   every input element has been read. */
static void hartley(INT n, const R *xr, const R *xi, INT xs, E *o,
                    R *pr, R *pi)
{
     INT i;
     E sr, si;

     o[0] = sr = xr[0];
     o[1] = si = xi[0];
     o += 2;

     for (i = 1; i + i < n; ++i, o += 4) {
          const R ar = xr[i * xs], br = xr[(n - i) * xs];
          const R ai = xi[i * xs], bi = xi[(n - i) * xs];

          o[0] = ar + br;
          sr += o[0];
          o[1] = ai + bi;
          si += o[1];
          o[2] = ar - br;
          o[3] = ai - bi;
     }

     *pr = sr;
     *pi = si;
}
|
||||
|
||||
/* Execute the plan: hartley() preprocesses the input into a
   temporary buffer and writes output element 0; each cdot() call
   then produces the output pair (i, n - i), using a fresh run of
   n - 1 twiddle values per call. */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const INT n = ego->n;
     const INT is = ego->is;
     const INT os = ego->os;
     const R *W = ego->td->W;
     INT i;
     E *buf;
     size_t bufsz = n * 2 * sizeof(E);

     BUF_ALLOC(E *, buf, bufsz);
     hartley(n, ri, ii, is, buf, ro, io);

     for (i = 1; i + i < n; ++i) {
          const INT lo = i * os;          /* offset of output i */
          const INT hi = (n - i) * os;    /* offset of output n - i */

          cdot(n, buf, W + (i - 1) * (n - 1),
               ro + lo, io + lo,
               ro + hi, io + hi);
     }

     BUF_FREE(buf, bufsz);
}
|
||||
|
||||
/* Hand the plan's twiddle table to X(twiddle_awake) for the
   requested wakefulness, using the TW_HALF instruction list. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     static const tw_instr half_tw[] = {
          { TW_HALF, 1, 0 },
          { TW_NEXT, 1, 0 }
     };
     P *pln = (P *) ego_;
     const INT n = pln->n;

     X(twiddle_awake)(wakefulness, &pln->td, half_tw, n, n, (n - 1) / 2);
}
|
||||
|
||||
/* Print the plan as "(dft-generic-N)". */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const INT n = pln->n;

     p->print(p, "(dft-generic-%D)", n);
}
|
||||
|
||||
static int applicable(const solver *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
UNUSED(ego);
|
||||
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk == 0
|
||||
&& (p->sz->dims[0].n % 2) == 1
|
||||
&& CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
|
||||
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
|
||||
&& X(is_prime)(p->sz->dims[0].n)
|
||||
);
|
||||
}
|
||||
|
||||
/* Build a generic-DFT plan for p_, or return 0 if not applicable.
   The twiddle table is left unset here; awake() manages it. */
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, X(plan_null_destroy)
     };
     const problem_dft *p;
     const iodim *d;
     P *pln;
     INT n;

     if (!applicable(ego, p_, plnr))
          return (plan *)0;

     pln = MKPLAN_DFT(P, &padt, apply);

     p = (const problem_dft *) p_;
     d = p->sz->dims;
     n = d[0].n;

     pln->n = n;
     pln->is = d[0].is;
     pln->os = d[0].os;
     pln->td = 0;

     pln->super.super.ops.add = (n-1) * 5;
     pln->super.super.ops.mul = 0;
     pln->super.super.ops.fma = (n-1) * (n-1) ;
     /* ops.other is deliberately not charged: the loads are nice
        pipelined sequential loads and should cost nothing
        (approximately (n-1)*(4 + 1 + 2 * (n-1)) otherwise) */

     return &(pln->super.super);
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register the generic solver with planner p. */
void X(dft_generic_register)(planner *p)
{
     solver *s = mksolver();

     REGISTER_SOLVER(p, s);
}
|
||||
@@ -0,0 +1,234 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* solvers/plans for vectors of DFTs corresponding to the columns
|
||||
of a matrix: first transpose the matrix so that the DFTs are
|
||||
contiguous, then do DFTs with transposed output. In particular,
|
||||
we restrict ourselves to the case of a square transpose (or a
|
||||
sequence thereof). */
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef solver S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
INT vl, ivs, ovs;
|
||||
plan *cldtrans, *cld, *cldrest;
|
||||
} P;
|
||||
|
||||
/* initial transpose is out-of-place from input to output */
|
||||
static void apply_op(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const plan_dft *cldtrans = (const plan_dft *) ego->cldtrans;
     const plan_dft *cld = (const plan_dft *) ego->cld;
     const plan_dft *cldrest = (const plan_dft *) ego->cldrest;
     const INT ivs = ego->ivs, ovs = ego->ovs;
     INT left;

     /* vl passes: transpose into the output, then transform the
        output in place */
     for (left = ego->vl; left > 0; --left) {
          cldtrans->apply(ego->cldtrans, ri, ii, ro, io);
          cld->apply(ego->cld, ro, io, ro, io);

          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
     }

     /* the pointers now sit past the last pass; cldrest runs there */
     cldrest->apply(ego->cldrest, ri, ii, ro, io);
}
|
||||
|
||||
/* Destroy the three child plans (rest, transform, transpose). */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cldrest);
     X(plan_destroy_internal)(pln->cld);
     X(plan_destroy_internal)(pln->cldtrans);
}
|
||||
|
||||
/* Propagate the wakefulness state to the three child plans. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cldtrans, wakefulness);
     X(plan_awake)(pln->cld, wakefulness);
     X(plan_awake)(pln->cldrest, wakefulness);
}
|
||||
|
||||
/* Print the plan together with its three children. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;

     p->print(p, "(indirect-transpose%v%(%p%)%(%p%)%(%p%))",
              pln->vl, pln->cldtrans, pln->cld, pln->cldrest);
}
|
||||
|
||||
static int pickdim(const tensor *vs, const tensor *s, int *pdim0, int *pdim1)
|
||||
{
|
||||
int dim0, dim1;
|
||||
*pdim0 = *pdim1 = -1;
|
||||
for (dim0 = 0; dim0 < vs->rnk; ++dim0)
|
||||
for (dim1 = 0; dim1 < s->rnk; ++dim1)
|
||||
if (vs->dims[dim0].n * X(iabs)(vs->dims[dim0].is) <= X(iabs)(s->dims[dim1].is)
|
||||
&& vs->dims[dim0].n >= s->dims[dim1].n
|
||||
&& (*pdim0 == -1
|
||||
|| (X(iabs)(vs->dims[dim0].is) <= X(iabs)(vs->dims[*pdim0].is)
|
||||
&& X(iabs)(s->dims[dim1].is) >= X(iabs)(s->dims[*pdim1].is)))) {
|
||||
*pdim0 = dim0;
|
||||
*pdim1 = dim1;
|
||||
}
|
||||
return (*pdim0 != -1 && *pdim1 != -1);
|
||||
}
|
||||
|
||||
static int applicable0(const solver *ego_, const problem *p_,
|
||||
const planner *plnr,
|
||||
int *pdim0, int *pdim1)
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
UNUSED(ego_); UNUSED(plnr);
|
||||
|
||||
return (1
|
||||
&& FINITE_RNK(p->vecsz->rnk) && FINITE_RNK(p->sz->rnk)
|
||||
|
||||
/* FIXME: can/should we relax this constraint? */
|
||||
&& X(tensor_inplace_strides2)(p->vecsz, p->sz)
|
||||
|
||||
&& pickdim(p->vecsz, p->sz, pdim0, pdim1)
|
||||
|
||||
/* output should not *already* include the transpose
|
||||
(in which case we duplicate the regular indirect.c) */
|
||||
&& (p->sz->dims[*pdim1].os != p->vecsz->dims[*pdim0].is)
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_,
|
||||
const planner *plnr,
|
||||
int *pdim0, int *pdim1)
|
||||
{
|
||||
if (!applicable0(ego_, p_, plnr, pdim0, pdim1)) return 0;
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
INT u = p->ri == p->ii + 1 || p->ii == p->ri + 1 ? (INT)2 : (INT)1;
|
||||
|
||||
/* UGLY if does not result in contiguous transforms or
|
||||
transforms of contiguous vectors (since the latter at
|
||||
least have efficient transpositions) */
|
||||
if (NO_UGLYP(plnr)
|
||||
&& p->vecsz->dims[*pdim0].is != u
|
||||
&& !(p->vecsz->rnk == 2
|
||||
&& p->vecsz->dims[1-*pdim0].is == u
|
||||
&& p->vecsz->dims[*pdim0].is
|
||||
== u * p->vecsz->dims[1-*pdim0].n))
|
||||
return 0;
|
||||
|
||||
if (NO_INDIRECT_OP_P(plnr) && p->ri != p->ro) return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Build an indirect-transpose plan, or return 0.  Three child plans
   are created in order -- transpose (cldtrans), in-place transform
   (cld), and remainder (cldrest) -- and all are released if any of
   them cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const problem_dft *p = (const problem_dft *) p_;
     const iodim *vd;   /* chosen vector dimension of the problem */
     const iodim *sd;   /* chosen transform dimension of the problem */
     P *pln;
     plan *cld = 0, *cldtrans = 0, *cldrest = 0;
     int pdim0, pdim1;
     tensor *ts, *tv;
     INT vl, ivs, ovs;
     R *rit, *iit, *rot, *iot;

     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };

     if (!applicable(ego_, p_, plnr, &pdim0, &pdim1))
          return (plan *) 0;

     vd = p->vecsz->dims + pdim0;
     sd = p->sz->dims + pdim1;

     /* number of square blocks and the stride between them */
     vl = vd->n / sd->n;
     A(vl >= 1);
     ivs = sd->n * vd->is;
     ovs = sd->n * vd->os;
     rit = TAINT(p->ri, vl == 1 ? 0 : ivs);
     iit = TAINT(p->ii, vl == 1 ? 0 : ivs);
     rot = TAINT(p->ro, vl == 1 ? 0 : ovs);
     iot = TAINT(p->io, vl == 1 ? 0 : ovs);

     /* child 1: rank-0 problem (copy) with the two chosen
        dimensions' output strides exchanged */
     ts = X(tensor_copy_inplace)(p->sz, INPLACE_IS);
     ts->dims[pdim1].os = vd->is;
     tv = X(tensor_copy_inplace)(p->vecsz, INPLACE_IS);
     tv->dims[pdim0].os = sd->is;
     tv->dims[pdim0].n = sd->n;
     cldtrans = X(mkplan_d)(plnr,
                            X(mkproblem_dft_d)(X(mktensor_0d)(),
                                               X(tensor_append)(tv, ts),
                                               rit, iit,
                                               rot, iot));
     X(tensor_destroy2)(ts, tv);
     if (!cldtrans)
          goto nada;

     /* child 2: the transform itself, in place on the output, with
        the two chosen dimensions' input strides exchanged */
     ts = X(tensor_copy)(p->sz);
     ts->dims[pdim1].is = vd->is;
     tv = X(tensor_copy)(p->vecsz);
     tv->dims[pdim0].is = sd->is;
     tv->dims[pdim0].n = sd->n;
     cld = X(mkplan_d)(plnr, X(mkproblem_dft_d)(ts, tv,
                                                rot, iot,
                                                rot, iot));
     if (!cld)
          goto nada;

     /* child 3: whatever part of the vector dimension the vl blocks
        did not cover, starting right after them */
     tv = X(tensor_copy)(p->vecsz);
     tv->dims[pdim0].n -= vl * sd->n;
     cldrest = X(mkplan_d)(plnr, X(mkproblem_dft_d)(X(tensor_copy)(p->sz), tv,
                                                    p->ri + ivs * vl,
                                                    p->ii + ivs * vl,
                                                    p->ro + ovs * vl,
                                                    p->io + ovs * vl));
     if (!cldrest)
          goto nada;

     pln = MKPLAN_DFT(P, &padt, apply_op);
     pln->cldtrans = cldtrans;
     pln->cld = cld;
     pln->cldrest = cldrest;
     pln->vl = vl;
     pln->ivs = ivs;
     pln->ovs = ovs;

     /* cost = cldrest + vl * (cld + cldtrans) */
     X(ops_cpy)(&cldrest->ops, &pln->super.super.ops);
     X(ops_madd2)(vl, &cld->ops, &pln->super.super.ops);
     X(ops_madd2)(vl, &cldtrans->ops, &pln->super.super.ops);
     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cldrest);
     X(plan_destroy_internal)(cld);
     X(plan_destroy_internal)(cldtrans);
     return (plan *)0;
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return slv;
|
||||
}
|
||||
|
||||
/* Register the indirect-transpose solver with planner p. */
void X(dft_indirect_transpose_register)(planner *p)
{
     solver *s = mksolver();

     REGISTER_SOLVER(p, s);
}
|
||||
@@ -0,0 +1,240 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/* solvers/plans for vectors of small DFT's that cannot be done
|
||||
in-place directly. Use a rank-0 plan to rearrange the data
|
||||
before or after the transform. Can also change an out-of-place
|
||||
plan into a copy + in-place (where the in-place transform
|
||||
is e.g. unit stride). */
|
||||
|
||||
/* FIXME: merge with rank-geq2.c(?), since this is just a special case
|
||||
of a rank split where the first/second transform has rank 0. */
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef problem *(*mkcld_t) (const problem_dft *p);
|
||||
|
||||
typedef struct {
|
||||
dftapply apply;
|
||||
problem *(*mkcld)(const problem_dft *p);
|
||||
const char *nam;
|
||||
} ndrct_adt;
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
const ndrct_adt *adt;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
plan *cldcpy, *cld;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* first rearrange, then transform */
|
||||
/* Copy input to output with cldcpy, then run cld in place on the
   output. */
static void apply_before(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const plan_dft *cpy = (const plan_dft *) ego->cldcpy;
     const plan_dft *xform = (const plan_dft *) ego->cld;

     cpy->apply(ego->cldcpy, ri, ii, ro, io);
     xform->apply(ego->cld, ro, io, ro, io);
}
|
||||
|
||||
static problem *mkcld_before(const problem_dft *p)
|
||||
{
|
||||
return X(mkproblem_dft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_OS),
|
||||
X(tensor_copy_inplace)(p->vecsz, INPLACE_OS),
|
||||
p->ro, p->io, p->ro, p->io);
|
||||
}
|
||||
|
||||
static const ndrct_adt adt_before =
|
||||
{
|
||||
apply_before, mkcld_before, "dft-indirect-before"
|
||||
};
|
||||
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* first transform, then rearrange */
|
||||
|
||||
/* Run cld in place on the input, then copy input to output with
   cldcpy.  Note that this overwrites the input arrays. */
static void apply_after(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const plan_dft *xform = (const plan_dft *) ego->cld;
     const plan_dft *cpy = (const plan_dft *) ego->cldcpy;

     xform->apply(ego->cld, ri, ii, ri, ii);
     cpy->apply(ego->cldcpy, ri, ii, ro, io);
}
|
||||
|
||||
static problem *mkcld_after(const problem_dft *p)
|
||||
{
|
||||
return X(mkproblem_dft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_IS),
|
||||
X(tensor_copy_inplace)(p->vecsz, INPLACE_IS),
|
||||
p->ri, p->ii, p->ri, p->ii);
|
||||
}
|
||||
|
||||
static const ndrct_adt adt_after =
|
||||
{
|
||||
apply_after, mkcld_after, "dft-indirect-after"
|
||||
};
|
||||
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* Destroy both child plans. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld);
     X(plan_destroy_internal)(pln->cldcpy);
}
|
||||
|
||||
/* Propagate the wakefulness state to both child plans. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cldcpy, wakefulness);
     X(plan_awake)(pln->cld, wakefulness);
}
|
||||
|
||||
/* Print the flavour name followed by the transform and copy
   children. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const char *nam = pln->slv->adt->nam;

     p->print(p, "(%s%(%p%)%(%p%))", nam, pln->cld, pln->cldcpy);
}
|
||||
|
||||
static int applicable0(const solver *ego_, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
return (1
|
||||
&& FINITE_RNK(p->vecsz->rnk)
|
||||
|
||||
/* problem must be a nontrivial transform, not just a copy */
|
||||
&& p->sz->rnk > 0
|
||||
|
||||
&& (0
|
||||
|
||||
/* problem must be in-place & require some
|
||||
rearrangement of the data; to prevent
|
||||
infinite loops with indirect-transpose, we
|
||||
further require that at least some transform
|
||||
strides must decrease */
|
||||
|| (p->ri == p->ro
|
||||
&& !X(tensor_inplace_strides2)(p->sz, p->vecsz)
|
||||
&& X(tensor_strides_decrease)(
|
||||
p->sz, p->vecsz,
|
||||
ego->adt->apply == apply_after ?
|
||||
INPLACE_IS : INPLACE_OS))
|
||||
|
||||
/* or problem must be out of place, transforming
|
||||
from stride 1/2 to bigger stride, for apply_after */
|
||||
|| (p->ri != p->ro && ego->adt->apply == apply_after
|
||||
&& !NO_DESTROY_INPUTP(plnr)
|
||||
&& X(tensor_min_istride)(p->sz) <= 2
|
||||
&& X(tensor_min_ostride)(p->sz) > 2)
|
||||
|
||||
/* or problem must be out of place, transforming
|
||||
to stride 1/2 from bigger stride, for apply_before */
|
||||
|| (p->ri != p->ro && ego->adt->apply == apply_before
|
||||
&& X(tensor_min_ostride)(p->sz) <= 2
|
||||
&& X(tensor_min_istride)(p->sz) > 2)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
if (!applicable0(ego_, p_, plnr)) return 0;
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
if (NO_INDIRECT_OP_P(plnr) && p->ri != p->ro) return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Build an indirect plan, or return 0.  The copy child is planned
   first, then the transform child (with NO_BUFFERING passed to
   X(mkplan_f_d)); if either fails, whatever was created is
   destroyed. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const S *ego = (const S *) ego_;
     const problem_dft *p = (const problem_dft *) p_;
     plan *cldcpy = 0, *cld = 0;
     P *pln;

     if (!applicable(ego_, p_, plnr))
          return (plan *) 0;

     /* rank-0 problem over all dimensions: a pure rearrangement */
     cldcpy =
          X(mkplan_d)(plnr,
                      X(mkproblem_dft_d)(X(mktensor_0d)(),
                                         X(tensor_append)(p->vecsz, p->sz),
                                         p->ri, p->ii, p->ro, p->io));
     if (!cldcpy)
          goto nada;

     /* the flavour decides which arrays the transform runs on */
     cld = X(mkplan_f_d)(plnr, ego->adt->mkcld(p), NO_BUFFERING, 0, 0);
     if (!cld)
          goto nada;

     pln = MKPLAN_DFT(P, &padt, ego->adt->apply);
     pln->slv = ego;
     pln->cldcpy = cldcpy;
     pln->cld = cld;
     X(ops_add)(&cld->ops, &cldcpy->ops, &pln->super.super.ops);

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cld);
     X(plan_destroy_internal)(cldcpy);
     return (plan *)0;
}
|
||||
|
||||
static solver *mksolver(const ndrct_adt *adt)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->adt = adt;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register both flavours, "before" first and "after" second. */
void X(dft_indirect_register)(planner *p)
{
     static const ndrct_adt *const adts[] = {
          &adt_before, &adt_after
     };
     const unsigned nadts = sizeof(adts) / sizeof(adts[0]);
     unsigned k;

     for (k = 0; k < nadts; ++k)
          REGISTER_SOLVER(p, mksolver(adts[k]));
}
|
||||
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/ct.h"
|
||||
|
||||
/* Register a twiddle codelet through X(regsolver_ct_directw),
   passing DECDIF as the decimation argument. */
void X(kdft_dif_register)(planner *p, kdftw codelet, const ct_desc *desc)
{
     X(regsolver_ct_directw)(p,
                             codelet,
                             desc,
                             DECDIF);
}
|
||||
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/ct.h"
|
||||
|
||||
/*
 * Register the "sq" twiddle codelet `k` (described by `desc`) with
 * planner `p`.  Forwards to X(regsolver_ct_directwsq), selecting the
 * DECDIF variant.
 */
void X(kdft_difsq_register)(planner *p, kdftwsq k, const ct_desc *desc)
{
     X(regsolver_ct_directwsq)(p, k, desc, DECDIF);
}
|
||||
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/ct.h"
|
||||
|
||||
/*
 * Register the twiddle codelet `codelet` (described by `desc`) with
 * planner `p`.  Same wrapper as the DIF registration routine, except
 * that X(regsolver_ct_directw) is told to use DECDIT.
 */
void X(kdft_dit_register)(planner *p, kdftw codelet, const ct_desc *desc)
{
     X(regsolver_ct_directw)(p, codelet, desc, DECDIT);
}
|
||||
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
/*
 * Register a DFT codelet with planner `p`.
 *
 * Two solvers are built from the same (codelet, desc) pair and
 * registered in this order:
 *   1. the one made by X(mksolver_dft_direct);
 *   2. the one made by X(mksolver_dft_directbuf).
 */
void X(kdft_register)(planner *p, kdft codelet, const kdft_desc *desc)
{
     REGISTER_SOLVER(p, X(mksolver_dft_direct)(codelet, desc));
     REGISTER_SOLVER(p, X(mksolver_dft_directbuf)(codelet, desc));
}
|
||||
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* plans for vrank -infty DFTs (nothing to do) */
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
/*
 * Apply routine of the no-op plan.  There is nothing to compute, so
 * the body only marks every parameter as unused.
 */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     UNUSED(ego_);
     UNUSED(ri); UNUSED(ii);
     UNUSED(ro); UNUSED(io);
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_)
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
|
||||
UNUSED(ego_);
|
||||
|
||||
return 0
|
||||
/* case 1 : -infty vector rank */
|
||||
|| (!FINITE_RNK(p->vecsz->rnk))
|
||||
|
||||
/* case 2 : rank-0 in-place dft */
|
||||
|| (1
|
||||
&& p->sz->rnk == 0
|
||||
&& FINITE_RNK(p->vecsz->rnk)
|
||||
&& p->ro == p->ri
|
||||
&& X(tensor_inplace_strides)(p->vecsz)
|
||||
);
|
||||
}
|
||||
|
||||
/* Print the fixed tag "(dft-nop)"; the plan itself carries no state
   worth printing. */
static void print(const plan *ego, printer *p)
{
     UNUSED(ego);
     p->print(p, "(dft-nop)");
}
|
||||
|
||||
/*
 * Plan constructor of the no-op solver.
 *
 * Returns a null plan pointer when applicable() rejects the problem.
 * Otherwise it creates a plan_dft whose apply routine is the no-op
 * apply() above and whose operation counts are zeroed.  The planner
 * argument is not consulted.
 */
static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
{
     static const plan_adt nop_adt = {
          X(dft_solve), X(null_awake), print, X(plan_null_destroy)
     };
     plan_dft *nop;

     UNUSED(plnr);

     if (!applicable(ego, p))
          return (plan *) 0;

     nop = MKPLAN_DFT(plan_dft, &nop_adt, apply);
     X(ops_zero)(&nop->super.ops);

     return &(nop->super);
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
return MKSOLVER(solver, &sadt);
|
||||
}
|
||||
|
||||
/* Register one freshly created no-op solver with planner `p`. */
void X(dft_nop_register)(planner *p)
{
     REGISTER_SOLVER(p, mksolver());
}
|
||||
@@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
/*
 * Create a DFT plan.
 *
 * `size` and `adt` are handed unchanged to X(mkplan); the result is
 * treated as a plan_dft, its `apply` slot is set to `apply`, and a
 * pointer to the embedded base plan is returned.
 */
plan *X(mkplan_dft)(size_t size, const plan_adt *adt, dftapply apply)
{
     plan_dft *pln = (plan_dft *) X(mkplan)(size, adt);

     pln->apply = apply;
     return &(pln->super);
}
|
||||
@@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/dft.h"
|
||||
#include <stddef.h>
|
||||
|
||||
/* Release a DFT problem: destroy its two tensors (vecsz and sz), then
   free the problem record itself. */
static void destroy(problem *ego_)
{
     problem_dft *prob = (problem_dft *) ego_;

     X(tensor_destroy2)(prob->vecsz, prob->sz);
     X(ifree)(ego_);
}
|
||||
|
||||
/*
 * Feed the identifying features of a DFT problem into md5 state `m`.
 *
 * The items are hashed in this fixed order: the tag "dft", the
 * in-place flag (ri == ro), the pointer offsets ii - ri and io - ro,
 * the alignment of each of the four pointers, and finally the sz and
 * vecsz tensors.
 */
static void hash(const problem *p_, md5 *m)
{
     const problem_dft *prob = (const problem_dft *) p_;

     X(md5puts)(m, "dft");

     /* in-place flag and real/imaginary pointer offsets */
     X(md5int)(m, prob->ri == prob->ro);
     X(md5INT)(m, prob->ii - prob->ri);
     X(md5INT)(m, prob->io - prob->ro);

     /* alignment of every array pointer */
     X(md5int)(m, X(ialignment_of)(prob->ri));
     X(md5int)(m, X(ialignment_of)(prob->ii));
     X(md5int)(m, X(ialignment_of)(prob->ro));
     X(md5int)(m, X(ialignment_of)(prob->io));

     /* transform and vector tensors */
     X(tensor_md5)(m, prob->sz);
     X(tensor_md5)(m, prob->vecsz);
}
|
||||
|
||||
/*
 * Print a DFT problem through printer `p`.
 *
 * The arguments follow the format string in order: in-place flag,
 * alignment of ri, alignment of ro, the offsets ii - ri and io - ro
 * (cast to INT), then the sz and vecsz tensors.
 */
static void print(const problem *ego_, printer *p)
{
     const problem_dft *prob = (const problem_dft *) ego_;

     p->print(p, "(dft %d %d %d %D %D %T %T)",
              prob->ri == prob->ro,
              X(ialignment_of)(prob->ri),
              X(ialignment_of)(prob->ro),
              (INT)(prob->ii - prob->ri),
              (INT)(prob->io - prob->ro),
              prob->sz,
              prob->vecsz);
}
|
||||
|
||||
static void zero(const problem *ego_)
|
||||
{
|
||||
const problem_dft *ego = (const problem_dft *) ego_;
|
||||
tensor *sz = X(tensor_append)(ego->vecsz, ego->sz);
|
||||
X(dft_zerotens)(sz, UNTAINT(ego->ri), UNTAINT(ego->ii));
|
||||
X(tensor_destroy)(sz);
|
||||
}
|
||||
|
||||
static const problem_adt padt =
|
||||
{
|
||||
PROBLEM_DFT,
|
||||
hash,
|
||||
zero,
|
||||
print,
|
||||
destroy
|
||||
};
|
||||
|
||||
/*
 * Create a DFT problem from transform tensor `sz`, vector tensor
 * `vecsz`, input pointers (ri, ii) and output pointers (ro, io).
 *
 * The input tensors are not consumed: the problem stores compressed
 * copies (X(tensor_compress) for sz, X(tensor_compress_contiguous)
 * for vecsz).
 *
 * Returns the result of X(mkproblem_unsolvable)() when only one of
 * the real/imaginary pointer pairs is in place, or when both are in
 * place but X(tensor_inplace_locations)(sz, vecsz) fails.
 */
problem *X(mkproblem_dft)(const tensor *sz, const tensor *vecsz,
                          R *ri, R *ii, R *ro, R *io)
{
     problem_dft *prob;

     /* enforce pointer equality if untainted pointers are equal */
     if (UNTAINT(ri) == UNTAINT(ro))
          ri = ro = JOIN_TAINT(ri, ro);
     if (UNTAINT(ii) == UNTAINT(io))
          ii = io = JOIN_TAINT(ii, io);

     /* more correctness conditions: */
     A(TAINTOF(ri) == TAINTOF(ii));
     A(TAINTOF(ro) == TAINTOF(io));

     A(X(tensor_kosherp)(sz));
     A(X(tensor_kosherp)(vecsz));

     /* If either real or imag pointers are in place, both must be,
        and the tensors must describe in-place locations. */
     if ((ri == ro || ii == io)
         && (ri != ro || ii != io
             || !X(tensor_inplace_locations)(sz, vecsz)))
          return X(mkproblem_unsolvable)();

     prob = (problem_dft *)X(mkproblem)(sizeof(problem_dft), &padt);

     prob->sz = X(tensor_compress)(sz);
     prob->vecsz = X(tensor_compress_contiguous)(vecsz);
     prob->ri = ri;
     prob->ii = ii;
     prob->ro = ro;
     prob->io = io;

     A(FINITE_RNK(prob->sz->rnk));
     return &(prob->super);
}
|
||||
|
||||
/* Same as X(mkproblem_dft), but also destroy input tensors. */
|
||||
problem *X(mkproblem_dft_d)(tensor *sz, tensor *vecsz,
|
||||
R *ri, R *ii, R *ro, R *io)
|
||||
{
|
||||
problem *p = X(mkproblem_dft)(sz, vecsz, ri, ii, ro, io);
|
||||
X(tensor_destroy2)(vecsz, sz);
|
||||
return p;
|
||||
}
|
||||
@@ -0,0 +1,327 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
/*
|
||||
* Compute transforms of prime sizes using Rader's trick: turn them
|
||||
* into convolutions of size n - 1, which you then perform via a pair
|
||||
* of FFTs.
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
|
||||
plan *cld1, *cld2;
|
||||
R *omega;
|
||||
INT n, g, ginv;
|
||||
INT is, os;
|
||||
plan *cld_omega;
|
||||
} P;
|
||||
|
||||
/* File-wide list of omega tables: searched and extended by mkomega(),
   and handed to X(rader_tl_delete) by free_omega(). */
static rader_tl *omegas = 0;
|
||||
|
||||
/*
 * Return the omega table for size `n` and inverse generator `ginv`.
 *
 * If X(rader_tl_find) already knows a table for (n, n, ginv) in the
 * `omegas` list, that table is returned.  Otherwise a new array of
 * 2 * (n - 1) reals is allocated and filled: entry k holds the pair
 * produced by the trig generator for index ginv^k mod n, divided by
 * n - 1, with the second component multiplied by FFT_SIGN.  The plan
 * `p_` is then applied to the array in place and the result is
 * inserted into `omegas`.
 */
static R *mkomega(enum wakefulness wakefulness, plan *p_, INT n, INT ginv)
{
     plan_dft *p = (plan_dft *) p_;
     R *omega;
     INT k, gpower;
     trigreal scale;
     triggen *gen;

     /* reuse a previously built table when there is one */
     omega = X(rader_tl_find)(n, n, ginv, omegas);
     if (omega)
          return omega;

     omega = (R *)MALLOC(sizeof(R) * (n - 1) * 2, TWIDDLES);

     scale = n - 1.0; /* normalization for convolution */

     gen = X(mktriggen)(wakefulness, n);
     gpower = 1;
     for (k = 0; k < n-1; ++k) {
          trigreal w[2];
          gen->cexpl(gen, gpower, w);
          omega[2*k] = w[0] / scale;
          omega[2*k+1] = FFT_SIGN * w[1] / scale;
          gpower = MULMOD(gpower, ginv, n);
     }
     X(triggen_destroy)(gen);
     A(gpower == 1); /* the powers of ginv have cycled back to 1 */

     /* transform the table in place (interleaved re/im, hence +1) */
     p->apply(p_, omega, omega + 1, omega, omega + 1);

     X(rader_tl_insert)(n, n, ginv, omega, &omegas);
     return omega;
}
|
||||
|
||||
/* Give an omega table back: delegates to X(rader_tl_delete) on the
   file-wide `omegas` list. */
static void free_omega(R *omega)
{
     X(rader_tl_delete)(omega, &omegas);
}
|
||||
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
/* Below, we extensively use the identity that fft(x*)* = ifft(x) in
|
||||
order to share data between forward and backward transforms and to
|
||||
obviate the necessity of having separate forward and backward
|
||||
plans. (Although we often compute separate plans these days anyway
|
||||
due to the differing strides, etcetera.)
|
||||
|
||||
Of course, since the new FFTW gives us separate pointers to
|
||||
the real and imaginary parts, we could have instead used the
|
||||
fft(r,i) = ifft(i,r) form of this identity, but it was easier to
|
||||
reuse the code from our old version. */
|
||||
|
||||
/*
 * Execute the Rader plan on input (ri, ii), writing output (ro, io).
 *
 * Steps, in order:
 *   1. copy inputs 1..n-1 into a scratch buffer, permuted by powers
 *      of g;
 *   2. run cld1 from the buffer into outputs 1..n-1;
 *   3. set output 0 to input 0 plus output 1;
 *   4. multiply outputs 1..n-1 pointwise by omega, conjugating the
 *      product;
 *   5. add input 0 (conjugated) to output 1;
 *   6. run cld2 from outputs 1..n-1 back into the buffer;
 *   7. scatter the buffer to the outputs, permuted by powers of ginv
 *      and conjugated again.
 */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const INT r = ego->n;
     const INT is = ego->is, os = ego->os;
     const INT g = ego->g, ginv = ego->ginv;
     const R *omega = ego->omega;
     plan_dft *fwd = (plan_dft *) ego->cld1;
     plan_dft *bwd = (plan_dft *) ego->cld2;
     INT k, gpower;
     R *buf;
     /* saved up front: both are still needed after ro/io have been
        written */
     R r0 = ri[0], i0 = ii[0];

     buf = (R *) MALLOC(sizeof(R) * (r - 1) * 2, BUFFERS);

     /* First, permute the input, storing in buf: */
     gpower = 1;
     for (k = 0; k < r - 1; ++k) {
          buf[2*k] = ri[gpower * is];
          buf[2*k + 1] = ii[gpower * is];
          gpower = MULMOD(gpower, g, r);
     }
     /* gpower == g^(r-1) mod r == 1 */

     /* compute DFT of buf, storing in output (except DC): */
     fwd->apply(ego->cld1, buf, buf+1, ro+os, io+os);

     /* set output DC component: */
     ro[0] = r0 + ro[os];
     io[0] = i0 + io[os];

     /* now, multiply by omega: */
     for (k = 0; k < r - 1; ++k) {
          E rW = omega[2*k];
          E iW = omega[2*k+1];
          E rB = ro[(k+1)*os];
          E iB = io[(k+1)*os];
          ro[(k+1)*os] = rW * rB - iW * iB;
          io[(k+1)*os] = -(rW * iB + iW * rB);
     }

     /* this will add input[0] to all of the outputs after the ifft */
     ro[os] += r0;
     io[os] -= i0;

     /* inverse FFT: */
     bwd->apply(ego->cld2, ro+os, io+os, buf, buf+1);

     /* finally, do inverse permutation to unshuffle the output: */
     gpower = 1;
     for (k = 0; k < r - 1; ++k) {
          ro[gpower * os] = buf[2*k];
          io[gpower * os] = -buf[2*k+1];
          gpower = MULMOD(gpower, ginv, r);
     }
     A(gpower == 1);

     X(ifree)(buf);
}
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
/*
 * Change the wakefulness of the Rader plan.
 *
 * The three child plans are always forwarded the new state first.
 * For SLEEPY the omega table is released and the pointer cleared.
 * For every other state the generator g and its inverse ginv are
 * recomputed for n, and the omega table is obtained from mkomega().
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *ego = (P *) ego_;

     X(plan_awake)(ego->cld1, wakefulness);
     X(plan_awake)(ego->cld2, wakefulness);
     X(plan_awake)(ego->cld_omega, wakefulness);

     if (wakefulness == SLEEPY) {
          free_omega(ego->omega);
          ego->omega = 0;
          return;
     }

     ego->g = X(find_generator)(ego->n);
     ego->ginv = X(power_mod)(ego->g, ego->n - 2, ego->n);
     A(MULMOD(ego->g, ego->ginv, ego->n) == 1);

     ego->omega = mkomega(wakefulness,
                          ego->cld_omega, ego->n, ego->ginv);
}
|
||||
|
||||
/* Destroy the three child plans (cld_omega, cld2, cld1, in that
   order).  The omega table is not handled here; awake() releases it
   when the plan is put to sleep. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld_omega);
     X(plan_destroy_internal)(pln->cld2);
     X(plan_destroy_internal)(pln->cld1);
}
|
||||
|
||||
/*
 * Print the Rader plan: size, strides and cld1 always; cld2 only if
 * it is a different plan object from cld1; cld_omega only if it
 * differs from both.  A closing parenthesis ends the record.
 */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *)ego_;

     p->print(p, "(dft-rader-%D%ois=%oos=%(%p%)",
              pln->n, pln->is, pln->os, pln->cld1);

     if (pln->cld2 != pln->cld1)
          p->print(p, "%(%p%)", pln->cld2);

     if (pln->cld_omega != pln->cld1 && pln->cld_omega != pln->cld2)
          p->print(p, "%(%p%)", pln->cld_omega);

     p->putchr(p, ')');
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
UNUSED(ego_);
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk == 0
|
||||
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
|
||||
&& X(is_prime)(p->sz->dims[0].n)
|
||||
|
||||
/* proclaim the solver SLOW if p-1 is not easily factorizable.
|
||||
Bluestein should take care of this case. */
|
||||
&& CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
|
||||
);
|
||||
}
|
||||
|
||||
static int mkP(P *pln, INT n, INT is, INT os, R *ro, R *io,
|
||||
planner *plnr)
|
||||
{
|
||||
plan *cld1 = (plan *) 0;
|
||||
plan *cld2 = (plan *) 0;
|
||||
plan *cld_omega = (plan *) 0;
|
||||
R *buf = (R *) 0;
|
||||
|
||||
/* initial allocation for the purpose of planning */
|
||||
buf = (R *) MALLOC(sizeof(R) * (n - 1) * 2, BUFFERS);
|
||||
|
||||
cld1 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, 2, os),
|
||||
X(mktensor_1d)(1, 0, 0),
|
||||
buf, buf + 1, ro + os, io + os),
|
||||
NO_SLOW, 0, 0);
|
||||
if (!cld1) goto nada;
|
||||
|
||||
cld2 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, os, 2),
|
||||
X(mktensor_1d)(1, 0, 0),
|
||||
ro + os, io + os, buf, buf + 1),
|
||||
NO_SLOW, 0, 0);
|
||||
|
||||
if (!cld2) goto nada;
|
||||
|
||||
/* plan for omega array */
|
||||
cld_omega = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, 2, 2),
|
||||
X(mktensor_1d)(1, 0, 0),
|
||||
buf, buf + 1, buf, buf + 1),
|
||||
NO_SLOW, ESTIMATE, 0);
|
||||
if (!cld_omega) goto nada;
|
||||
|
||||
/* deallocate buffers; let awake() or apply() allocate them for real */
|
||||
X(ifree)(buf);
|
||||
buf = 0;
|
||||
|
||||
pln->cld1 = cld1;
|
||||
pln->cld2 = cld2;
|
||||
pln->cld_omega = cld_omega;
|
||||
pln->omega = 0;
|
||||
pln->n = n;
|
||||
pln->is = is;
|
||||
pln->os = os;
|
||||
|
||||
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
|
||||
pln->super.super.ops.other += (n - 1) * (4 * 2 + 6) + 6;
|
||||
pln->super.super.ops.add += (n - 1) * 2 + 4;
|
||||
pln->super.super.ops.mul += (n - 1) * 4;
|
||||
|
||||
return 1;
|
||||
|
||||
nada:
|
||||
X(ifree0)(buf);
|
||||
X(plan_destroy_internal)(cld_omega);
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Plan constructor of the Rader solver.
 *
 * Returns a null plan pointer if applicable() rejects the problem or
 * if mkP() cannot build the child plans (the half-built plan record
 * is freed in that case).  Size and strides are taken from the first
 * (only) dimension of the transform tensor.
 */
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
     static const plan_adt rader_adt = {
          X(dft_solve), awake, print, destroy
     };
     const problem_dft *p = (const problem_dft *) p_;
     const iodim *dim;
     P *pln;

     if (!applicable(ego, p_, plnr))
          return (plan *) 0;

     dim = p->sz->dims;

     pln = MKPLAN_DFT(P, &rader_adt, apply);
     if (!mkP(pln, dim[0].n, dim[0].is, dim[0].os, p->ro, p->io, plnr)) {
          X(ifree)(pln);
          return (plan *) 0;
     }
     return &(pln->super.super);
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register one freshly created Rader solver with planner `p`. */
void X(dft_rader_register)(planner *p)
{
     REGISTER_SOLVER(p, mksolver());
}
|
||||
@@ -0,0 +1,202 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* plans for DFT of rank >= 2 (multidimensional) */
|
||||
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int spltrnk;
|
||||
const int *buddies;
|
||||
size_t nbuddies;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
|
||||
plan *cld1, *cld2;
|
||||
const S *solver;
|
||||
} P;
|
||||
|
||||
/* Compute multi-dimensional DFT by applying the two cld plans
|
||||
(lower-rnk DFTs). */
|
||||
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_dft *cld1, *cld2;
|
||||
|
||||
cld1 = (plan_dft *) ego->cld1;
|
||||
cld1->apply(ego->cld1, ri, ii, ro, io);
|
||||
|
||||
cld2 = (plan_dft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, ro, io, ro, io);
|
||||
}
|
||||
|
||||
|
||||
/* Forward the wakefulness change to both child plans (cld1 first). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cld1, wakefulness);
     X(plan_awake)(pln->cld2, wakefulness);
}
|
||||
|
||||
/* Destroy both child plans (cld2 first, then cld1). */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld2);
     X(plan_destroy_internal)(pln->cld1);
}
|
||||
|
||||
/* Print the plan: the owning solver's spltrnk followed by the two
   child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;

     p->print(p, "(dft-rank>=2/%d%(%p%)%(%p%))",
              pln->solver->spltrnk, pln->cld1, pln->cld2);
}
|
||||
|
||||
static int picksplit(const S *ego, const tensor *sz, int *rp)
|
||||
{
|
||||
A(sz->rnk > 1); /* cannot split rnk <= 1 */
|
||||
if (!X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies, sz, 1, rp))
|
||||
return 0;
|
||||
*rp += 1; /* convert from dim. index to rank */
|
||||
if (*rp >= sz->rnk) /* split must reduce rank */
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int applicable0(const solver *ego_, const problem *p_, int *rp)
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
const S *ego = (const S *)ego_;
|
||||
return (1
|
||||
&& FINITE_RNK(p->sz->rnk) && FINITE_RNK(p->vecsz->rnk)
|
||||
&& p->sz->rnk >= 2
|
||||
&& picksplit(ego, p->sz, rp)
|
||||
);
|
||||
}
|
||||
|
||||
/* TODO: revise this. */
|
||||
static int applicable(const solver *ego_, const problem *p_,
|
||||
const planner *plnr, int *rp)
|
||||
{
|
||||
const S *ego = (const S *)ego_;
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
|
||||
if (!applicable0(ego_, p_, rp)) return 0;
|
||||
|
||||
if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0])) return 0;
|
||||
|
||||
/* Heuristic: if the vector stride is greater than the transform
|
||||
sz, don't use (prefer to do the vector loop first with a
|
||||
vrank-geq1 plan). */
|
||||
if (NO_UGLYP(plnr))
|
||||
if (p->vecsz->rnk > 0 &&
|
||||
X(tensor_min_stride)(p->vecsz) > X(tensor_max_index)(p->sz))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Plan constructor of the rank >= 2 solver.
 *
 * The transform tensor is split at the rank chosen by applicable()
 * into sz1 and sz2.  Two child plans are then requested:
 *   cld1: transform sz2, vector loop over (vecsz, sz1),
 *         from the input arrays to the output arrays;
 *   cld2: transform sz1 (with in-place output strides), vector loop
 *         over in-place copies of (vecsz, sz2), entirely within the
 *         output arrays.
 *
 * Returns a null plan pointer if the solver is not applicable or if
 * either child cannot be planned.  The four temporary tensors are
 * destroyed on every path that created them.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt rank_adt = {
          X(dft_solve), awake, print, destroy
     };
     const S *ego = (const S *) ego_;
     const problem_dft *p;
     P *pln;
     plan *first = 0, *second = 0;
     tensor *sz1, *sz2, *vecszi, *sz2i;
     int spltrnk;

     if (!applicable(ego_, p_, plnr, &spltrnk))
          return (plan *) 0;

     p = (const problem_dft *) p_;
     X(tensor_split)(p->sz, &sz1, spltrnk, &sz2);
     vecszi = X(tensor_copy_inplace)(p->vecsz, INPLACE_OS);
     sz2i = X(tensor_copy_inplace)(sz2, INPLACE_OS);

     first = X(mkplan_d)(plnr,
                         X(mkproblem_dft_d)(X(tensor_copy)(sz2),
                                            X(tensor_append)(p->vecsz, sz1),
                                            p->ri, p->ii, p->ro, p->io));
     if (!first) goto nada;

     second = X(mkplan_d)(plnr,
                          X(mkproblem_dft_d)(
                               X(tensor_copy_inplace)(sz1, INPLACE_OS),
                               X(tensor_append)(vecszi, sz2i),
                               p->ro, p->io, p->ro, p->io));
     if (!second) goto nada;

     pln = MKPLAN_DFT(P, &rank_adt, apply);

     pln->cld1 = first;
     pln->cld2 = second;
     pln->solver = ego;

     /* cost is simply the sum of the two children */
     X(ops_add)(&first->ops, &second->ops, &pln->super.super.ops);

     X(tensor_destroy4)(sz1, sz2, vecszi, sz2i);

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(second);
     X(plan_destroy_internal)(first);
     X(tensor_destroy4)(sz1, sz2, vecszi, sz2i);
     return (plan *) 0;
}
|
||||
|
||||
static solver *mksolver(int spltrnk, const int *buddies, size_t nbuddies)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->spltrnk = spltrnk;
|
||||
slv->buddies = buddies;
|
||||
slv->nbuddies = nbuddies;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register one rank >= 2 solver per entry of the static buddies
   table; every solver gets a pointer to the whole table. */
void X(dft_rank_geq2_register)(planner *p)
{
     static const int buddies[] = { 1, 0, -2 };
     const size_t count = NELEM(buddies);
     size_t idx;

     for (idx = 0; idx < count; ++idx)
          REGISTER_SOLVER(p, mksolver(buddies[idx], buddies, count));

     /* FIXME:

        Should we try more buddies?

        Another possible variant is to swap cld1 and cld2 (or rather,
        to swap their problems; they are not interchangeable because
        cld2 must be in-place).  In past versions of FFTW, however, I
        seem to recall that such rearrangements have made little or no
        difference.
     */
}
|
||||
@@ -0,0 +1,6 @@
|
||||
# Automake input for the scalar DFT directory.

# Let sources include headers relative to the top of the source tree.
AM_CPPFLAGS = -I $(top_srcdir)

# Recurse into the codelets subdirectory.
SUBDIRS = codelets

# Libtool library that is built but not installed.
noinst_LTLIBRARIES = libdft_scalar.la

# Sources and headers that make up libdft_scalar.la.
libdft_scalar_la_SOURCES = n.c t.c f.h n.h q.h t.h
|
||||
|
||||
@@ -0,0 +1,757 @@
|
||||
# Makefile.in generated by automake 1.16.3 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
|
||||
VPATH = @srcdir@
|
||||
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
subdir = dft/scalar
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
|
||||
$(top_srcdir)/m4/acx_pthread.m4 \
|
||||
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
|
||||
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
|
||||
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_version.m4 \
|
||||
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
|
||||
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
|
||||
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
LTLIBRARIES = $(noinst_LTLIBRARIES)
|
||||
libdft_scalar_la_LIBADD =
|
||||
am_libdft_scalar_la_OBJECTS = n.lo t.lo
|
||||
libdft_scalar_la_OBJECTS = $(am_libdft_scalar_la_OBJECTS)
|
||||
AM_V_lt = $(am__v_lt_@AM_V@)
|
||||
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
|
||||
am__v_lt_0 = --silent
|
||||
am__v_lt_1 =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
|
||||
depcomp = $(SHELL) $(top_srcdir)/depcomp
|
||||
am__maybe_remake_depfiles = depfiles
|
||||
am__depfiles_remade = ./$(DEPDIR)/n.Plo ./$(DEPDIR)/t.Plo
|
||||
am__mv = mv -f
|
||||
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
|
||||
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
|
||||
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
|
||||
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
|
||||
$(AM_CFLAGS) $(CFLAGS)
|
||||
AM_V_CC = $(am__v_CC_@AM_V@)
|
||||
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
|
||||
am__v_CC_0 = @echo " CC " $@;
|
||||
am__v_CC_1 =
|
||||
CCLD = $(CC)
|
||||
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
$(AM_LDFLAGS) $(LDFLAGS) -o $@
|
||||
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
|
||||
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
|
||||
am__v_CCLD_0 = @echo " CCLD " $@;
|
||||
am__v_CCLD_1 =
|
||||
SOURCES = $(libdft_scalar_la_SOURCES)
|
||||
DIST_SOURCES = $(libdft_scalar_la_SOURCES)
|
||||
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
|
||||
ctags-recursive dvi-recursive html-recursive info-recursive \
|
||||
install-data-recursive install-dvi-recursive \
|
||||
install-exec-recursive install-html-recursive \
|
||||
install-info-recursive install-pdf-recursive \
|
||||
install-ps-recursive install-recursive installcheck-recursive \
|
||||
installdirs-recursive pdf-recursive ps-recursive \
|
||||
tags-recursive uninstall-recursive
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
|
||||
distclean-recursive maintainer-clean-recursive
|
||||
am__recursive_targets = \
|
||||
$(RECURSIVE_TARGETS) \
|
||||
$(RECURSIVE_CLEAN_TARGETS) \
|
||||
$(am__extra_recursive_targets)
|
||||
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
|
||||
distdir distdir-am
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
# Read a list of newline-separated strings from the standard input,
|
||||
# and print each of them once, without duplicates. Input order is
|
||||
# *not* preserved.
|
||||
am__uniquify_input = $(AWK) '\
|
||||
BEGIN { nonempty = 0; } \
|
||||
{ items[$$0] = 1; nonempty = 1; } \
|
||||
END { if (nonempty) { for (i in items) print i; }; } \
|
||||
'
|
||||
# Make sure the list of sources is unique. This is necessary because,
|
||||
# e.g., the same source file might be shared among _SOURCES variables
|
||||
# for different programs/libraries.
|
||||
am__define_uniq_tagged_files = \
|
||||
list='$(am__tagged_files)'; \
|
||||
unique=`for i in $$list; do \
|
||||
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
|
||||
done | $(am__uniquify_input)`
|
||||
ETAGS = etags
|
||||
CTAGS = ctags
|
||||
DIST_SUBDIRS = $(SUBDIRS)
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
am__relativize = \
|
||||
dir0=`pwd`; \
|
||||
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
|
||||
sed_rest='s,^[^/]*/*,,'; \
|
||||
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
|
||||
sed_butlast='s,/*[^/]*$$,,'; \
|
||||
while test -n "$$dir1"; do \
|
||||
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
|
||||
if test "$$first" != "."; then \
|
||||
if test "$$first" = ".."; then \
|
||||
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
|
||||
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
|
||||
else \
|
||||
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
|
||||
if test "$$first2" = "$$first"; then \
|
||||
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
|
||||
else \
|
||||
dir2="../$$dir2"; \
|
||||
fi; \
|
||||
dir0="$$dir0"/"$$first"; \
|
||||
fi; \
|
||||
fi; \
|
||||
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
|
||||
done; \
|
||||
reldir="$$dir2"
|
||||
ACLOCAL = @ACLOCAL@
|
||||
ALLOCA = @ALLOCA@
|
||||
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AS = @AS@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AVX2_CFLAGS = @AVX2_CFLAGS@
|
||||
AVX512_CFLAGS = @AVX512_CFLAGS@
|
||||
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
|
||||
AVX_CFLAGS = @AVX_CFLAGS@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CHECK_PL_OPTS = @CHECK_PL_OPTS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
|
||||
C_MPI_FINT = @C_MPI_FINT@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
F77 = @F77@
|
||||
FFLAGS = @FFLAGS@
|
||||
FGREP = @FGREP@
|
||||
FLIBS = @FLIBS@
|
||||
GREP = @GREP@
|
||||
INDENT = @INDENT@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
KCVI_CFLAGS = @KCVI_CFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBQUADMATH = @LIBQUADMATH@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
|
||||
MAINT = @MAINT@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
MPICC = @MPICC@
|
||||
MPILIBS = @MPILIBS@
|
||||
MPIRUN = @MPIRUN@
|
||||
NEON_CFLAGS = @NEON_CFLAGS@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OCAMLBUILD = @OCAMLBUILD@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
POW_LIB = @POW_LIB@
|
||||
PRECISION = @PRECISION@
|
||||
PREC_SUFFIX = @PREC_SUFFIX@
|
||||
PTHREAD_CC = @PTHREAD_CC@
|
||||
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
|
||||
PTHREAD_LIBS = @PTHREAD_LIBS@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
|
||||
SHELL = @SHELL@
|
||||
SSE2_CFLAGS = @SSE2_CFLAGS@
|
||||
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
|
||||
STRIP = @STRIP@
|
||||
THREADLIBS = @THREADLIBS@
|
||||
VERSION = @VERSION@
|
||||
VSX_CFLAGS = @VSX_CFLAGS@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
ac_ct_F77 = @ac_ct_F77@
|
||||
acx_pthread_config = @acx_pthread_config@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
runstatedir = @runstatedir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
SUBDIRS = codelets
|
||||
noinst_LTLIBRARIES = libdft_scalar.la
|
||||
libdft_scalar_la_SOURCES = n.c t.c f.h n.h q.h t.h
|
||||
all: all-recursive
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .lo .o .obj
|
||||
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/scalar/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu dft/scalar/Makefile
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
|
||||
esac;
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
# Remove the libraries listed in noinst_LTLIBRARIES (skipped when the list
# is empty; the leading '-' ignores a failure of that command).  Then, for
# the directory of each library ('.' when the name has no slash), remove
# that directory's so_locations file; sort -u de-duplicates the paths.
clean-noinstLTLIBRARIES:
|
||||
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
|
||||
@list='$(noinst_LTLIBRARIES)'; \
|
||||
locs=`for p in $$list; do echo $$p; done | \
|
||||
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
|
||||
sort -u`; \
|
||||
test -z "$$locs" || { \
|
||||
echo rm -f $${locs}; \
|
||||
rm -f $${locs}; \
|
||||
}
|
||||
|
||||
# Link libdft_scalar.la from its objects (n.lo and t.lo) with the libtool
# link command held in LINK, appending libdft_scalar_la_LIBADD and LIBS.
libdft_scalar.la: $(libdft_scalar_la_OBJECTS) $(libdft_scalar_la_DEPENDENCIES) $(EXTRA_libdft_scalar_la_DEPENDENCIES)
|
||||
$(AM_V_CCLD)$(LINK) $(libdft_scalar_la_OBJECTS) $(libdft_scalar_la_LIBADD) $(LIBS)
|
||||
|
||||
mostlyclean-compile:
|
||||
-rm -f *.$(OBJEXT)
|
||||
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t.Plo@am__quote@ # am--include-marker
|
||||
|
||||
# Create a placeholder dependency file for each listed .Plo target: make
# sure its directory exists, write '# dummy' to a temporary file with a -t
# suffix, then move that file onto the target with am__mv (mv -f).
$(am__depfiles_remade):
|
||||
@$(MKDIR_P) $(@D)
|
||||
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
|
||||
|
||||
am--depfiles: $(am__depfiles_remade)
|
||||
|
||||
.c.o:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
|
||||
|
||||
.c.obj:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
|
||||
.c.lo:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
|
||||
|
||||
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
|
||||
# This directory's subdirectories are mostly independent; you can cd
|
||||
# into them and run 'make' without going through this Makefile.
|
||||
# To change the values of 'make' variables: instead of editing Makefiles,
|
||||
# (1) if the variable is set in 'config.status', edit 'config.status'
|
||||
# (which will cause the Makefiles to be regenerated when you run 'make');
|
||||
# (2) otherwise, pass the desired values on the 'make' command line.
|
||||
# Shared recipe for every *-recursive target.
#  - When make runs with the keep-going option (tested by
#    am__make_keepgoing), a failing sub-make only sets fail=yes; otherwise
#    the recipe exits with status 1 at the first failure.
#  - The target passed to the sub-makes is the current target name with
#    "-recursive" removed.
#  - distclean-* and maintainer-clean-* walk DIST_SUBDIRS; all other
#    targets walk SUBDIRS.
#  - A "." entry in the list runs the local "<target>-am" at that position;
#    if "." never appears, "<target>-am" runs in this directory after all
#    subdirectories.
#  - The closing test makes the rule fail when any sub-make failed in
#    keep-going mode.
$(am__recursive_targets):
|
||||
@fail=; \
|
||||
if $(am__make_keepgoing); then \
|
||||
failcom='fail=yes'; \
|
||||
else \
|
||||
failcom='exit 1'; \
|
||||
fi; \
|
||||
dot_seen=no; \
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
case "$@" in \
|
||||
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
|
||||
*) list='$(SUBDIRS)' ;; \
|
||||
esac; \
|
||||
for subdir in $$list; do \
|
||||
echo "Making $$target in $$subdir"; \
|
||||
if test "$$subdir" = "."; then \
|
||||
dot_seen=yes; \
|
||||
local_target="$$target-am"; \
|
||||
else \
|
||||
local_target="$$target"; \
|
||||
fi; \
|
||||
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|
||||
|| eval $$failcom; \
|
||||
done; \
|
||||
if test "$$dot_seen" = "no"; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
|
||||
fi; test -z "$$fail"
|
||||
|
||||
ID: $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); mkid -fID $$unique
|
||||
tags: tags-recursive
|
||||
TAGS: tags
|
||||
|
||||
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
set x; \
|
||||
here=`pwd`; \
|
||||
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
|
||||
include_option=--etags-include; \
|
||||
empty_fix=.; \
|
||||
else \
|
||||
include_option=--include; \
|
||||
empty_fix=; \
|
||||
fi; \
|
||||
list='$(SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
test ! -f $$subdir/TAGS || \
|
||||
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
|
||||
fi; \
|
||||
done; \
|
||||
$(am__define_uniq_tagged_files); \
|
||||
shift; \
|
||||
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
|
||||
test -n "$$unique" || unique=$$empty_fix; \
|
||||
if test $$# -gt 0; then \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
"$$@" $$unique; \
|
||||
else \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
$$unique; \
|
||||
fi; \
|
||||
fi
|
||||
ctags: ctags-recursive
|
||||
|
||||
CTAGS: ctags
|
||||
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); \
|
||||
test -z "$(CTAGS_ARGS)$$unique" \
|
||||
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
|
||||
$$unique
|
||||
|
||||
GTAGS:
|
||||
here=`$(am__cd) $(top_builddir) && pwd` \
|
||||
&& $(am__cd) $(top_srcdir) \
|
||||
&& gtags -i $(GTAGS_ARGS) "$$here"
|
||||
cscopelist: cscopelist-recursive
|
||||
|
||||
cscopelist-am: $(am__tagged_files)
|
||||
list='$(am__tagged_files)'; \
|
||||
case "$(srcdir)" in \
|
||||
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
|
||||
*) sdir=$(subdir)/$(srcdir) ;; \
|
||||
esac; \
|
||||
for i in $$list; do \
|
||||
if test -f "$$i"; then \
|
||||
echo "$(subdir)/$$i"; \
|
||||
else \
|
||||
echo "$$sdir/$$i"; \
|
||||
fi; \
|
||||
done >> $(top_builddir)/cscope.files
|
||||
|
||||
distclean-tags:
|
||||
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
|
||||
|
||||
distdir: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) distdir-am
|
||||
|
||||
distdir-am: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
$(am__make_dryrun) \
|
||||
|| test -d "$(distdir)/$$subdir" \
|
||||
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|
||||
|| exit 1; \
|
||||
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
|
||||
$(am__relativize); \
|
||||
new_distdir=$$reldir; \
|
||||
dir1=$$subdir; dir2="$(top_distdir)"; \
|
||||
$(am__relativize); \
|
||||
new_top_distdir=$$reldir; \
|
||||
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
|
||||
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
|
||||
($(am__cd) $$subdir && \
|
||||
$(MAKE) $(AM_MAKEFLAGS) \
|
||||
top_distdir="$$new_top_distdir" \
|
||||
distdir="$$new_distdir" \
|
||||
am__remove_distdir=: \
|
||||
am__skip_length_check=: \
|
||||
am__skip_mode_fix=: \
|
||||
distdir) \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: check-recursive
|
||||
all-am: Makefile $(LTLIBRARIES)
|
||||
installdirs: installdirs-recursive
|
||||
installdirs-am:
|
||||
install: install-recursive
|
||||
install-exec: install-exec-recursive
|
||||
install-data: install-data-recursive
|
||||
uninstall: uninstall-recursive
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-recursive
|
||||
# Re-run "make install" with INSTALL_PROGRAM and install_sh_PROGRAM both set
# to INSTALL_STRIP_PROGRAM and with INSTALL_STRIP_FLAG=-s.  When STRIP is
# non-empty it is additionally passed down as STRIPPROG through
# INSTALL_PROGRAM_ENV.
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
clean: clean-recursive
|
||||
|
||||
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
|
||||
mostlyclean-am
|
||||
|
||||
distclean: distclean-recursive
|
||||
-rm -f ./$(DEPDIR)/n.Plo
|
||||
-rm -f ./$(DEPDIR)/t.Plo
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-compile distclean-generic \
|
||||
distclean-tags
|
||||
|
||||
dvi: dvi-recursive
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-recursive
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-recursive
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am:
|
||||
|
||||
install-dvi: install-dvi-recursive
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am:
|
||||
|
||||
install-html: install-html-recursive
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-recursive
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-recursive
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-recursive
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-recursive
|
||||
-rm -f ./$(DEPDIR)/n.Plo
|
||||
-rm -f ./$(DEPDIR)/t.Plo
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic
|
||||
|
||||
mostlyclean: mostlyclean-recursive
|
||||
|
||||
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool
|
||||
|
||||
pdf: pdf-recursive
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-recursive
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am:
|
||||
|
||||
.MAKE: $(am__recursive_targets) install-am install-strip
|
||||
|
||||
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
|
||||
am--depfiles check check-am clean clean-generic clean-libtool \
|
||||
clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
|
||||
distclean-compile distclean-generic distclean-libtool \
|
||||
distclean-tags distdir dvi dvi-am html html-am info info-am \
|
||||
install install-am install-data install-data-am install-dvi \
|
||||
install-dvi-am install-exec install-exec-am install-html \
|
||||
install-html-am install-info install-info-am install-man \
|
||||
install-pdf install-pdf-am install-ps install-ps-am \
|
||||
install-strip installcheck installcheck-am installdirs \
|
||||
installdirs-am maintainer-clean maintainer-clean-generic \
|
||||
mostlyclean mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
|
||||
uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
@@ -0,0 +1,96 @@
|
||||
# This Makefile.am specifies a set of codelets, efficient transforms
|
||||
# of small sizes, that are used as building blocks (kernels) by FFTW
|
||||
# to build up large transforms, as well as the options for generating
|
||||
# and compiling them.
|
||||
|
||||
# You can customize FFTW for special needs, e.g. to handle certain
|
||||
# sizes more efficiently, by adding new codelets to the lists of those
|
||||
# included by default. If you change the list of codelets, any new
|
||||
# ones you added will be automatically generated when you run the
|
||||
# bootstrap script (see "Generating your own code" in the FFTW
|
||||
# manual).
|
||||
|
||||
###########################################################################
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
noinst_LTLIBRARIES = libdft_scalar_codelets.la
|
||||
|
||||
###########################################################################
|
||||
# n1_<n> is a hard-coded FFT of size <n> (base cases of FFT recursion)
|
||||
N1 = n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c n1_9.c n1_10.c \
|
||||
n1_11.c n1_12.c n1_13.c n1_14.c n1_15.c n1_16.c n1_32.c n1_64.c \
|
||||
n1_20.c n1_25.c # n1_30.c n1_40.c n1_50.c
|
||||
|
||||
###########################################################################
|
||||
# t1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
|
||||
T1 = t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c t1_7.c t1_8.c t1_9.c \
|
||||
t1_10.c t1_12.c t1_15.c t1_16.c t1_32.c t1_64.c \
|
||||
t1_20.c t1_25.c # t1_30.c t1_40.c t1_50.c
|
||||
|
||||
# t2_<r> is also a twiddle FFT, but instead of using a complete lookup table
|
||||
# of trig. functions, it partially generates the trig. values on the fly
|
||||
# (this is faster for large sizes).
|
||||
T2 = t2_4.c t2_8.c t2_16.c t2_32.c t2_64.c \
|
||||
t2_5.c t2_10.c t2_20.c t2_25.c
|
||||
|
||||
###########################################################################
|
||||
# The F (DIF) codelets are used for a kind of in-place transform algorithm,
|
||||
# but the planner seems to never (or hardly ever) use them on the machines
|
||||
# we have access to, preferring the Q codelets and the use of buffers
|
||||
# for sub-transforms. So, we comment them out, at least for now.
|
||||
|
||||
# f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step
|
||||
F1 = # f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
|
||||
|
||||
# like f1, but partially generates its trig. table on the fly
|
||||
F2 = # f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
|
||||
|
||||
###########################################################################
|
||||
# q1_<r> is <r> twiddle FFTs of size <r> (DIF step), where the output is
|
||||
# transposed. This is used for in-place transposes in sizes that are
|
||||
# divisible by <r>^2. These codelets have size ~ <r>^2, so you should
|
||||
# probably not use <r> bigger than 8 or so.
|
||||
Q1 = q1_2.c q1_4.c q1_8.c q1_3.c q1_5.c q1_6.c
|
||||
|
||||
###########################################################################
|
||||
# Every codelet source enabled above.  F1 and F2 are currently empty because
# their file lists are commented out.
ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
|
||||
# CODLIST is not assigned in this file; presumably it is defined by the
# included Makefile.codelets -- TODO confirm.
BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
|
||||
|
||||
# The codelet library is built from exactly the BUILT_SOURCES list.
libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
|
||||
|
||||
# SOLVTAB_NAME and XRENAME are not referenced in this file; presumably they
# are consumed by the included Makefile.codelets -- TODO confirm.
SOLVTAB_NAME = X(solvtab_dft_standard)
|
||||
XRENAME=X
|
||||
|
||||
# special rules for regenerating codelets.
|
||||
include $(top_srcdir)/support/Makefile.codelets
|
||||
|
||||
# Codelet regeneration: the flags and pattern rules below are only active
# when the MAINTAINER_MODE automake conditional is true.
if MAINTAINER_MODE
|
||||
# Generator flags per codelet family.  Every family starts from
# DFT_FLAGS_COMMON (not defined in this file); T2, F2 and Q2 add
# -twiddle-log3 -precompute-twiddles, and Q1 adds -reload-twiddle.
FLAGS_N1=$(DFT_FLAGS_COMMON)
|
||||
FLAGS_T1=$(DFT_FLAGS_COMMON)
|
||||
FLAGS_T2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
FLAGS_F1=$(DFT_FLAGS_COMMON)
|
||||
FLAGS_F2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
FLAGS_Q1=$(DFT_FLAGS_COMMON) -reload-twiddle
|
||||
FLAGS_Q2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
|
||||
# Each pattern rule below regenerates <family>_<n>.c.  The pattern stem is
# passed to the generator as the -n argument and as part of -name; the
# generator output is piped through ADD_DATE and INDENT and written to the
# target.
# NOTE(review): the pipeline redirects straight into the target, so a failing
# generator could leave a truncated file behind -- confirm whether the
# included Makefile.codelets guards against this.
#
# n1_<n>.c: produced by GEN_NOTW; the generated file includes dft/scalar/n.h.
n1_%.c: $(CODELET_DEPS) $(GEN_NOTW)
|
||||
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "dft/scalar/n.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# t1_<n>.c: produced by GEN_TWIDDLE; the generated file includes dft/scalar/t.h.
t1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# t2_<n>.c: same generator and header as t1, but with FLAGS_T2.
t2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# f1_<n>.c: GEN_TWIDDLE invoked with -dif; the generated file includes dft/scalar/f.h.
f1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# f2_<n>.c: same as f1, but with FLAGS_F2.
f2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# q1_<n>.c: GEN_TWIDSQ invoked with -dif; the generated file includes dft/scalar/q.h.
q1_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
|
||||
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# q2_<n>.c: same as q1, but with FLAGS_Q2.  No q2 codelets are listed in
# this file's codelet variables.
q2_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
|
||||
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
endif # MAINTAINER_MODE
|
||||
@@ -0,0 +1,994 @@
|
||||
# Makefile.in generated by automake 1.16.3 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
|
||||
# This Makefile.am specifies a set of codelets, efficient transforms
|
||||
# of small sizes, that are used as building blocks (kernels) by FFTW
|
||||
# to build up large transforms, as well as the options for generating
|
||||
# and compiling them.
|
||||
|
||||
# You can customize FFTW for special needs, e.g. to handle certain
|
||||
# sizes more efficiently, by adding new codelets to the lists of those
|
||||
# included by default. If you change the list of codelets, any new
|
||||
# ones you added will be automatically generated when you run the
|
||||
# bootstrap script (see "Generating your own code" in the FFTW
|
||||
# manual).
|
||||
|
||||
# -*- makefile -*-
|
||||
# This file contains special make rules to generate codelets.
|
||||
# Most of this file requires GNU make.
|
||||
|
||||
VPATH = @srcdir@
|
||||
# Shell snippet that succeeds when the running make looks like GNU make.
# It fails when $(MAKELEVEL) expands to nothing, succeeds when $(MAKE_HOST)
# is non-empty, succeeds when both $(MAKE_VERSION) and $(CURDIR) are
# non-empty, and fails otherwise.
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
# Shell snippet that succeeds when make was invoked with the single-letter
# option stored in the shell variable target_option.
#   * A target_option that is not exactly one character is reported as an
#     internal error on stderr and the snippet exits with status 1.
#   * The flag words come from $$MFLAGS when $(am__is_gnu_make) succeeds;
#     otherwise from $$MAKEFLAGS, with backslash-escaped whitespace removed
#     by sed when present.
#   * Words containing "=" or starting with "--" are ignored.
#   * For option clusters containing I, O or l, strip_trailopt cuts the word
#     at that letter; when the letter is the last character the following
#     word is skipped as well.  The word after -d, -E, -D, -m, -J or -T is
#     also skipped.
#   * The first remaining word that contains target_option sets has_opt=yes;
#     the final "test" turns that into the snippet's exit status.
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
# Subshell wrappers around $(am__make_running_with_option): they set
# target_option to "n" and "k" respectively, so each succeeds when that
# letter is found among the flags make was invoked with.
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
subdir = dft/scalar/codelets
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
|
||||
$(top_srcdir)/m4/acx_pthread.m4 \
|
||||
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
|
||||
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
|
||||
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_version.m4 \
|
||||
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
|
||||
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
|
||||
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
LTLIBRARIES = $(noinst_LTLIBRARIES)
|
||||
libdft_scalar_codelets_la_LIBADD =
|
||||
am__objects_1 = n1_2.lo n1_3.lo n1_4.lo n1_5.lo n1_6.lo n1_7.lo \
|
||||
n1_8.lo n1_9.lo n1_10.lo n1_11.lo n1_12.lo n1_13.lo n1_14.lo \
|
||||
n1_15.lo n1_16.lo n1_32.lo n1_64.lo n1_20.lo n1_25.lo
|
||||
am__objects_2 = t1_2.lo t1_3.lo t1_4.lo t1_5.lo t1_6.lo t1_7.lo \
|
||||
t1_8.lo t1_9.lo t1_10.lo t1_12.lo t1_15.lo t1_16.lo t1_32.lo \
|
||||
t1_64.lo t1_20.lo t1_25.lo
|
||||
am__objects_3 = t2_4.lo t2_8.lo t2_16.lo t2_32.lo t2_64.lo t2_5.lo \
|
||||
t2_10.lo t2_20.lo t2_25.lo
|
||||
am__objects_4 =
|
||||
am__objects_5 = q1_2.lo q1_4.lo q1_8.lo q1_3.lo q1_5.lo q1_6.lo
|
||||
am__objects_6 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
|
||||
$(am__objects_4) $(am__objects_4) $(am__objects_5)
|
||||
am__objects_7 = codlist.lo
|
||||
am__objects_8 = $(am__objects_6) $(am__objects_7)
|
||||
am_libdft_scalar_codelets_la_OBJECTS = $(am__objects_8)
|
||||
libdft_scalar_codelets_la_OBJECTS = \
|
||||
$(am_libdft_scalar_codelets_la_OBJECTS)
|
||||
AM_V_lt = $(am__v_lt_@AM_V@)
|
||||
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
|
||||
am__v_lt_0 = --silent
|
||||
am__v_lt_1 =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
|
||||
depcomp = $(SHELL) $(top_srcdir)/depcomp
|
||||
am__maybe_remake_depfiles = depfiles
|
||||
am__depfiles_remade = ./$(DEPDIR)/codlist.Plo ./$(DEPDIR)/n1_10.Plo \
|
||||
./$(DEPDIR)/n1_11.Plo ./$(DEPDIR)/n1_12.Plo \
|
||||
./$(DEPDIR)/n1_13.Plo ./$(DEPDIR)/n1_14.Plo \
|
||||
./$(DEPDIR)/n1_15.Plo ./$(DEPDIR)/n1_16.Plo \
|
||||
./$(DEPDIR)/n1_2.Plo ./$(DEPDIR)/n1_20.Plo \
|
||||
./$(DEPDIR)/n1_25.Plo ./$(DEPDIR)/n1_3.Plo \
|
||||
./$(DEPDIR)/n1_32.Plo ./$(DEPDIR)/n1_4.Plo \
|
||||
./$(DEPDIR)/n1_5.Plo ./$(DEPDIR)/n1_6.Plo \
|
||||
./$(DEPDIR)/n1_64.Plo ./$(DEPDIR)/n1_7.Plo \
|
||||
./$(DEPDIR)/n1_8.Plo ./$(DEPDIR)/n1_9.Plo ./$(DEPDIR)/q1_2.Plo \
|
||||
./$(DEPDIR)/q1_3.Plo ./$(DEPDIR)/q1_4.Plo ./$(DEPDIR)/q1_5.Plo \
|
||||
./$(DEPDIR)/q1_6.Plo ./$(DEPDIR)/q1_8.Plo \
|
||||
./$(DEPDIR)/t1_10.Plo ./$(DEPDIR)/t1_12.Plo \
|
||||
./$(DEPDIR)/t1_15.Plo ./$(DEPDIR)/t1_16.Plo \
|
||||
./$(DEPDIR)/t1_2.Plo ./$(DEPDIR)/t1_20.Plo \
|
||||
./$(DEPDIR)/t1_25.Plo ./$(DEPDIR)/t1_3.Plo \
|
||||
./$(DEPDIR)/t1_32.Plo ./$(DEPDIR)/t1_4.Plo \
|
||||
./$(DEPDIR)/t1_5.Plo ./$(DEPDIR)/t1_6.Plo \
|
||||
./$(DEPDIR)/t1_64.Plo ./$(DEPDIR)/t1_7.Plo \
|
||||
./$(DEPDIR)/t1_8.Plo ./$(DEPDIR)/t1_9.Plo \
|
||||
./$(DEPDIR)/t2_10.Plo ./$(DEPDIR)/t2_16.Plo \
|
||||
./$(DEPDIR)/t2_20.Plo ./$(DEPDIR)/t2_25.Plo \
|
||||
./$(DEPDIR)/t2_32.Plo ./$(DEPDIR)/t2_4.Plo \
|
||||
./$(DEPDIR)/t2_5.Plo ./$(DEPDIR)/t2_64.Plo \
|
||||
./$(DEPDIR)/t2_8.Plo
|
||||
am__mv = mv -f
|
||||
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
|
||||
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
|
||||
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
|
||||
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
|
||||
$(AM_CFLAGS) $(CFLAGS)
|
||||
AM_V_CC = $(am__v_CC_@AM_V@)
|
||||
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
|
||||
am__v_CC_0 = @echo " CC " $@;
|
||||
am__v_CC_1 =
|
||||
CCLD = $(CC)
|
||||
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
$(AM_LDFLAGS) $(LDFLAGS) -o $@
|
||||
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
|
||||
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
|
||||
am__v_CCLD_0 = @echo " CCLD " $@;
|
||||
am__v_CCLD_1 =
|
||||
SOURCES = $(libdft_scalar_codelets_la_SOURCES)
|
||||
DIST_SOURCES = $(libdft_scalar_codelets_la_SOURCES)
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
# Read a list of newline-separated strings from the standard input,
|
||||
# and print each of them once, without duplicates. Input order is
|
||||
# *not* preserved.
|
||||
am__uniquify_input = $(AWK) '\
|
||||
BEGIN { nonempty = 0; } \
|
||||
{ items[$$0] = 1; nonempty = 1; } \
|
||||
END { if (nonempty) { for (i in items) print i; }; } \
|
||||
'
|
||||
# Make sure the list of sources is unique. This is necessary because,
|
||||
# e.g., the same source file might be shared among _SOURCES variables
|
||||
# for different programs/libraries.
|
||||
am__define_uniq_tagged_files = \
|
||||
list='$(am__tagged_files)'; \
|
||||
unique=`for i in $$list; do \
|
||||
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
|
||||
done | $(am__uniquify_input)`
|
||||
ETAGS = etags
|
||||
CTAGS = ctags
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp \
|
||||
$(top_srcdir)/support/Makefile.codelets
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
ACLOCAL = @ACLOCAL@
|
||||
ALLOCA = @ALLOCA@
|
||||
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AS = @AS@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AVX2_CFLAGS = @AVX2_CFLAGS@
|
||||
AVX512_CFLAGS = @AVX512_CFLAGS@
|
||||
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
|
||||
AVX_CFLAGS = @AVX_CFLAGS@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CHECK_PL_OPTS = @CHECK_PL_OPTS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
|
||||
C_MPI_FINT = @C_MPI_FINT@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
F77 = @F77@
|
||||
FFLAGS = @FFLAGS@
|
||||
FGREP = @FGREP@
|
||||
FLIBS = @FLIBS@
|
||||
GREP = @GREP@
|
||||
INDENT = @INDENT@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
KCVI_CFLAGS = @KCVI_CFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBQUADMATH = @LIBQUADMATH@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
|
||||
MAINT = @MAINT@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
MPICC = @MPICC@
|
||||
MPILIBS = @MPILIBS@
|
||||
MPIRUN = @MPIRUN@
|
||||
NEON_CFLAGS = @NEON_CFLAGS@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OCAMLBUILD = @OCAMLBUILD@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
POW_LIB = @POW_LIB@
|
||||
PRECISION = @PRECISION@
|
||||
PREC_SUFFIX = @PREC_SUFFIX@
|
||||
PTHREAD_CC = @PTHREAD_CC@
|
||||
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
|
||||
PTHREAD_LIBS = @PTHREAD_LIBS@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
|
||||
SHELL = @SHELL@
|
||||
SSE2_CFLAGS = @SSE2_CFLAGS@
|
||||
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
|
||||
STRIP = @STRIP@
|
||||
THREADLIBS = @THREADLIBS@
|
||||
VERSION = @VERSION@
|
||||
VSX_CFLAGS = @VSX_CFLAGS@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
ac_ct_F77 = @ac_ct_F77@
|
||||
acx_pthread_config = @acx_pthread_config@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
runstatedir = @runstatedir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
###########################################################################
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
noinst_LTLIBRARIES = libdft_scalar_codelets.la
|
||||
|
||||
###########################################################################
|
||||
# n1_<n> is a hard-coded FFT of size <n> (base cases of FFT recursion)
|
||||
N1 = n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c n1_9.c n1_10.c \
|
||||
n1_11.c n1_12.c n1_13.c n1_14.c n1_15.c n1_16.c n1_32.c n1_64.c \
|
||||
n1_20.c n1_25.c # n1_30.c n1_40.c n1_50.c
|
||||
|
||||
|
||||
###########################################################################
|
||||
# t1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
|
||||
T1 = t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c t1_7.c t1_8.c t1_9.c \
|
||||
t1_10.c t1_12.c t1_15.c t1_16.c t1_32.c t1_64.c \
|
||||
t1_20.c t1_25.c # t1_30.c t1_40.c t1_50.c
|
||||
|
||||
|
||||
# t2_<r> is also a twiddle FFT, but instead of using a complete lookup table
|
||||
# of trig. functions, it partially generates the trig. values on the fly
|
||||
# (this is faster for large sizes).
|
||||
T2 = t2_4.c t2_8.c t2_16.c t2_32.c t2_64.c \
|
||||
t2_5.c t2_10.c t2_20.c t2_25.c
|
||||
|
||||
|
||||
###########################################################################
|
||||
# The F (DIF) codelets are used for a kind of in-place transform algorithm,
|
||||
# but the planner seems to never (or hardly ever) use them on the machines
|
||||
# we have access to, preferring the Q codelets and the use of buffers
|
||||
# for sub-transforms. So, we comment them out, at least for now.
|
||||
|
||||
# f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step
|
||||
F1 = # f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
|
||||
|
||||
# like f1, but partially generates its trig. table on the fly
|
||||
F2 = # f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
|
||||
|
||||
###########################################################################
|
||||
# q1_<r> is <r> twiddle FFTs of size <r> (DIF step), where the output is
|
||||
# transposed. This is used for in-place transposes in sizes that are
|
||||
# divisible by <r>^2. These codelets have size ~ <r>^2, so you should
|
||||
# probably not use <r> bigger than 8 or so.
|
||||
Q1 = q1_2.c q1_4.c q1_8.c q1_3.c q1_5.c q1_6.c
|
||||
|
||||
###########################################################################
|
||||
# Aggregate lists.  ALL_CODELETS joins the six codelet families defined
# above; together with $(CODLIST) they form BUILT_SOURCES, which is also the
# complete source list of libdft_scalar_codelets.la.
# NOTE(review): SOLVTAB_NAME, XRENAME and CODELET_NAME are not referenced in
# the visible part of this file; presumably they feed the rule that
# generates $(CODLIST) -- confirm against support/Makefile.codelets.
ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
|
||||
BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
|
||||
libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
|
||||
SOLVTAB_NAME = X(solvtab_dft_standard)
|
||||
XRENAME = X
|
||||
CODLIST = codlist.c
|
||||
CODELET_NAME = codelet_
|
||||
|
||||
#INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
|
||||
# Maintainer-mode tool locations and shared generator settings.  Every line
# is prefixed with @MAINTAINER_MODE_TRUE@, so these definitions only take
# effect when configure enables that conditional.
#   TWOVERS       wrapper script run in front of every generator command
#   GEN_*         generator executables, all under $(GENFFTDIR)
#   PRELUDE_*     prelude files that are concatenated after $(COPYRIGHT)
#   ADD_DATE      sed filter replacing the date placeholder with `date` output
#   *_FLAGS_COMMON  options shared by all DFT / RDFT generator invocations
@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
|
||||
@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
|
||||
@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
|
||||
@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
|
||||
@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
|
||||
# NOTE(review): $(PRELUDE) is not defined anywhere in the visible text (only
# PRELUDE_DFT and PRELUDE_RDFT are), so it presumably expands to nothing and
# the prelude files are not tracked as codelet prerequisites -- confirm.
@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE)
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
|
||||
@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
|
||||
@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
|
||||
|
||||
# special rules for regenerating codelets.
|
||||
# Per-family generator flags (maintainer mode only).  Each family starts
# from $(DFT_FLAGS_COMMON); T2, F2 and Q2 add
# "-twiddle-log3 -precompute-twiddles", and Q1 adds "-reload-twiddle".
@MAINTAINER_MODE_TRUE@FLAGS_N1 = $(DFT_FLAGS_COMMON)
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_T1 = $(DFT_FLAGS_COMMON)
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_T2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_F1 = $(DFT_FLAGS_COMMON)
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_F2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_Q1 = $(DFT_FLAGS_COMMON) -reload-twiddle
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_Q2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
# Default goal: make sure every file in $(BUILT_SOURCES) is up to date
# first, then re-invoke make in this directory for the real work (all-am).
all: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) all-am
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .lo .o .obj
|
||||
# Regenerate this directory's Makefile.in.
# For each prerequisite newer than the target ($?) that also appears in
# $(am__configure_deps), run "am--refresh" in $(top_builddir): if that
# succeeds and the target exists the recipe exits 0, if it succeeds but the
# target is missing the loop is left, and if it fails the recipe exits 1.
# When the loop finishes without exiting, automake is run from
# $(top_srcdir) for dft/scalar/codelets/Makefile.
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/scalar/codelets/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu dft/scalar/codelets/Makefile
|
||||
# Regenerate this Makefile.  If config.status is among the prerequisites
# that changed ($?), delegate to "am--refresh" in $(top_builddir);
# otherwise echo and run config.status for just $(subdir)/Makefile, passing
# $(am__maybe_remake_depfiles) as an extra argument.
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
|
||||
esac;
|
||||
$(top_srcdir)/support/Makefile.codelets $(am__empty):
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
# Remove the non-installed libtool libraries, then remove a "so_locations"
# file from every directory that holds one of them.  The sed script maps a
# library name without a slash to ".", otherwise strips the last path
# component, and appends "/so_locations"; "sort -u" drops duplicates.
# Errors from the first rm are ignored (leading "-").
clean-noinstLTLIBRARIES:
|
||||
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
|
||||
@list='$(noinst_LTLIBRARIES)'; \
|
||||
locs=`for p in $$list; do echo $$p; done | \
|
||||
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
|
||||
sort -u`; \
|
||||
test -z "$$locs" || { \
|
||||
echo rm -f $${locs}; \
|
||||
rm -f $${locs}; \
|
||||
}
|
||||
|
||||
# Link the convenience library from its objects via $(LINK) (libtool in
# link mode), adding $(libdft_scalar_codelets_la_LIBADD) and $(LIBS).
# $(AM_V_CCLD) supplies the short "CCLD" progress line in silent builds.
libdft_scalar_codelets.la: $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_DEPENDENCIES) $(EXTRA_libdft_scalar_codelets_la_DEPENDENCIES)
|
||||
$(AM_V_CCLD)$(LINK) $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_LIBADD) $(LIBS)
|
||||
|
||||
mostlyclean-compile:
|
||||
-rm -f *.$(OBJEXT)
|
||||
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_10.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_11.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_12.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_13.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_14.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_15.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_16.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_20.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_25.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_3.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_32.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_4.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_5.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_6.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_64.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_7.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_8.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_9.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_3.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_4.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_5.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_6.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_8.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_10.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_12.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_15.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_16.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_20.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_25.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_3.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_32.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_4.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_5.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_6.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_64.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_7.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_8.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_9.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_10.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_16.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_20.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_25.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_32.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_4.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_5.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_64.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_8.Plo@am__quote@ # am--include-marker
|
||||
|
||||
# Create a placeholder for any missing dependency file: make its directory,
# write the line "# dummy" to a temporary "<target>-t" file, and rename
# that onto the target so the file never appears half-written.
$(am__depfiles_remade):
|
||||
@$(MKDIR_P) $(@D)
|
||||
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
|
||||
|
||||
am--depfiles: $(am__depfiles_remade)
|
||||
|
||||
# Suffix rule: compile a .c file into a .o object with $(COMPILE).
# The recipe lines are alternatives gated by configure substitutions:
#   @am__fastdepCC_TRUE@   compile while writing dependencies to
#                          $(DEPDIR)/$*.Tpo, then rename that file to .Po;
#   @am__fastdepCC_FALSE@  plain compile, preceded (when @AMDEP_TRUE@ is
#                          active) by the environment set-up for $(depcomp).
# NOTE(review): which set is live depends on how config.status fills in the
# @...@ markers -- confirm against the configured Makefile.
.c.o:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
|
||||
|
||||
# Suffix rule: compile a .c file into a .obj object with $(COMPILE).
# Same structure as the .c.o rule, except that the source path is passed
# through $(CYGPATH_W) before being handed to the compiler.
.c.obj:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
|
||||
# Suffix rule: compile a .c file into a libtool object (.lo) with
# $(LTCOMPILE).  Same structure as the .c.o rule, except that the
# dependency file is renamed to $(DEPDIR)/$*.Plo and the depcomp variant
# sets libtool=yes.
.c.lo:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
|
||||
|
||||
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
|
||||
ID: $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); mkid -fID $$unique
|
||||
tags: tags-am
|
||||
TAGS: tags
|
||||
|
||||
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
set x; \
|
||||
here=`pwd`; \
|
||||
$(am__define_uniq_tagged_files); \
|
||||
shift; \
|
||||
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
|
||||
test -n "$$unique" || unique=$$empty_fix; \
|
||||
if test $$# -gt 0; then \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
"$$@" $$unique; \
|
||||
else \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
$$unique; \
|
||||
fi; \
|
||||
fi
|
||||
ctags: ctags-am
|
||||
|
||||
CTAGS: ctags
|
||||
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); \
|
||||
test -z "$(CTAGS_ARGS)$$unique" \
|
||||
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
|
||||
$$unique
|
||||
|
||||
GTAGS:
|
||||
here=`$(am__cd) $(top_builddir) && pwd` \
|
||||
&& $(am__cd) $(top_srcdir) \
|
||||
&& gtags -i $(GTAGS_ARGS) "$$here"
|
||||
cscopelist: cscopelist-am
|
||||
|
||||
cscopelist-am: $(am__tagged_files)
|
||||
list='$(am__tagged_files)'; \
|
||||
case "$(srcdir)" in \
|
||||
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
|
||||
*) sdir=$(subdir)/$(srcdir) ;; \
|
||||
esac; \
|
||||
for i in $$list; do \
|
||||
if test -f "$$i"; then \
|
||||
echo "$(subdir)/$$i"; \
|
||||
else \
|
||||
echo "$$sdir/$$i"; \
|
||||
fi; \
|
||||
done >> $(top_builddir)/cscope.files
|
||||
|
||||
distclean-tags:
|
||||
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
|
||||
|
||||
# Distribution entry point: bring $(BUILT_SOURCES) up to date first, then
# re-invoke make for distdir-am, which copies $(DISTFILES) into $(distdir).
distdir: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) distdir-am
|
||||
|
||||
distdir-am: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) check-am
|
||||
all-am: Makefile $(LTLIBRARIES)
|
||||
installdirs:
|
||||
install: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) install-am
|
||||
install-exec: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) install-exec-am
|
||||
install-data: install-data-am
|
||||
uninstall: uninstall-am
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-am
|
||||
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
|
||||
clean: clean-am
|
||||
|
||||
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
|
||||
mostlyclean-am
|
||||
|
||||
distclean: distclean-am
|
||||
-rm -f ./$(DEPDIR)/codlist.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_10.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_11.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_12.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_13.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_14.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_15.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_16.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_20.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_25.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_32.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_64.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_7.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_9.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_10.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_12.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_15.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_16.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_20.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_25.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_32.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_64.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_7.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_9.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_10.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_16.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_20.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_25.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_32.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_4.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_5.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_64.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_8.Plo
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-compile distclean-generic \
|
||||
distclean-tags
|
||||
|
||||
dvi: dvi-am
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-am
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-am
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am:
|
||||
|
||||
install-dvi: install-dvi-am
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am:
|
||||
|
||||
install-html: install-html-am
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-am
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-am
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-am
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-am
|
||||
-rm -f ./$(DEPDIR)/codlist.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_10.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_11.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_12.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_13.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_14.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_15.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_16.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_20.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_25.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_32.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_64.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_7.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_9.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_10.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_12.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_15.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_16.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_20.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_25.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_32.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_64.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_7.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_9.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_10.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_16.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_20.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_25.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_32.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_4.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_5.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_64.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_8.Plo
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic \
|
||||
maintainer-clean-local
|
||||
|
||||
mostlyclean: mostlyclean-am
|
||||
|
||||
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool
|
||||
|
||||
pdf: pdf-am
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-am
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am:
|
||||
|
||||
.MAKE: all check install install-am install-exec install-strip
|
||||
|
||||
.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
|
||||
clean-generic clean-libtool clean-noinstLTLIBRARIES \
|
||||
cscopelist-am ctags ctags-am distclean distclean-compile \
|
||||
distclean-generic distclean-libtool distclean-tags distdir dvi \
|
||||
dvi-am html html-am info info-am install install-am \
|
||||
install-data install-data-am install-dvi install-dvi-am \
|
||||
install-exec install-exec-am install-html install-html-am \
|
||||
install-info install-info-am install-man install-pdf \
|
||||
install-pdf-am install-ps install-ps-am install-strip \
|
||||
installcheck installcheck-am installdirs maintainer-clean \
|
||||
maintainer-clean-generic maintainer-clean-local mostlyclean \
|
||||
mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
|
||||
pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# only delete codlist.c in maintainer-mode, since it is included in the dist
|
||||
# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
|
||||
maintainer-clean-local:
|
||||
rm -f $(CODLIST)
|
||||
|
||||
# rule to build codlist
|
||||
@MAINTAINER_MODE_TRUE@$(CODLIST): Makefile
|
||||
@MAINTAINER_MODE_TRUE@ ( \
|
||||
@MAINTAINER_MODE_TRUE@ echo "#include \"kernel/ifftw.h\""; \
|
||||
@MAINTAINER_MODE_TRUE@ echo $(INCLUDE_SIMD_HEADER); \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
|
||||
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
|
||||
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);"; \
|
||||
@MAINTAINER_MODE_TRUE@ fi \
|
||||
@MAINTAINER_MODE_TRUE@ done; \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "extern const solvtab $(SOLVTAB_NAME);"; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "const solvtab $(SOLVTAB_NAME) = {"; \
|
||||
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
|
||||
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
|
||||
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
|
||||
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),"; \
|
||||
@MAINTAINER_MODE_TRUE@ fi \
|
||||
@MAINTAINER_MODE_TRUE@ done; \
|
||||
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB_END"; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "};"; \
|
||||
@MAINTAINER_MODE_TRUE@ ) >$@
|
||||
|
||||
# cancel the hideous builtin rules that cause an infinite loop
|
||||
@MAINTAINER_MODE_TRUE@%: %.o
|
||||
@MAINTAINER_MODE_TRUE@%: %.s
|
||||
@MAINTAINER_MODE_TRUE@%: %.c
|
||||
@MAINTAINER_MODE_TRUE@%: %.S
|
||||
|
||||
@MAINTAINER_MODE_TRUE@n1_%.c: $(CODELET_DEPS) $(GEN_NOTW)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "dft/scalar/n.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@t1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@t2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@f1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@f2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@q1_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@q2_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
@@ -0,0 +1,109 @@
|
||||
#include "kernel/ifftw.h"
|
||||
|
||||
|
||||
extern void X(codelet_n1_2)(planner *);
|
||||
extern void X(codelet_n1_3)(planner *);
|
||||
extern void X(codelet_n1_4)(planner *);
|
||||
extern void X(codelet_n1_5)(planner *);
|
||||
extern void X(codelet_n1_6)(planner *);
|
||||
extern void X(codelet_n1_7)(planner *);
|
||||
extern void X(codelet_n1_8)(planner *);
|
||||
extern void X(codelet_n1_9)(planner *);
|
||||
extern void X(codelet_n1_10)(planner *);
|
||||
extern void X(codelet_n1_11)(planner *);
|
||||
extern void X(codelet_n1_12)(planner *);
|
||||
extern void X(codelet_n1_13)(planner *);
|
||||
extern void X(codelet_n1_14)(planner *);
|
||||
extern void X(codelet_n1_15)(planner *);
|
||||
extern void X(codelet_n1_16)(planner *);
|
||||
extern void X(codelet_n1_32)(planner *);
|
||||
extern void X(codelet_n1_64)(planner *);
|
||||
extern void X(codelet_n1_20)(planner *);
|
||||
extern void X(codelet_n1_25)(planner *);
|
||||
extern void X(codelet_t1_2)(planner *);
|
||||
extern void X(codelet_t1_3)(planner *);
|
||||
extern void X(codelet_t1_4)(planner *);
|
||||
extern void X(codelet_t1_5)(planner *);
|
||||
extern void X(codelet_t1_6)(planner *);
|
||||
extern void X(codelet_t1_7)(planner *);
|
||||
extern void X(codelet_t1_8)(planner *);
|
||||
extern void X(codelet_t1_9)(planner *);
|
||||
extern void X(codelet_t1_10)(planner *);
|
||||
extern void X(codelet_t1_12)(planner *);
|
||||
extern void X(codelet_t1_15)(planner *);
|
||||
extern void X(codelet_t1_16)(planner *);
|
||||
extern void X(codelet_t1_32)(planner *);
|
||||
extern void X(codelet_t1_64)(planner *);
|
||||
extern void X(codelet_t1_20)(planner *);
|
||||
extern void X(codelet_t1_25)(planner *);
|
||||
extern void X(codelet_t2_4)(planner *);
|
||||
extern void X(codelet_t2_8)(planner *);
|
||||
extern void X(codelet_t2_16)(planner *);
|
||||
extern void X(codelet_t2_32)(planner *);
|
||||
extern void X(codelet_t2_64)(planner *);
|
||||
extern void X(codelet_t2_5)(planner *);
|
||||
extern void X(codelet_t2_10)(planner *);
|
||||
extern void X(codelet_t2_20)(planner *);
|
||||
extern void X(codelet_t2_25)(planner *);
|
||||
extern void X(codelet_q1_2)(planner *);
|
||||
extern void X(codelet_q1_4)(planner *);
|
||||
extern void X(codelet_q1_8)(planner *);
|
||||
extern void X(codelet_q1_3)(planner *);
|
||||
extern void X(codelet_q1_5)(planner *);
|
||||
extern void X(codelet_q1_6)(planner *);
|
||||
|
||||
|
||||
extern const solvtab X(solvtab_dft_standard);
|
||||
const solvtab X(solvtab_dft_standard) = {
|
||||
SOLVTAB(X(codelet_n1_2)),
|
||||
SOLVTAB(X(codelet_n1_3)),
|
||||
SOLVTAB(X(codelet_n1_4)),
|
||||
SOLVTAB(X(codelet_n1_5)),
|
||||
SOLVTAB(X(codelet_n1_6)),
|
||||
SOLVTAB(X(codelet_n1_7)),
|
||||
SOLVTAB(X(codelet_n1_8)),
|
||||
SOLVTAB(X(codelet_n1_9)),
|
||||
SOLVTAB(X(codelet_n1_10)),
|
||||
SOLVTAB(X(codelet_n1_11)),
|
||||
SOLVTAB(X(codelet_n1_12)),
|
||||
SOLVTAB(X(codelet_n1_13)),
|
||||
SOLVTAB(X(codelet_n1_14)),
|
||||
SOLVTAB(X(codelet_n1_15)),
|
||||
SOLVTAB(X(codelet_n1_16)),
|
||||
SOLVTAB(X(codelet_n1_32)),
|
||||
SOLVTAB(X(codelet_n1_64)),
|
||||
SOLVTAB(X(codelet_n1_20)),
|
||||
SOLVTAB(X(codelet_n1_25)),
|
||||
SOLVTAB(X(codelet_t1_2)),
|
||||
SOLVTAB(X(codelet_t1_3)),
|
||||
SOLVTAB(X(codelet_t1_4)),
|
||||
SOLVTAB(X(codelet_t1_5)),
|
||||
SOLVTAB(X(codelet_t1_6)),
|
||||
SOLVTAB(X(codelet_t1_7)),
|
||||
SOLVTAB(X(codelet_t1_8)),
|
||||
SOLVTAB(X(codelet_t1_9)),
|
||||
SOLVTAB(X(codelet_t1_10)),
|
||||
SOLVTAB(X(codelet_t1_12)),
|
||||
SOLVTAB(X(codelet_t1_15)),
|
||||
SOLVTAB(X(codelet_t1_16)),
|
||||
SOLVTAB(X(codelet_t1_32)),
|
||||
SOLVTAB(X(codelet_t1_64)),
|
||||
SOLVTAB(X(codelet_t1_20)),
|
||||
SOLVTAB(X(codelet_t1_25)),
|
||||
SOLVTAB(X(codelet_t2_4)),
|
||||
SOLVTAB(X(codelet_t2_8)),
|
||||
SOLVTAB(X(codelet_t2_16)),
|
||||
SOLVTAB(X(codelet_t2_32)),
|
||||
SOLVTAB(X(codelet_t2_64)),
|
||||
SOLVTAB(X(codelet_t2_5)),
|
||||
SOLVTAB(X(codelet_t2_10)),
|
||||
SOLVTAB(X(codelet_t2_20)),
|
||||
SOLVTAB(X(codelet_t2_25)),
|
||||
SOLVTAB(X(codelet_q1_2)),
|
||||
SOLVTAB(X(codelet_q1_4)),
|
||||
SOLVTAB(X(codelet_q1_8)),
|
||||
SOLVTAB(X(codelet_q1_3)),
|
||||
SOLVTAB(X(codelet_q1_5)),
|
||||
SOLVTAB(X(codelet_q1_6)),
|
||||
SOLVTAB_END
|
||||
};
|
||||
@@ -0,0 +1,362 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 84 FP additions, 36 FP multiplications,
|
||||
* (or, 48 additions, 0 multiplications, 36 fused multiply/add),
|
||||
* 41 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
|
||||
E T3, Tj, TN, T1b, TU, TV, T1j, T1i, Tm, Tp, Tq, Ta, Th, Ti, TA;
|
||||
E TH, T17, T14, T1c, T1d, T1e, TO, TP, TQ;
|
||||
{
|
||||
E T1, T2, TL, TM;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 5)];
|
||||
T3 = T1 - T2;
|
||||
Tj = T1 + T2;
|
||||
TL = ii[0];
|
||||
TM = ii[WS(is, 5)];
|
||||
TN = TL - TM;
|
||||
T1b = TL + TM;
|
||||
}
|
||||
{
|
||||
E T6, Tk, Tg, To, T9, Tl, Td, Tn;
|
||||
{
|
||||
E T4, T5, Te, Tf;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 7)];
|
||||
T6 = T4 - T5;
|
||||
Tk = T4 + T5;
|
||||
Te = ri[WS(is, 6)];
|
||||
Tf = ri[WS(is, 1)];
|
||||
Tg = Te - Tf;
|
||||
To = Te + Tf;
|
||||
}
|
||||
{
|
||||
E T7, T8, Tb, Tc;
|
||||
T7 = ri[WS(is, 8)];
|
||||
T8 = ri[WS(is, 3)];
|
||||
T9 = T7 - T8;
|
||||
Tl = T7 + T8;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 9)];
|
||||
Td = Tb - Tc;
|
||||
Tn = Tb + Tc;
|
||||
}
|
||||
TU = T6 - T9;
|
||||
TV = Td - Tg;
|
||||
T1j = Tk - Tl;
|
||||
T1i = Tn - To;
|
||||
Tm = Tk + Tl;
|
||||
Tp = Tn + To;
|
||||
Tq = Tm + Tp;
|
||||
Ta = T6 + T9;
|
||||
Th = Td + Tg;
|
||||
Ti = Ta + Th;
|
||||
}
|
||||
{
|
||||
E Tw, T15, TG, T13, Tz, T16, TD, T12;
|
||||
{
|
||||
E Tu, Tv, TE, TF;
|
||||
Tu = ii[WS(is, 2)];
|
||||
Tv = ii[WS(is, 7)];
|
||||
Tw = Tu - Tv;
|
||||
T15 = Tu + Tv;
|
||||
TE = ii[WS(is, 6)];
|
||||
TF = ii[WS(is, 1)];
|
||||
TG = TE - TF;
|
||||
T13 = TE + TF;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, TB, TC;
|
||||
Tx = ii[WS(is, 8)];
|
||||
Ty = ii[WS(is, 3)];
|
||||
Tz = Tx - Ty;
|
||||
T16 = Tx + Ty;
|
||||
TB = ii[WS(is, 4)];
|
||||
TC = ii[WS(is, 9)];
|
||||
TD = TB - TC;
|
||||
T12 = TB + TC;
|
||||
}
|
||||
TA = Tw - Tz;
|
||||
TH = TD - TG;
|
||||
T17 = T15 - T16;
|
||||
T14 = T12 - T13;
|
||||
T1c = T15 + T16;
|
||||
T1d = T12 + T13;
|
||||
T1e = T1c + T1d;
|
||||
TO = Tw + Tz;
|
||||
TP = TD + TG;
|
||||
TQ = TO + TP;
|
||||
}
|
||||
ro[WS(os, 5)] = T3 + Ti;
|
||||
io[WS(os, 5)] = TN + TQ;
|
||||
ro[0] = Tj + Tq;
|
||||
io[0] = T1b + T1e;
|
||||
{
|
||||
E TI, TK, Tt, TJ, Tr, Ts;
|
||||
TI = FMA(KP618033988, TH, TA);
|
||||
TK = FNMS(KP618033988, TA, TH);
|
||||
Tr = FNMS(KP250000000, Ti, T3);
|
||||
Ts = Ta - Th;
|
||||
Tt = FMA(KP559016994, Ts, Tr);
|
||||
TJ = FNMS(KP559016994, Ts, Tr);
|
||||
ro[WS(os, 9)] = FNMS(KP951056516, TI, Tt);
|
||||
ro[WS(os, 3)] = FMA(KP951056516, TK, TJ);
|
||||
ro[WS(os, 1)] = FMA(KP951056516, TI, Tt);
|
||||
ro[WS(os, 7)] = FNMS(KP951056516, TK, TJ);
|
||||
}
|
||||
{
|
||||
E TW, TY, TT, TX, TR, TS;
|
||||
TW = FMA(KP618033988, TV, TU);
|
||||
TY = FNMS(KP618033988, TU, TV);
|
||||
TR = FNMS(KP250000000, TQ, TN);
|
||||
TS = TO - TP;
|
||||
TT = FMA(KP559016994, TS, TR);
|
||||
TX = FNMS(KP559016994, TS, TR);
|
||||
io[WS(os, 1)] = FNMS(KP951056516, TW, TT);
|
||||
io[WS(os, 7)] = FMA(KP951056516, TY, TX);
|
||||
io[WS(os, 9)] = FMA(KP951056516, TW, TT);
|
||||
io[WS(os, 3)] = FNMS(KP951056516, TY, TX);
|
||||
}
|
||||
{
|
||||
E T18, T1a, T11, T19, TZ, T10;
|
||||
T18 = FNMS(KP618033988, T17, T14);
|
||||
T1a = FMA(KP618033988, T14, T17);
|
||||
TZ = FNMS(KP250000000, Tq, Tj);
|
||||
T10 = Tm - Tp;
|
||||
T11 = FNMS(KP559016994, T10, TZ);
|
||||
T19 = FMA(KP559016994, T10, TZ);
|
||||
ro[WS(os, 2)] = FNMS(KP951056516, T18, T11);
|
||||
ro[WS(os, 6)] = FMA(KP951056516, T1a, T19);
|
||||
ro[WS(os, 8)] = FMA(KP951056516, T18, T11);
|
||||
ro[WS(os, 4)] = FNMS(KP951056516, T1a, T19);
|
||||
}
|
||||
{
|
||||
E T1k, T1m, T1h, T1l, T1f, T1g;
|
||||
T1k = FNMS(KP618033988, T1j, T1i);
|
||||
T1m = FMA(KP618033988, T1i, T1j);
|
||||
T1f = FNMS(KP250000000, T1e, T1b);
|
||||
T1g = T1c - T1d;
|
||||
T1h = FNMS(KP559016994, T1g, T1f);
|
||||
T1l = FMA(KP559016994, T1g, T1f);
|
||||
io[WS(os, 2)] = FMA(KP951056516, T1k, T1h);
|
||||
io[WS(os, 6)] = FNMS(KP951056516, T1m, T1l);
|
||||
io[WS(os, 8)] = FNMS(KP951056516, T1k, T1h);
|
||||
io[WS(os, 4)] = FMA(KP951056516, T1m, T1l);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 10, "n1_10", { 48, 0, 36, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_10) (planner *p) { X(kdft_register) (p, n1_10, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 84 FP additions, 24 FP multiplications,
|
||||
* (or, 72 additions, 12 multiplications, 12 fused multiply/add),
|
||||
* 41 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
|
||||
E T3, Tj, TQ, T1e, TU, TV, T1c, T1b, Tm, Tp, Tq, Ta, Th, Ti, TA;
|
||||
E TH, T17, T14, T1f, T1g, T1h, TL, TM, TR;
|
||||
{
|
||||
E T1, T2, TO, TP;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 5)];
|
||||
T3 = T1 - T2;
|
||||
Tj = T1 + T2;
|
||||
TO = ii[0];
|
||||
TP = ii[WS(is, 5)];
|
||||
TQ = TO - TP;
|
||||
T1e = TO + TP;
|
||||
}
|
||||
{
|
||||
E T6, Tk, Tg, To, T9, Tl, Td, Tn;
|
||||
{
|
||||
E T4, T5, Te, Tf;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 7)];
|
||||
T6 = T4 - T5;
|
||||
Tk = T4 + T5;
|
||||
Te = ri[WS(is, 6)];
|
||||
Tf = ri[WS(is, 1)];
|
||||
Tg = Te - Tf;
|
||||
To = Te + Tf;
|
||||
}
|
||||
{
|
||||
E T7, T8, Tb, Tc;
|
||||
T7 = ri[WS(is, 8)];
|
||||
T8 = ri[WS(is, 3)];
|
||||
T9 = T7 - T8;
|
||||
Tl = T7 + T8;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 9)];
|
||||
Td = Tb - Tc;
|
||||
Tn = Tb + Tc;
|
||||
}
|
||||
TU = T6 - T9;
|
||||
TV = Td - Tg;
|
||||
T1c = Tk - Tl;
|
||||
T1b = Tn - To;
|
||||
Tm = Tk + Tl;
|
||||
Tp = Tn + To;
|
||||
Tq = Tm + Tp;
|
||||
Ta = T6 + T9;
|
||||
Th = Td + Tg;
|
||||
Ti = Ta + Th;
|
||||
}
|
||||
{
|
||||
E Tw, T15, TG, T13, Tz, T16, TD, T12;
|
||||
{
|
||||
E Tu, Tv, TE, TF;
|
||||
Tu = ii[WS(is, 2)];
|
||||
Tv = ii[WS(is, 7)];
|
||||
Tw = Tu - Tv;
|
||||
T15 = Tu + Tv;
|
||||
TE = ii[WS(is, 6)];
|
||||
TF = ii[WS(is, 1)];
|
||||
TG = TE - TF;
|
||||
T13 = TE + TF;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, TB, TC;
|
||||
Tx = ii[WS(is, 8)];
|
||||
Ty = ii[WS(is, 3)];
|
||||
Tz = Tx - Ty;
|
||||
T16 = Tx + Ty;
|
||||
TB = ii[WS(is, 4)];
|
||||
TC = ii[WS(is, 9)];
|
||||
TD = TB - TC;
|
||||
T12 = TB + TC;
|
||||
}
|
||||
TA = Tw - Tz;
|
||||
TH = TD - TG;
|
||||
T17 = T15 - T16;
|
||||
T14 = T12 - T13;
|
||||
T1f = T15 + T16;
|
||||
T1g = T12 + T13;
|
||||
T1h = T1f + T1g;
|
||||
TL = Tw + Tz;
|
||||
TM = TD + TG;
|
||||
TR = TL + TM;
|
||||
}
|
||||
ro[WS(os, 5)] = T3 + Ti;
|
||||
io[WS(os, 5)] = TQ + TR;
|
||||
ro[0] = Tj + Tq;
|
||||
io[0] = T1e + T1h;
|
||||
{
|
||||
E TI, TK, Tt, TJ, Tr, Ts;
|
||||
TI = FMA(KP951056516, TA, KP587785252 * TH);
|
||||
TK = FNMS(KP587785252, TA, KP951056516 * TH);
|
||||
Tr = KP559016994 * (Ta - Th);
|
||||
Ts = FNMS(KP250000000, Ti, T3);
|
||||
Tt = Tr + Ts;
|
||||
TJ = Ts - Tr;
|
||||
ro[WS(os, 9)] = Tt - TI;
|
||||
ro[WS(os, 3)] = TJ + TK;
|
||||
ro[WS(os, 1)] = Tt + TI;
|
||||
ro[WS(os, 7)] = TJ - TK;
|
||||
}
|
||||
{
|
||||
E TW, TY, TT, TX, TN, TS;
|
||||
TW = FMA(KP951056516, TU, KP587785252 * TV);
|
||||
TY = FNMS(KP587785252, TU, KP951056516 * TV);
|
||||
TN = KP559016994 * (TL - TM);
|
||||
TS = FNMS(KP250000000, TR, TQ);
|
||||
TT = TN + TS;
|
||||
TX = TS - TN;
|
||||
io[WS(os, 1)] = TT - TW;
|
||||
io[WS(os, 7)] = TY + TX;
|
||||
io[WS(os, 9)] = TW + TT;
|
||||
io[WS(os, 3)] = TX - TY;
|
||||
}
|
||||
{
|
||||
E T18, T1a, T11, T19, TZ, T10;
|
||||
T18 = FNMS(KP587785252, T17, KP951056516 * T14);
|
||||
T1a = FMA(KP951056516, T17, KP587785252 * T14);
|
||||
TZ = FNMS(KP250000000, Tq, Tj);
|
||||
T10 = KP559016994 * (Tm - Tp);
|
||||
T11 = TZ - T10;
|
||||
T19 = T10 + TZ;
|
||||
ro[WS(os, 2)] = T11 - T18;
|
||||
ro[WS(os, 6)] = T19 + T1a;
|
||||
ro[WS(os, 8)] = T11 + T18;
|
||||
ro[WS(os, 4)] = T19 - T1a;
|
||||
}
|
||||
{
|
||||
E T1d, T1l, T1k, T1m, T1i, T1j;
|
||||
T1d = FNMS(KP587785252, T1c, KP951056516 * T1b);
|
||||
T1l = FMA(KP951056516, T1c, KP587785252 * T1b);
|
||||
T1i = FNMS(KP250000000, T1h, T1e);
|
||||
T1j = KP559016994 * (T1f - T1g);
|
||||
T1k = T1i - T1j;
|
||||
T1m = T1j + T1i;
|
||||
io[WS(os, 2)] = T1d + T1k;
|
||||
io[WS(os, 6)] = T1m - T1l;
|
||||
io[WS(os, 8)] = T1k - T1d;
|
||||
io[WS(os, 4)] = T1l + T1m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 10, "n1_10", { 72, 12, 12, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_10) (planner *p) { X(kdft_register) (p, n1_10, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,426 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 140 FP additions, 110 FP multiplications,
|
||||
* (or, 30 additions, 0 multiplications, 110 fused multiply/add),
|
||||
* 62 stack variables, 10 constants, and 44 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_11, fused-multiply-add variant: size-11 DFT kernel on split
 * real/imaginary arrays (emitted by gen_notw with -fma -n 11).
 *
 * ri, ii   - input real / imaginary arrays (read only), indexed WS(is, k)
 * ro, io   - output real / imaginary arrays, indexed WS(os, k)
 * is, os   - input / output strides passed to WS()
 * v        - iteration count; one 11-point transform per iteration
 * ivs, ovs - added to the input / output pointers after each iteration
 *
 * NOTE(review): R, E, stride, INT, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from project headers not visible here.
 * FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) presumably
 * c - a*b -- confirm against those headers.
 */
static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants consumed by the multiply-add chains below. */
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
|
||||
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
|
||||
DK(KP918985947, +0.918985947228994779780736114132655398124909697);
|
||||
DK(KP830830026, +0.830830026003772851058548298459246407048009821);
|
||||
DK(KP876768831, +0.876768831002589333891339807079336796764054852);
|
||||
DK(KP778434453, +0.778434453334651800608337670740821884709317477);
|
||||
DK(KP715370323, +0.715370323453429719112414662767260662417897278);
|
||||
DK(KP521108558, +0.521108558113202722944698153526659300680427422);
|
||||
DK(KP634356270, +0.634356270682424498893150776899916060542806975);
|
||||
DK(KP342584725, +0.342584725681637509502641509861112333758894680);
|
||||
{
|
||||
INT i;
|
||||
/* Counts i down from v; all four data pointers advance after every pass. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
|
||||
/* Temporaries that outlive the nested scopes in which they are set. */
E T1, T1f, T4, T1u, Tg, T1q, T7, T1t, Ta, T1s, Td, T1r, Ti, TP, T26;
|
||||
E TG, T1X, T1O, T1w, TY, T1F, T17, To, T1i, TA, T1k, Tr, T1h, Tu, T1j;
|
||||
E Tx, T1g, TC, TU, T21, TL, T1S, T1J, T1m, T13, T1A, T1c;
|
||||
/* Element 0: real part T1, imaginary part T1f. */
T1 = ri[0];
|
||||
T1f = ii[0];
|
||||
{
|
||||
E T5, T6, Tp, Tq;
|
||||
/*
 * Real inputs are paired as (k, 11 - k).  Each pair yields a sum
 * (T4, T7, Ta, Td, Tg) and a "higher index minus lower index"
 * difference (T1u, T1t, T1s, T1r, T1q).
 */
{
|
||||
E T2, T3, Te, Tf;
|
||||
T2 = ri[WS(is, 1)];
|
||||
T3 = ri[WS(is, 10)];
|
||||
T4 = T2 + T3;
|
||||
T1u = T3 - T2;
|
||||
Te = ri[WS(is, 5)];
|
||||
Tf = ri[WS(is, 6)];
|
||||
Tg = Te + Tf;
|
||||
T1q = Tf - Te;
|
||||
}
|
||||
T5 = ri[WS(is, 2)];
|
||||
T6 = ri[WS(is, 9)];
|
||||
T7 = T5 + T6;
|
||||
T1t = T6 - T5;
|
||||
{
|
||||
E T8, T9, Tb, Tc;
|
||||
T8 = ri[WS(is, 3)];
|
||||
T9 = ri[WS(is, 8)];
|
||||
Ta = T8 + T9;
|
||||
T1s = T9 - T8;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 7)];
|
||||
Td = Tb + Tc;
|
||||
T1r = Tc - Tb;
|
||||
}
|
||||
/*
 * First two links of each multiply-add chain built from the real
 * sums and real differences; the chains are finished in the output
 * blocks further down.
 */
{
|
||||
E Th, TO, T25, TF, T1W;
|
||||
Th = FNMS(KP342584725, Ta, T7);
|
||||
Ti = FNMS(KP634356270, Th, Td);
|
||||
TO = FNMS(KP342584725, T4, Ta);
|
||||
TP = FNMS(KP634356270, TO, Tg);
|
||||
T25 = FMA(KP521108558, T1q, T1u);
|
||||
T26 = FMA(KP715370323, T25, T1r);
|
||||
TF = FNMS(KP342584725, Td, T4);
|
||||
TG = FNMS(KP634356270, TF, T7);
|
||||
T1W = FMA(KP521108558, T1s, T1q);
|
||||
T1X = FNMS(KP715370323, T1W, T1t);
|
||||
}
|
||||
{
|
||||
E T1N, T1v, TX, T1E, T16;
|
||||
T1N = FNMS(KP521108558, T1t, T1r);
|
||||
T1O = FMA(KP715370323, T1N, T1q);
|
||||
T1v = FNMS(KP521108558, T1u, T1t);
|
||||
T1w = FNMS(KP715370323, T1v, T1s);
|
||||
TX = FNMS(KP342584725, T7, Tg);
|
||||
TY = FNMS(KP634356270, TX, T4);
|
||||
T1E = FMA(KP521108558, T1r, T1s);
|
||||
T1F = FMA(KP715370323, T1E, T1u);
|
||||
T16 = FNMS(KP342584725, Tg, Td);
|
||||
T17 = FNMS(KP634356270, T16, Ta);
|
||||
}
|
||||
/*
 * Imaginary inputs, same (k, 11 - k) pairing.  Here the differences
 * (To, TA, Tr, Tu, Tx) are "lower index minus higher index", the
 * opposite orientation to the real differences above; the sums are
 * T1i, T1k, T1h, T1j, T1g.
 */
{
|
||||
E Tm, Tn, Ty, Tz;
|
||||
Tm = ii[WS(is, 3)];
|
||||
Tn = ii[WS(is, 8)];
|
||||
To = Tm - Tn;
|
||||
T1i = Tm + Tn;
|
||||
Ty = ii[WS(is, 5)];
|
||||
Tz = ii[WS(is, 6)];
|
||||
TA = Ty - Tz;
|
||||
T1k = Ty + Tz;
|
||||
}
|
||||
Tp = ii[WS(is, 2)];
|
||||
Tq = ii[WS(is, 9)];
|
||||
Tr = Tp - Tq;
|
||||
T1h = Tp + Tq;
|
||||
{
|
||||
E Ts, Tt, Tv, Tw;
|
||||
Ts = ii[WS(is, 4)];
|
||||
Tt = ii[WS(is, 7)];
|
||||
Tu = Ts - Tt;
|
||||
T1j = Ts + Tt;
|
||||
Tv = ii[WS(is, 1)];
|
||||
Tw = ii[WS(is, 10)];
|
||||
Tx = Tv - Tw;
|
||||
T1g = Tv + Tw;
|
||||
}
|
||||
/* Chain heads for the imaginary data, mirroring the real ones above. */
{
|
||||
E TB, TT, T20, TK, T1R;
|
||||
TB = FMA(KP521108558, TA, Tx);
|
||||
TC = FMA(KP715370323, TB, Tu);
|
||||
TT = FNMS(KP521108558, Tr, Tu);
|
||||
TU = FMA(KP715370323, TT, TA);
|
||||
T20 = FNMS(KP342584725, T1i, T1h);
|
||||
T21 = FNMS(KP634356270, T20, T1j);
|
||||
TK = FMA(KP521108558, To, TA);
|
||||
TL = FNMS(KP715370323, TK, Tr);
|
||||
T1R = FNMS(KP342584725, T1j, T1g);
|
||||
T1S = FNMS(KP634356270, T1R, T1h);
|
||||
}
|
||||
{
|
||||
E T1I, T1l, T12, T1z, T1b;
|
||||
T1I = FNMS(KP342584725, T1g, T1i);
|
||||
T1J = FNMS(KP634356270, T1I, T1k);
|
||||
T1l = FNMS(KP342584725, T1k, T1j);
|
||||
T1m = FNMS(KP634356270, T1l, T1i);
|
||||
T12 = FMA(KP521108558, Tu, To);
|
||||
T13 = FMA(KP715370323, T12, Tx);
|
||||
T1z = FNMS(KP342584725, T1h, T1k);
|
||||
T1A = FNMS(KP634356270, T1z, T1g);
|
||||
T1b = FNMS(KP521108558, Tx, Tr);
|
||||
T1c = FNMS(KP715370323, T1b, To);
|
||||
}
|
||||
}
|
||||
/* Output 0 is element 0 plus the five pair sums, per component. */
ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
|
||||
io[0] = T1f + T1g + T1h + T1i + T1j + T1k;
|
||||
/*
 * Every block below finishes one sum-based chain and one
 * difference-based chain, then writes a pair of outputs (k, 11 - k)
 * that share both terms and differ only in FMA versus FNMS.
 */
/* Real outputs 10 and 1. */
{
|
||||
E Tk, TE, Tj, TD, Tl;
|
||||
Tj = FNMS(KP778434453, Ti, T4);
|
||||
Tk = FNMS(KP876768831, Tj, Tg);
|
||||
TD = FMA(KP830830026, TC, Tr);
|
||||
TE = FMA(KP918985947, TD, To);
|
||||
Tl = FNMS(KP959492973, Tk, T1);
|
||||
ro[WS(os, 10)] = FNMS(KP989821441, TE, Tl);
|
||||
ro[WS(os, 1)] = FMA(KP989821441, TE, Tl);
|
||||
}
|
||||
/* Imaginary outputs 1 and 10. */
{
|
||||
E T23, T28, T22, T27, T24;
|
||||
T22 = FNMS(KP778434453, T21, T1g);
|
||||
T23 = FNMS(KP876768831, T22, T1k);
|
||||
T27 = FMA(KP830830026, T26, T1t);
|
||||
T28 = FMA(KP918985947, T27, T1s);
|
||||
T24 = FNMS(KP959492973, T23, T1f);
|
||||
io[WS(os, 1)] = FMA(KP989821441, T28, T24);
|
||||
io[WS(os, 10)] = FNMS(KP989821441, T28, T24);
|
||||
}
|
||||
/* Imaginary outputs 2 and 9. */
{
|
||||
E T1U, T1Z, T1T, T1Y, T1V;
|
||||
T1T = FNMS(KP778434453, T1S, T1k);
|
||||
T1U = FNMS(KP876768831, T1T, T1i);
|
||||
T1Y = FMA(KP830830026, T1X, T1u);
|
||||
T1Z = FNMS(KP918985947, T1Y, T1r);
|
||||
T1V = FNMS(KP959492973, T1U, T1f);
|
||||
io[WS(os, 2)] = FNMS(KP989821441, T1Z, T1V);
|
||||
io[WS(os, 9)] = FMA(KP989821441, T1Z, T1V);
|
||||
}
|
||||
/* Real outputs 2 and 9. */
{
|
||||
E TI, TN, TH, TM, TJ;
|
||||
TH = FNMS(KP778434453, TG, Tg);
|
||||
TI = FNMS(KP876768831, TH, Ta);
|
||||
TM = FMA(KP830830026, TL, Tx);
|
||||
TN = FNMS(KP918985947, TM, Tu);
|
||||
TJ = FNMS(KP959492973, TI, T1);
|
||||
ro[WS(os, 2)] = FNMS(KP989821441, TN, TJ);
|
||||
ro[WS(os, 9)] = FMA(KP989821441, TN, TJ);
|
||||
}
|
||||
/* Real outputs 8 and 3. */
{
|
||||
E TR, TW, TQ, TV, TS;
|
||||
TQ = FNMS(KP778434453, TP, Td);
|
||||
TR = FNMS(KP876768831, TQ, T7);
|
||||
TV = FNMS(KP830830026, TU, To);
|
||||
TW = FNMS(KP918985947, TV, Tx);
|
||||
TS = FNMS(KP959492973, TR, T1);
|
||||
ro[WS(os, 8)] = FNMS(KP989821441, TW, TS);
|
||||
ro[WS(os, 3)] = FMA(KP989821441, TW, TS);
|
||||
}
|
||||
/* Imaginary outputs 3 and 8. */
{
|
||||
E T1L, T1Q, T1K, T1P, T1M;
|
||||
T1K = FNMS(KP778434453, T1J, T1j);
|
||||
T1L = FNMS(KP876768831, T1K, T1h);
|
||||
T1P = FNMS(KP830830026, T1O, T1s);
|
||||
T1Q = FNMS(KP918985947, T1P, T1u);
|
||||
T1M = FNMS(KP959492973, T1L, T1f);
|
||||
io[WS(os, 3)] = FMA(KP989821441, T1Q, T1M);
|
||||
io[WS(os, 8)] = FNMS(KP989821441, T1Q, T1M);
|
||||
}
|
||||
/* Real outputs 4 and 7. */
{
|
||||
E T10, T15, TZ, T14, T11;
|
||||
TZ = FNMS(KP778434453, TY, Ta);
|
||||
T10 = FNMS(KP876768831, TZ, Td);
|
||||
T14 = FNMS(KP830830026, T13, TA);
|
||||
T15 = FMA(KP918985947, T14, Tr);
|
||||
T11 = FNMS(KP959492973, T10, T1);
|
||||
ro[WS(os, 4)] = FNMS(KP989821441, T15, T11);
|
||||
ro[WS(os, 7)] = FMA(KP989821441, T15, T11);
|
||||
}
|
||||
/* Imaginary outputs 4 and 7. */
{
|
||||
E T1C, T1H, T1B, T1G, T1D;
|
||||
T1B = FNMS(KP778434453, T1A, T1i);
|
||||
T1C = FNMS(KP876768831, T1B, T1j);
|
||||
T1G = FNMS(KP830830026, T1F, T1q);
|
||||
T1H = FMA(KP918985947, T1G, T1t);
|
||||
T1D = FNMS(KP959492973, T1C, T1f);
|
||||
io[WS(os, 4)] = FNMS(KP989821441, T1H, T1D);
|
||||
io[WS(os, 7)] = FMA(KP989821441, T1H, T1D);
|
||||
}
|
||||
/* Imaginary outputs 5 and 6. */
{
|
||||
E T1o, T1y, T1n, T1x, T1p;
|
||||
T1n = FNMS(KP778434453, T1m, T1h);
|
||||
T1o = FNMS(KP876768831, T1n, T1g);
|
||||
T1x = FNMS(KP830830026, T1w, T1r);
|
||||
T1y = FNMS(KP918985947, T1x, T1q);
|
||||
T1p = FNMS(KP959492973, T1o, T1f);
|
||||
io[WS(os, 5)] = FMA(KP989821441, T1y, T1p);
|
||||
io[WS(os, 6)] = FNMS(KP989821441, T1y, T1p);
|
||||
}
|
||||
/* Real outputs 6 and 5. */
{
|
||||
E T19, T1e, T18, T1d, T1a;
|
||||
T18 = FNMS(KP778434453, T17, T7);
|
||||
T19 = FNMS(KP876768831, T18, T4);
|
||||
T1d = FNMS(KP830830026, T1c, Tu);
|
||||
T1e = FNMS(KP918985947, T1d, TA);
|
||||
T1a = FNMS(KP959492973, T19, T1);
|
||||
ro[WS(os, 6)] = FNMS(KP989821441, T1e, T1a);
|
||||
ro[WS(os, 5)] = FMA(KP989821441, T1e, T1a);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 11, "n1_11", { 30, 0, 110, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_11) (planner *p) { X(kdft_register) (p, n1_11, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 140 FP additions, 100 FP multiplications,
|
||||
* (or, 60 additions, 20 multiplications, 80 fused multiply/add),
|
||||
* 41 stack variables, 10 constants, and 44 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_11, non-FMA variant: size-11 DFT kernel on split real/imaginary
 * arrays (emitted by gen_notw with -n 11, without -fma).
 *
 * ri, ii   - input real / imaginary arrays (read only), indexed WS(is, k)
 * ro, io   - output real / imaginary arrays, indexed WS(os, k)
 * is, os   - input / output strides passed to WS()
 * v        - iteration count; one 11-point transform per iteration
 * ivs, ovs - added to the input / output pointers after each iteration
 *
 * NOTE(review): R, E, stride, INT, DK, WS, FMA, FNMS, FNMA and
 * MAKE_VOLATILE_STRIDE come from project headers not visible here.
 * FMA(a, b, c) is presumably a*b + c, FNMS(a, b, c) presumably
 * c - a*b and FNMA(a, b, c) presumably -(a*b) - c -- confirm.
 */
static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/*
 * Coefficients for the direct weighted sums below.
 * NOTE(review): they look like the sines and cosines of multiples of
 * 2*pi/11 -- confirm.
 */
DK(KP654860733, +0.654860733945285064056925072466293553183791199);
|
||||
DK(KP142314838, +0.142314838273285140443792668616369668791051361);
|
||||
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
|
||||
DK(KP415415013, +0.415415013001886425529274149229623203524004910);
|
||||
DK(KP841253532, +0.841253532831181168861811648919367717513292498);
|
||||
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
|
||||
DK(KP909631995, +0.909631995354518371411715383079028460060241051);
|
||||
DK(KP281732556, +0.281732556841429697711417915346616899035777899);
|
||||
DK(KP540640817, +0.540640817455597582107635954318691695431770608);
|
||||
DK(KP755749574, +0.755749574354258283774035843972344420179717445);
|
||||
{
|
||||
INT i;
|
||||
/* Counts i down from v; all four data pointers advance after every pass. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
|
||||
/*
 * T1 / TM hold element 0.  For each input pair (k, 11 - k):
 *   real sums         T4, T7, Ta, Td, Tg
 *   real differences  TG, TK, TH, TJ, TI  (higher index minus lower)
 *   imag differences  Tk, Tw, Tn, Tq, Tt  (lower index minus higher)
 *   imag sums         TR, TN, TQ, TO, TP
 */
E T1, TM, T4, TG, Tk, TR, Tw, TN, T7, TK, Ta, TH, Tn, TQ, Td;
|
||||
E TJ, Tq, TO, Tt, TP, Tg, TI;
|
||||
/* Load element 0 and the pairs (1, 10) and (2, 9). */
{
|
||||
E T2, T3, Ti, Tj;
|
||||
T1 = ri[0];
|
||||
TM = ii[0];
|
||||
T2 = ri[WS(is, 1)];
|
||||
T3 = ri[WS(is, 10)];
|
||||
T4 = T2 + T3;
|
||||
TG = T3 - T2;
|
||||
Ti = ii[WS(is, 1)];
|
||||
Tj = ii[WS(is, 10)];
|
||||
Tk = Ti - Tj;
|
||||
TR = Ti + Tj;
|
||||
{
|
||||
E Tu, Tv, T5, T6;
|
||||
Tu = ii[WS(is, 2)];
|
||||
Tv = ii[WS(is, 9)];
|
||||
Tw = Tu - Tv;
|
||||
TN = Tu + Tv;
|
||||
T5 = ri[WS(is, 2)];
|
||||
T6 = ri[WS(is, 9)];
|
||||
T7 = T5 + T6;
|
||||
TK = T6 - T5;
|
||||
}
|
||||
}
|
||||
/* Load the pairs (3, 8), (4, 7) and (5, 6). */
{
|
||||
E T8, T9, To, Tp;
|
||||
T8 = ri[WS(is, 3)];
|
||||
T9 = ri[WS(is, 8)];
|
||||
Ta = T8 + T9;
|
||||
TH = T9 - T8;
|
||||
{
|
||||
E Tl, Tm, Tb, Tc;
|
||||
Tl = ii[WS(is, 3)];
|
||||
Tm = ii[WS(is, 8)];
|
||||
Tn = Tl - Tm;
|
||||
TQ = Tl + Tm;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 7)];
|
||||
Td = Tb + Tc;
|
||||
TJ = Tc - Tb;
|
||||
}
|
||||
To = ii[WS(is, 4)];
|
||||
Tp = ii[WS(is, 7)];
|
||||
Tq = To - Tp;
|
||||
TO = To + Tp;
|
||||
{
|
||||
E Tr, Ts, Te, Tf;
|
||||
Tr = ii[WS(is, 5)];
|
||||
Ts = ii[WS(is, 6)];
|
||||
Tt = Tr - Ts;
|
||||
TP = Tr + Ts;
|
||||
Te = ri[WS(is, 5)];
|
||||
Tf = ri[WS(is, 6)];
|
||||
Tg = Te + Tf;
|
||||
TI = Tf - Te;
|
||||
}
|
||||
}
|
||||
/*
 * Outputs.  Each output pair (k, 11 - k) is "sum-based term" plus or
 * minus "difference-based term": real outputs combine real sums with
 * imaginary differences, imaginary outputs combine imaginary sums
 * with real differences.
 */
{
|
||||
E Tx, Th, TZ, T10;
|
||||
/* Output 0 is element 0 plus the five pair sums, per component. */
ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
|
||||
io[0] = TM + TR + TN + TQ + TO + TP;
|
||||
/* Outputs 4 and 7. */
Tx = FMA(KP755749574, Tk, KP540640817 * Tn) + FNMS(KP909631995, Tt, KP281732556 * Tq) - (KP989821441 * Tw);
|
||||
Th = FMA(KP841253532, Ta, T1) + FNMS(KP959492973, Td, KP415415013 * Tg) + FNMA(KP142314838, T7, KP654860733 * T4);
|
||||
ro[WS(os, 7)] = Th - Tx;
|
||||
ro[WS(os, 4)] = Th + Tx;
|
||||
TZ = FMA(KP755749574, TG, KP540640817 * TH) + FNMS(KP909631995, TI, KP281732556 * TJ) - (KP989821441 * TK);
|
||||
T10 = FMA(KP841253532, TQ, TM) + FNMS(KP959492973, TO, KP415415013 * TP) + FNMA(KP142314838, TN, KP654860733 * TR);
|
||||
io[WS(os, 4)] = TZ + T10;
|
||||
io[WS(os, 7)] = T10 - TZ;
|
||||
/* Outputs 2 and 9. */
{
|
||||
E TX, TY, Tz, Ty;
|
||||
TX = FMA(KP909631995, TG, KP755749574 * TK) + FNMA(KP540640817, TI, KP989821441 * TJ) - (KP281732556 * TH);
|
||||
TY = FMA(KP415415013, TR, TM) + FNMS(KP142314838, TO, KP841253532 * TP) + FNMA(KP959492973, TQ, KP654860733 * TN);
|
||||
io[WS(os, 2)] = TX + TY;
|
||||
io[WS(os, 9)] = TY - TX;
|
||||
Tz = FMA(KP909631995, Tk, KP755749574 * Tw) + FNMA(KP540640817, Tt, KP989821441 * Tq) - (KP281732556 * Tn);
|
||||
Ty = FMA(KP415415013, T4, T1) + FNMS(KP142314838, Td, KP841253532 * Tg) + FNMA(KP959492973, Ta, KP654860733 * T7);
|
||||
ro[WS(os, 9)] = Ty - Tz;
|
||||
ro[WS(os, 2)] = Ty + Tz;
|
||||
}
|
||||
}
|
||||
{
|
||||
E TB, TA, TT, TU;
|
||||
/* Real outputs 1 and 10. */
TB = FMA(KP540640817, Tk, KP909631995 * Tw) + FMA(KP989821441, Tn, KP755749574 * Tq) + (KP281732556 * Tt);
|
||||
TA = FMA(KP841253532, T4, T1) + FNMS(KP959492973, Tg, KP415415013 * T7) + FNMA(KP654860733, Td, KP142314838 * Ta);
|
||||
ro[WS(os, 10)] = TA - TB;
|
||||
ro[WS(os, 1)] = TA + TB;
|
||||
/* Imaginary outputs 1 and 10, then real outputs 3 and 8. */
{
|
||||
E TV, TW, TD, TC;
|
||||
TV = FMA(KP540640817, TG, KP909631995 * TK) + FMA(KP989821441, TH, KP755749574 * TJ) + (KP281732556 * TI);
|
||||
TW = FMA(KP841253532, TR, TM) + FNMS(KP959492973, TP, KP415415013 * TN) + FNMA(KP654860733, TO, KP142314838 * TQ);
|
||||
io[WS(os, 1)] = TV + TW;
|
||||
io[WS(os, 10)] = TW - TV;
|
||||
TD = FMA(KP989821441, Tk, KP540640817 * Tq) + FNMS(KP909631995, Tn, KP755749574 * Tt) - (KP281732556 * Tw);
|
||||
TC = FMA(KP415415013, Ta, T1) + FNMS(KP654860733, Tg, KP841253532 * Td) + FNMA(KP959492973, T7, KP142314838 * T4);
|
||||
ro[WS(os, 8)] = TC - TD;
|
||||
ro[WS(os, 3)] = TC + TD;
|
||||
}
|
||||
/* Imaginary outputs 3 and 8. */
TT = FMA(KP989821441, TG, KP540640817 * TJ) + FNMS(KP909631995, TH, KP755749574 * TI) - (KP281732556 * TK);
|
||||
TU = FMA(KP415415013, TQ, TM) + FNMS(KP654860733, TP, KP841253532 * TO) + FNMA(KP959492973, TN, KP142314838 * TR);
|
||||
io[WS(os, 3)] = TT + TU;
|
||||
io[WS(os, 8)] = TU - TT;
|
||||
/* Outputs 5 and 6. */
{
|
||||
E TL, TS, TF, TE;
|
||||
TL = FMA(KP281732556, TG, KP755749574 * TH) + FNMS(KP909631995, TJ, KP989821441 * TI) - (KP540640817 * TK);
|
||||
TS = FMA(KP841253532, TN, TM) + FNMS(KP142314838, TP, KP415415013 * TO) + FNMA(KP654860733, TQ, KP959492973 * TR);
|
||||
io[WS(os, 5)] = TL + TS;
|
||||
io[WS(os, 6)] = TS - TL;
|
||||
TF = FMA(KP281732556, Tk, KP755749574 * Tn) + FNMS(KP909631995, Tq, KP989821441 * Tt) - (KP540640817 * Tw);
|
||||
TE = FMA(KP841253532, T7, T1) + FNMS(KP142314838, Tg, KP415415013 * Td) + FNMA(KP654860733, Ta, KP959492973 * T4);
|
||||
ro[WS(os, 6)] = TE - TF;
|
||||
ro[WS(os, 5)] = TE + TF;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 11, "n1_11", { 60, 20, 80, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_11) (planner *p) { X(kdft_register) (p, n1_11, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,420 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 24 FP multiplications,
|
||||
* (or, 72 additions, 0 multiplications, 24 fused multiply/add),
|
||||
* 43 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_12, fused-multiply-add variant: size-12 DFT kernel on split
 * real/imaginary arrays (emitted by gen_notw with -fma -n 12).
 *
 * ri, ii   - input real / imaginary arrays (read only), indexed WS(is, k)
 * ro, io   - output real / imaginary arrays, indexed WS(os, k)
 * is, os   - input / output strides passed to WS()
 * v        - iteration count; one 12-point transform per iteration
 * ivs, ovs - added to the input / output pointers after each iteration
 *
 * NOTE(review): R, E, stride, INT, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from project headers not visible here.
 * FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) presumably
 * c - a*b -- confirm against those headers.
 */
static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* The only two constants used: 0.866025... (numerically sqrt(3)/2) and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* Counts i down from v; all four data pointers advance after every pass. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
|
||||
E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1d, TG;
|
||||
E TJ, T1u, T1c, Tl, T1i, TL, TO, T1v, T1h;
|
||||
/*
 * The inputs are consumed in four groups of three, {0,4,8},
 * {6,10,2}, {3,7,11} and {9,1,5}, separately for the real and the
 * imaginary arrays.  Each group yields the sum of its three
 * elements, the first element combined with KP500000000 times the
 * sum of the other two (via FNMS), and the difference of the other
 * two.
 */
/* Real {0,4,8}: sum T5, FNMS term TR, difference TA. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 4)];
|
||||
T3 = ri[WS(is, 8)];
|
||||
T4 = T2 + T3;
|
||||
T5 = T1 + T4;
|
||||
TR = FNMS(KP500000000, T4, T1);
|
||||
TA = T3 - T2;
|
||||
}
|
||||
/* Imaginary {0,4,8}: sum Ts, difference TS, FNMS term Tz. */
{
|
||||
E To, Tp, Tq, Tr;
|
||||
To = ii[0];
|
||||
Tp = ii[WS(is, 4)];
|
||||
Tq = ii[WS(is, 8)];
|
||||
Tr = Tp + Tq;
|
||||
Ts = To + Tr;
|
||||
TS = Tp - Tq;
|
||||
Tz = FNMS(KP500000000, Tr, To);
|
||||
}
|
||||
/* Real {6,10,2}: sum Ta, FNMS term TU, difference TD. */
{
|
||||
E T6, T7, T8, T9;
|
||||
T6 = ri[WS(is, 6)];
|
||||
T7 = ri[WS(is, 10)];
|
||||
T8 = ri[WS(is, 2)];
|
||||
T9 = T7 + T8;
|
||||
Ta = T6 + T9;
|
||||
TU = FNMS(KP500000000, T9, T6);
|
||||
TD = T8 - T7;
|
||||
}
|
||||
/* Imaginary {6,10,2}: sum Tx, difference TV, FNMS term TC. */
{
|
||||
E Tt, Tu, Tv, Tw;
|
||||
Tt = ii[WS(is, 6)];
|
||||
Tu = ii[WS(is, 10)];
|
||||
Tv = ii[WS(is, 2)];
|
||||
Tw = Tu + Tv;
|
||||
Tx = Tt + Tw;
|
||||
TV = Tu - Tv;
|
||||
TC = FNMS(KP500000000, Tw, Tt);
|
||||
}
|
||||
/* Real {3,7,11}: sum Tg, difference T1d, FNMS term TG. */
{
|
||||
E Tc, Td, Te, Tf;
|
||||
Tc = ri[WS(is, 3)];
|
||||
Td = ri[WS(is, 7)];
|
||||
Te = ri[WS(is, 11)];
|
||||
Tf = Td + Te;
|
||||
Tg = Tc + Tf;
|
||||
T1d = Te - Td;
|
||||
TG = FNMS(KP500000000, Tf, Tc);
|
||||
}
|
||||
/* Imaginary {3,7,11}: difference TJ, sum T1u, FNMS term T1c. */
{
|
||||
E T1a, TH, TI, T1b;
|
||||
T1a = ii[WS(is, 3)];
|
||||
TH = ii[WS(is, 7)];
|
||||
TI = ii[WS(is, 11)];
|
||||
T1b = TH + TI;
|
||||
TJ = TH - TI;
|
||||
T1u = T1a + T1b;
|
||||
T1c = FNMS(KP500000000, T1b, T1a);
|
||||
}
|
||||
/* Real {9,1,5}: sum Tl, difference T1i, FNMS term TL. */
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = ri[WS(is, 9)];
|
||||
Ti = ri[WS(is, 1)];
|
||||
Tj = ri[WS(is, 5)];
|
||||
Tk = Ti + Tj;
|
||||
Tl = Th + Tk;
|
||||
T1i = Tj - Ti;
|
||||
TL = FNMS(KP500000000, Tk, Th);
|
||||
}
|
||||
/* Imaginary {9,1,5}: difference TO, sum T1v, FNMS term T1h. */
{
|
||||
E T1f, TM, TN, T1g;
|
||||
T1f = ii[WS(is, 9)];
|
||||
TM = ii[WS(is, 1)];
|
||||
TN = ii[WS(is, 5)];
|
||||
T1g = TM + TN;
|
||||
TO = TM - TN;
|
||||
T1v = T1f + T1g;
|
||||
T1h = FNMS(KP500000000, T1g, T1f);
|
||||
}
|
||||
{
|
||||
E Tb, Tm, T1t, T1w;
|
||||
/* Outputs 0, 6, 3 and 9 are built from the eight group sums only. */
Tb = T5 + Ta;
|
||||
Tm = Tg + Tl;
|
||||
ro[WS(os, 6)] = Tb - Tm;
|
||||
ro[0] = Tb + Tm;
|
||||
{
|
||||
E T1x, T1y, Tn, Ty;
|
||||
T1x = Ts + Tx;
|
||||
T1y = T1u + T1v;
|
||||
io[WS(os, 6)] = T1x - T1y;
|
||||
io[0] = T1x + T1y;
|
||||
Tn = Tg - Tl;
|
||||
Ty = Ts - Tx;
|
||||
io[WS(os, 3)] = Tn + Ty;
|
||||
io[WS(os, 9)] = Ty - Tn;
|
||||
}
|
||||
T1t = T5 - Ta;
|
||||
T1w = T1u - T1v;
|
||||
ro[WS(os, 3)] = T1t - T1w;
|
||||
ro[WS(os, 9)] = T1t + T1w;
|
||||
/*
 * Outputs 1, 7, 10 and 4: each group's difference is folded into
 * its FNMS term with FMA(KP866025403, ...).
 */
{
|
||||
E T11, T1l, T1k, T1m, T14, T18, T17, T19;
|
||||
{
|
||||
E TZ, T10, T1e, T1j;
|
||||
TZ = FMA(KP866025403, TA, Tz);
|
||||
T10 = FMA(KP866025403, TD, TC);
|
||||
T11 = TZ - T10;
|
||||
T1l = TZ + T10;
|
||||
T1e = FMA(KP866025403, T1d, T1c);
|
||||
T1j = FMA(KP866025403, T1i, T1h);
|
||||
T1k = T1e - T1j;
|
||||
T1m = T1e + T1j;
|
||||
}
|
||||
{
|
||||
E T12, T13, T15, T16;
|
||||
T12 = FMA(KP866025403, TJ, TG);
|
||||
T13 = FMA(KP866025403, TO, TL);
|
||||
T14 = T12 - T13;
|
||||
T18 = T12 + T13;
|
||||
T15 = FMA(KP866025403, TS, TR);
|
||||
T16 = FMA(KP866025403, TV, TU);
|
||||
T17 = T15 + T16;
|
||||
T19 = T15 - T16;
|
||||
}
|
||||
io[WS(os, 1)] = T11 - T14;
|
||||
ro[WS(os, 1)] = T19 + T1k;
|
||||
io[WS(os, 7)] = T11 + T14;
|
||||
ro[WS(os, 7)] = T19 - T1k;
|
||||
ro[WS(os, 10)] = T17 - T18;
|
||||
io[WS(os, 10)] = T1l - T1m;
|
||||
ro[WS(os, 4)] = T17 + T18;
|
||||
io[WS(os, 4)] = T1l + T1m;
|
||||
}
|
||||
/*
 * Outputs 5, 11, 2 and 8: same structure as the block above, but
 * the differences are folded in with FNMS instead of FMA.
 */
{
|
||||
E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
|
||||
{
|
||||
E TB, TE, T1o, T1p;
|
||||
TB = FNMS(KP866025403, TA, Tz);
|
||||
TE = FNMS(KP866025403, TD, TC);
|
||||
TF = TB - TE;
|
||||
T1r = TB + TE;
|
||||
T1o = FNMS(KP866025403, T1d, T1c);
|
||||
T1p = FNMS(KP866025403, T1i, T1h);
|
||||
T1q = T1o - T1p;
|
||||
T1s = T1o + T1p;
|
||||
}
|
||||
{
|
||||
E TK, TP, TT, TW;
|
||||
TK = FNMS(KP866025403, TJ, TG);
|
||||
TP = FNMS(KP866025403, TO, TL);
|
||||
TQ = TK - TP;
|
||||
TY = TK + TP;
|
||||
TT = FNMS(KP866025403, TS, TR);
|
||||
TW = FNMS(KP866025403, TV, TU);
|
||||
TX = TT + TW;
|
||||
T1n = TT - TW;
|
||||
}
|
||||
io[WS(os, 5)] = TF - TQ;
|
||||
ro[WS(os, 5)] = T1n + T1q;
|
||||
io[WS(os, 11)] = TF + TQ;
|
||||
ro[WS(os, 11)] = T1n - T1q;
|
||||
ro[WS(os, 2)] = TX - TY;
|
||||
io[WS(os, 2)] = T1r - T1s;
|
||||
ro[WS(os, 8)] = TX + TY;
|
||||
io[WS(os, 8)] = T1r + T1s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 12, "n1_12", { 72, 0, 24, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_12) (planner *p) { X(kdft_register) (p, n1_12, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 16 FP multiplications,
|
||||
* (or, 88 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 43 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_12, non-FMA variant: size-12 DFT kernel on split real/imaginary
 * arrays (emitted by gen_notw with -n 12, without -fma).
 *
 * ri, ii   - input real / imaginary arrays (read only), indexed WS(is, k)
 * ro, io   - output real / imaginary arrays, indexed WS(os, k)
 * is, os   - input / output strides passed to WS()
 * v        - iteration count; one 12-point transform per iteration
 * ivs, ovs - added to the input / output pointers after each iteration
 *
 * NOTE(review): R, E, stride, INT, DK, WS, FNMS and
 * MAKE_VOLATILE_STRIDE come from project headers not visible here.
 * FNMS(a, b, c) is presumably c - a*b -- confirm against those headers.
 */
static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* The only two constants used: 0.866025... (numerically sqrt(3)/2) and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* Counts i down from v; all four data pointers advance after every pass. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
|
||||
E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1a, TG;
|
||||
E TJ, T1u, T1d, Tl, T1f, TL, TO, T1v, T1i;
|
||||
/*
 * The inputs are consumed in four groups of three, {0,4,8},
 * {6,10,2}, {3,7,11} and {9,1,5}, separately for the real and the
 * imaginary arrays.  Each group yields the sum of its three
 * elements, the first element combined with KP500000000 times the
 * sum of the other two (via FNMS), and the difference of the other
 * two already multiplied by KP866025403.
 */
/* Real {0,4,8}: sum T5, FNMS term TR, scaled difference TA. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 4)];
|
||||
T3 = ri[WS(is, 8)];
|
||||
T4 = T2 + T3;
|
||||
T5 = T1 + T4;
|
||||
TR = FNMS(KP500000000, T4, T1);
|
||||
TA = KP866025403 * (T3 - T2);
|
||||
}
|
||||
/* Imaginary {0,4,8}: sum Ts, scaled difference TS, FNMS term Tz. */
{
|
||||
E To, Tp, Tq, Tr;
|
||||
To = ii[0];
|
||||
Tp = ii[WS(is, 4)];
|
||||
Tq = ii[WS(is, 8)];
|
||||
Tr = Tp + Tq;
|
||||
Ts = To + Tr;
|
||||
TS = KP866025403 * (Tp - Tq);
|
||||
Tz = FNMS(KP500000000, Tr, To);
|
||||
}
|
||||
/* Real {6,10,2}: sum Ta, FNMS term TU, scaled difference TD. */
{
|
||||
E T6, T7, T8, T9;
|
||||
T6 = ri[WS(is, 6)];
|
||||
T7 = ri[WS(is, 10)];
|
||||
T8 = ri[WS(is, 2)];
|
||||
T9 = T7 + T8;
|
||||
Ta = T6 + T9;
|
||||
TU = FNMS(KP500000000, T9, T6);
|
||||
TD = KP866025403 * (T8 - T7);
|
||||
}
|
||||
/* Imaginary {6,10,2}: sum Tx, scaled difference TV, FNMS term TC. */
{
|
||||
E Tt, Tu, Tv, Tw;
|
||||
Tt = ii[WS(is, 6)];
|
||||
Tu = ii[WS(is, 10)];
|
||||
Tv = ii[WS(is, 2)];
|
||||
Tw = Tu + Tv;
|
||||
Tx = Tt + Tw;
|
||||
TV = KP866025403 * (Tu - Tv);
|
||||
TC = FNMS(KP500000000, Tw, Tt);
|
||||
}
|
||||
/* Real {3,7,11}: sum Tg, scaled difference T1a, FNMS term TG. */
{
|
||||
E Tc, Td, Te, Tf;
|
||||
Tc = ri[WS(is, 3)];
|
||||
Td = ri[WS(is, 7)];
|
||||
Te = ri[WS(is, 11)];
|
||||
Tf = Td + Te;
|
||||
Tg = Tc + Tf;
|
||||
T1a = KP866025403 * (Te - Td);
|
||||
TG = FNMS(KP500000000, Tf, Tc);
|
||||
}
|
||||
/* Imaginary {3,7,11}: scaled difference TJ, sum T1u, FNMS term T1d. */
{
|
||||
E T1b, TH, TI, T1c;
|
||||
T1b = ii[WS(is, 3)];
|
||||
TH = ii[WS(is, 7)];
|
||||
TI = ii[WS(is, 11)];
|
||||
T1c = TH + TI;
|
||||
TJ = KP866025403 * (TH - TI);
|
||||
T1u = T1b + T1c;
|
||||
T1d = FNMS(KP500000000, T1c, T1b);
|
||||
}
|
||||
/* Real {9,1,5}: sum Tl, scaled difference T1f, FNMS term TL. */
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = ri[WS(is, 9)];
|
||||
Ti = ri[WS(is, 1)];
|
||||
Tj = ri[WS(is, 5)];
|
||||
Tk = Ti + Tj;
|
||||
Tl = Th + Tk;
|
||||
T1f = KP866025403 * (Tj - Ti);
|
||||
TL = FNMS(KP500000000, Tk, Th);
|
||||
}
|
||||
/* Imaginary {9,1,5}: scaled difference TO, sum T1v, FNMS term T1i. */
{
|
||||
E T1g, TM, TN, T1h;
|
||||
T1g = ii[WS(is, 9)];
|
||||
TM = ii[WS(is, 1)];
|
||||
TN = ii[WS(is, 5)];
|
||||
T1h = TM + TN;
|
||||
TO = KP866025403 * (TM - TN);
|
||||
T1v = T1g + T1h;
|
||||
T1i = FNMS(KP500000000, T1h, T1g);
|
||||
}
|
||||
{
|
||||
E Tb, Tm, T1t, T1w;
|
||||
/* Outputs 0, 6, 3 and 9 are built from the eight group sums only. */
Tb = T5 + Ta;
|
||||
Tm = Tg + Tl;
|
||||
ro[WS(os, 6)] = Tb - Tm;
|
||||
ro[0] = Tb + Tm;
|
||||
{
|
||||
E T1x, T1y, Tn, Ty;
|
||||
T1x = Ts + Tx;
|
||||
T1y = T1u + T1v;
|
||||
io[WS(os, 6)] = T1x - T1y;
|
||||
io[0] = T1x + T1y;
|
||||
Tn = Tg - Tl;
|
||||
Ty = Ts - Tx;
|
||||
io[WS(os, 3)] = Tn + Ty;
|
||||
io[WS(os, 9)] = Ty - Tn;
|
||||
}
|
||||
T1t = T5 - Ta;
|
||||
T1w = T1u - T1v;
|
||||
ro[WS(os, 3)] = T1t - T1w;
|
||||
ro[WS(os, 9)] = T1t + T1w;
|
||||
/*
 * Outputs 1, 7, 10 and 4: each group's scaled difference is added
 * to its FNMS term.
 */
{
|
||||
E T11, T1l, T1k, T1m, T14, T18, T17, T19;
|
||||
{
|
||||
E TZ, T10, T1e, T1j;
|
||||
TZ = TA + Tz;
|
||||
T10 = TD + TC;
|
||||
T11 = TZ - T10;
|
||||
T1l = TZ + T10;
|
||||
T1e = T1a + T1d;
|
||||
T1j = T1f + T1i;
|
||||
T1k = T1e - T1j;
|
||||
T1m = T1e + T1j;
|
||||
}
|
||||
{
|
||||
E T12, T13, T15, T16;
|
||||
T12 = TG + TJ;
|
||||
T13 = TL + TO;
|
||||
T14 = T12 - T13;
|
||||
T18 = T12 + T13;
|
||||
T15 = TR + TS;
|
||||
T16 = TU + TV;
|
||||
T17 = T15 + T16;
|
||||
T19 = T15 - T16;
|
||||
}
|
||||
io[WS(os, 1)] = T11 - T14;
|
||||
ro[WS(os, 1)] = T19 + T1k;
|
||||
io[WS(os, 7)] = T11 + T14;
|
||||
ro[WS(os, 7)] = T19 - T1k;
|
||||
ro[WS(os, 10)] = T17 - T18;
|
||||
io[WS(os, 10)] = T1l - T1m;
|
||||
ro[WS(os, 4)] = T17 + T18;
|
||||
io[WS(os, 4)] = T1l + T1m;
|
||||
}
|
||||
/*
 * Outputs 5, 11, 2 and 8: same structure as the block above, but
 * the scaled differences are subtracted instead of added.
 */
{
|
||||
E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
|
||||
{
|
||||
E TB, TE, T1o, T1p;
|
||||
TB = Tz - TA;
|
||||
TE = TC - TD;
|
||||
TF = TB - TE;
|
||||
T1r = TB + TE;
|
||||
T1o = T1d - T1a;
|
||||
T1p = T1i - T1f;
|
||||
T1q = T1o - T1p;
|
||||
T1s = T1o + T1p;
|
||||
}
|
||||
{
|
||||
E TK, TP, TT, TW;
|
||||
TK = TG - TJ;
|
||||
TP = TL - TO;
|
||||
TQ = TK - TP;
|
||||
TY = TK + TP;
|
||||
TT = TR - TS;
|
||||
TW = TU - TV;
|
||||
TX = TT + TW;
|
||||
T1n = TT - TW;
|
||||
}
|
||||
io[WS(os, 5)] = TF - TQ;
|
||||
ro[WS(os, 5)] = T1n + T1q;
|
||||
io[WS(os, 11)] = TF + TQ;
|
||||
ro[WS(os, 11)] = T1n - T1q;
|
||||
ro[WS(os, 2)] = TX - TY;
|
||||
io[WS(os, 2)] = T1r - T1s;
|
||||
ro[WS(os, 8)] = TX + TY;
|
||||
io[WS(os, 8)] = T1r + T1s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 12, "n1_12", { 88, 8, 8, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_12) (planner *p) { X(kdft_register) (p, n1_12, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,681 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 176 FP additions, 114 FP multiplications,
|
||||
* (or, 62 additions, 0 multiplications, 114 fused multiply/add),
|
||||
* 76 stack variables, 25 constants, and 52 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_13, FMA variant: size-13 kernel emitted by gen_notw (see the
 * "Generated by" line above: -fma ... -n 13).
 *
 * ri, ii  real / imaginary input arrays, read at WS(is, k) for k = 0..12
 * ro, io  real / imaginary output arrays, written at WS(os, k) for k = 0..12
 * v       number of transforms; after each one ri/ii advance by ivs and
 *         ro/io advance by ovs
 *
 * Within one iteration all 26 input values are loaded before the first
 * output is stored.
 *
 * NOTE(review): DK, E, FMA/FMS/FNMS, WS and MAKE_VOLATILE_STRIDE come from
 * headers outside this file; their exact definitions are not visible here.
 */
static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named numeric constants used below (e.g. KP500000000 = 1/2,
 * KP866025403 = sqrt(3)/2, KP083333333 = 1/12). */
DK(KP875502302, +0.875502302409147941146295545768755143177842006);
|
||||
DK(KP520028571, +0.520028571888864619117130500499232802493238139);
|
||||
DK(KP968287244, +0.968287244361984016049539446938120421179794516);
|
||||
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
|
||||
DK(KP600477271, +0.600477271932665282925769253334763009352012849);
|
||||
DK(KP957805992, +0.957805992594665126462521754605754580515587217);
|
||||
DK(KP516520780, +0.516520780623489722840901288569017135705033622);
|
||||
DK(KP581704778, +0.581704778510515730456870384989698884939833902);
|
||||
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
|
||||
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
|
||||
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
|
||||
DK(KP301479260, +0.301479260047709873958013540496673347309208464);
|
||||
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
|
||||
DK(KP859542535, +0.859542535098774820163672132761689612766401925);
|
||||
DK(KP514918778, +0.514918778086315755491789696138117261566051239);
|
||||
DK(KP522026385, +0.522026385161275033714027226654165028300441940);
|
||||
DK(KP853480001, +0.853480001859823990758994934970528322872359049);
|
||||
DK(KP612264650, +0.612264650376756543746494474777125408779395514);
|
||||
DK(KP038632954, +0.038632954644348171955506895830342264440241080);
|
||||
DK(KP302775637, +0.302775637731994646559610633735247973125648287);
|
||||
DK(KP769338817, +0.769338817572980603471413688209101117038278899);
|
||||
DK(KP686558370, +0.686558370781754340655719594850823015421401653);
|
||||
DK(KP226109445, +0.226109445035782405468510155372505010481906348);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One pass per transform; the increment clause steps all four pointers. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
|
||||
E T1, T1P, T2n, T2o, To, TH, T2h, T2k, TB, TE, Tw, TF, T2c, T2j, T1j;
|
||||
E T1m, T12, T1f, T21, T24, T1U, T27, T1d, T1g, T1Y, T25;
|
||||
/* Element 0 of the real and imaginary inputs. */
T1 = ri[0];
|
||||
T1P = ii[0];
|
||||
/* Load real inputs 1..12 and reduce them to the sum/difference terms
 * (To, TH, Tw, TB, TE, TF, T2c, T2h, T2j, T2k, T2n, T2o) used later. */
{
|
||||
E Tf, T2d, Tb, Ty, Tq, T6, Tx, Tr, Ti, Tt, Tl, Tu, Tm, T2e, Td;
|
||||
E Te, Tc, Tn;
|
||||
Td = ri[WS(is, 8)];
|
||||
Te = ri[WS(is, 5)];
|
||||
Tf = Td + Te;
|
||||
T2d = Td - Te;
|
||||
{
|
||||
E T7, T8, T9, Ta;
|
||||
T7 = ri[WS(is, 12)];
|
||||
T8 = ri[WS(is, 10)];
|
||||
T9 = ri[WS(is, 4)];
|
||||
Ta = T8 + T9;
|
||||
Tb = T7 + Ta;
|
||||
Ty = FMS(KP500000000, Ta, T7);
|
||||
Tq = T8 - T9;
|
||||
}
|
||||
{
|
||||
E T2, T3, T4, T5;
|
||||
T2 = ri[WS(is, 1)];
|
||||
T3 = ri[WS(is, 3)];
|
||||
T4 = ri[WS(is, 9)];
|
||||
T5 = T3 + T4;
|
||||
T6 = T2 + T5;
|
||||
Tx = FNMS(KP500000000, T5, T2);
|
||||
Tr = T4 - T3;
|
||||
}
|
||||
{
|
||||
E Tg, Th, Tj, Tk;
|
||||
Tg = ri[WS(is, 11)];
|
||||
Th = ri[WS(is, 6)];
|
||||
Ti = Tg + Th;
|
||||
Tt = Tg - Th;
|
||||
Tj = ri[WS(is, 7)];
|
||||
Tk = ri[WS(is, 2)];
|
||||
Tl = Tj + Tk;
|
||||
Tu = Tj - Tk;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T2e = Tt + Tu;
|
||||
T2n = T6 - Tb;
|
||||
T2o = T2d + T2e;
|
||||
Tc = T6 + Tb;
|
||||
Tn = Tf + Tm;
|
||||
/* To is the sum of real inputs 1..12. */
To = Tc + Tn;
|
||||
TH = Tc - Tn;
|
||||
{
|
||||
E T2f, T2g, Tz, TA;
|
||||
T2f = FNMS(KP500000000, T2e, T2d);
|
||||
T2g = Tr + Tq;
|
||||
T2h = FMA(KP866025403, T2g, T2f);
|
||||
T2k = FNMS(KP866025403, T2g, T2f);
|
||||
Tz = Tx - Ty;
|
||||
TA = FNMS(KP500000000, Tm, Tf);
|
||||
TB = Tz + TA;
|
||||
TE = Tz - TA;
|
||||
}
|
||||
{
|
||||
E Ts, Tv, T2a, T2b;
|
||||
Ts = Tq - Tr;
|
||||
Tv = Tt - Tu;
|
||||
Tw = Ts + Tv;
|
||||
TF = Ts - Tv;
|
||||
T2a = Tx + Ty;
|
||||
T2b = Ti - Tl;
|
||||
T2c = FMA(KP866025403, T2b, T2a);
|
||||
T2j = FNMS(KP866025403, T2b, T2a);
|
||||
}
|
||||
}
|
||||
/* Same reduction for imaginary inputs 1..12, producing T1U, T27, T1Y,
 * T21, T24, T25, T12, T1d, T1f, T1g, T1j, T1m. */
{
|
||||
E TM, T1R, T10, T1l, T18, TX, T1k, T15, TP, T1a, TS, T1b, TT, T1S, TK;
|
||||
E TL, TU, T11;
|
||||
TK = ii[WS(is, 8)];
|
||||
TL = ii[WS(is, 5)];
|
||||
TM = TK - TL;
|
||||
T1R = TK + TL;
|
||||
{
|
||||
E T16, TY, TZ, T17;
|
||||
T16 = ii[WS(is, 12)];
|
||||
TY = ii[WS(is, 10)];
|
||||
TZ = ii[WS(is, 4)];
|
||||
T17 = TY + TZ;
|
||||
T10 = TY - TZ;
|
||||
T1l = T16 + T17;
|
||||
T18 = FMS(KP500000000, T17, T16);
|
||||
}
|
||||
{
|
||||
E T13, TV, TW, T14;
|
||||
T13 = ii[WS(is, 1)];
|
||||
TV = ii[WS(is, 9)];
|
||||
TW = ii[WS(is, 3)];
|
||||
T14 = TW + TV;
|
||||
TX = TV - TW;
|
||||
T1k = T13 + T14;
|
||||
T15 = FNMS(KP500000000, T14, T13);
|
||||
}
|
||||
{
|
||||
E TN, TO, TQ, TR;
|
||||
TN = ii[WS(is, 11)];
|
||||
TO = ii[WS(is, 6)];
|
||||
TP = TN - TO;
|
||||
T1a = TN + TO;
|
||||
TQ = ii[WS(is, 7)];
|
||||
TR = ii[WS(is, 2)];
|
||||
TS = TQ - TR;
|
||||
T1b = TQ + TR;
|
||||
}
|
||||
TT = TP + TS;
|
||||
T1S = T1a + T1b;
|
||||
T1j = TM + TT;
|
||||
T1m = T1k - T1l;
|
||||
TU = FNMS(KP500000000, TT, TM);
|
||||
T11 = TX + T10;
|
||||
T12 = FMA(KP866025403, T11, TU);
|
||||
T1f = FNMS(KP866025403, T11, TU);
|
||||
{
|
||||
E T1Z, T20, T1Q, T1T;
|
||||
T1Z = T15 - T18;
|
||||
T20 = FNMS(KP500000000, T1S, T1R);
|
||||
T21 = T1Z + T20;
|
||||
T24 = T1Z - T20;
|
||||
T1Q = T1k + T1l;
|
||||
T1T = T1R + T1S;
|
||||
/* T1U is the sum of imaginary inputs 1..12. */
T1U = T1Q + T1T;
|
||||
T27 = T1Q - T1T;
|
||||
}
|
||||
{
|
||||
E T19, T1c, T1W, T1X;
|
||||
T19 = T15 + T18;
|
||||
T1c = T1a - T1b;
|
||||
T1d = FMA(KP866025403, T1c, T19);
|
||||
T1g = FNMS(KP866025403, T1c, T19);
|
||||
T1W = T10 - TX;
|
||||
T1X = TP - TS;
|
||||
T1Y = T1W + T1X;
|
||||
T25 = T1W - T1X;
|
||||
}
|
||||
}
|
||||
/* Output 0 is the plain sum of all 13 inputs (real and imaginary parts). */
ro[0] = T1 + To;
|
||||
io[0] = T1P + T1U;
|
||||
/* Real outputs 1..12, stored in pairs (1,12), (9,3), (8,5), (11,6),
 * (7,2), (10,4). */
{
|
||||
E T1z, T1J, T1G, T1H, T1w, T1I, T1n, T1i, T1s, T1E, TD, T1D, TI, T1r, T1e;
|
||||
E T1h;
|
||||
{
|
||||
E T1x, T1y, T1u, T1v;
|
||||
T1x = FNMS(KP226109445, Tw, TB);
|
||||
T1y = FMA(KP686558370, TE, TF);
|
||||
T1z = FNMS(KP769338817, T1y, T1x);
|
||||
T1J = FMA(KP769338817, T1y, T1x);
|
||||
T1G = FMA(KP302775637, T1j, T1m);
|
||||
T1u = FNMS(KP038632954, T12, T1d);
|
||||
T1v = FNMS(KP612264650, T1f, T1g);
|
||||
T1H = FNMS(KP853480001, T1v, T1u);
|
||||
T1w = FMA(KP853480001, T1v, T1u);
|
||||
T1I = FNMS(KP522026385, T1H, T1G);
|
||||
}
|
||||
T1n = FNMS(KP302775637, T1m, T1j);
|
||||
T1e = FMA(KP038632954, T1d, T12);
|
||||
T1h = FMA(KP612264650, T1g, T1f);
|
||||
T1i = FNMS(KP853480001, T1h, T1e);
|
||||
T1s = FNMS(KP522026385, T1i, T1n);
|
||||
T1E = FMA(KP853480001, T1h, T1e);
|
||||
{
|
||||
E TG, T1q, Tp, TC, T1p;
|
||||
TG = FNMS(KP514918778, TF, TE);
|
||||
T1q = FNMS(KP859542535, TG, TH);
|
||||
Tp = FNMS(KP083333333, To, T1);
|
||||
TC = FMA(KP301479260, TB, Tw);
|
||||
T1p = FNMS(KP251768516, TC, Tp);
|
||||
TD = FMA(KP503537032, TC, Tp);
|
||||
T1D = FNMS(KP300462606, T1q, T1p);
|
||||
TI = FMA(KP581704778, TH, TG);
|
||||
T1r = FMA(KP300462606, T1q, T1p);
|
||||
}
|
||||
{
|
||||
E TJ, T1o, T1L, T1M;
|
||||
TJ = FMA(KP516520780, TI, TD);
|
||||
T1o = FMA(KP957805992, T1n, T1i);
|
||||
ro[WS(os, 1)] = FNMS(KP600477271, T1o, TJ);
|
||||
ro[WS(os, 12)] = FMA(KP600477271, T1o, TJ);
|
||||
{
|
||||
E T1t, T1A, T1N, T1O;
|
||||
T1t = FNMS(KP575140729, T1s, T1r);
|
||||
T1A = FMA(KP968287244, T1z, T1w);
|
||||
ro[WS(os, 9)] = FNMS(KP520028571, T1A, T1t);
|
||||
ro[WS(os, 3)] = FMA(KP520028571, T1A, T1t);
|
||||
T1N = FNMS(KP516520780, TI, TD);
|
||||
T1O = FMA(KP957805992, T1G, T1H);
|
||||
ro[WS(os, 8)] = FNMS(KP600477271, T1O, T1N);
|
||||
ro[WS(os, 5)] = FMA(KP600477271, T1O, T1N);
|
||||
}
|
||||
T1L = FNMS(KP520028571, T1E, T1D);
|
||||
T1M = FNMS(KP875502302, T1J, T1I);
|
||||
ro[WS(os, 11)] = FNMS(KP575140729, T1M, T1L);
|
||||
ro[WS(os, 6)] = FMA(KP575140729, T1M, T1L);
|
||||
{
|
||||
E T1F, T1K, T1B, T1C;
|
||||
T1F = FMA(KP520028571, T1E, T1D);
|
||||
T1K = FMA(KP875502302, T1J, T1I);
|
||||
ro[WS(os, 7)] = FNMS(KP575140729, T1K, T1F);
|
||||
ro[WS(os, 2)] = FMA(KP575140729, T1K, T1F);
|
||||
T1B = FMA(KP575140729, T1s, T1r);
|
||||
T1C = FNMS(KP968287244, T1z, T1w);
|
||||
ro[WS(os, 10)] = FNMS(KP520028571, T1C, T1B);
|
||||
ro[WS(os, 4)] = FMA(KP520028571, T1C, T1B);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Imaginary outputs 1..12, stored in pairs (5,8), (1,12), (6,11),
 * (2,7), (4,10), (3,9). */
{
|
||||
E T2F, T2N, T2v, T2u, T2A, T2K, T2p, T2m, T2C, T2M, T23, T2J, T28, T2z, T2i;
|
||||
E T2l;
|
||||
{
|
||||
E T2D, T2E, T2s, T2t;
|
||||
T2D = FNMS(KP226109445, T1Y, T21);
|
||||
T2E = FMA(KP686558370, T24, T25);
|
||||
T2F = FNMS(KP769338817, T2E, T2D);
|
||||
T2N = FMA(KP769338817, T2E, T2D);
|
||||
T2v = FNMS(KP302775637, T2n, T2o);
|
||||
T2s = FMA(KP038632954, T2c, T2h);
|
||||
T2t = FMA(KP612264650, T2j, T2k);
|
||||
T2u = FNMS(KP853480001, T2t, T2s);
|
||||
T2A = FNMS(KP522026385, T2u, T2v);
|
||||
T2K = FMA(KP853480001, T2t, T2s);
|
||||
}
|
||||
T2p = FMA(KP302775637, T2o, T2n);
|
||||
T2i = FNMS(KP038632954, T2h, T2c);
|
||||
T2l = FNMS(KP612264650, T2k, T2j);
|
||||
T2m = FNMS(KP853480001, T2l, T2i);
|
||||
T2C = FMA(KP853480001, T2l, T2i);
|
||||
T2M = FNMS(KP522026385, T2m, T2p);
|
||||
{
|
||||
E T26, T2y, T1V, T22, T2x;
|
||||
T26 = FNMS(KP514918778, T25, T24);
|
||||
T2y = FNMS(KP859542535, T26, T27);
|
||||
T1V = FNMS(KP083333333, T1U, T1P);
|
||||
T22 = FMA(KP301479260, T21, T1Y);
|
||||
T2x = FNMS(KP251768516, T22, T1V);
|
||||
T23 = FMA(KP503537032, T22, T1V);
|
||||
T2J = FNMS(KP300462606, T2y, T2x);
|
||||
T28 = FMA(KP581704778, T27, T26);
|
||||
T2z = FMA(KP300462606, T2y, T2x);
|
||||
}
|
||||
{
|
||||
E T29, T2q, T2L, T2O;
|
||||
T29 = FNMS(KP516520780, T28, T23);
|
||||
T2q = FMA(KP957805992, T2p, T2m);
|
||||
io[WS(os, 5)] = FNMS(KP600477271, T2q, T29);
|
||||
io[WS(os, 8)] = FMA(KP600477271, T2q, T29);
|
||||
{
|
||||
E T2r, T2w, T2P, T2Q;
|
||||
T2r = FMA(KP516520780, T28, T23);
|
||||
T2w = FMA(KP957805992, T2v, T2u);
|
||||
io[WS(os, 1)] = FMA(KP600477271, T2w, T2r);
|
||||
io[WS(os, 12)] = FNMS(KP600477271, T2w, T2r);
|
||||
T2P = FMA(KP520028571, T2K, T2J);
|
||||
T2Q = FMA(KP875502302, T2N, T2M);
|
||||
io[WS(os, 6)] = FNMS(KP575140729, T2Q, T2P);
|
||||
io[WS(os, 11)] = FMA(KP575140729, T2Q, T2P);
|
||||
}
|
||||
T2L = FNMS(KP520028571, T2K, T2J);
|
||||
T2O = FNMS(KP875502302, T2N, T2M);
|
||||
io[WS(os, 2)] = FNMS(KP575140729, T2O, T2L);
|
||||
io[WS(os, 7)] = FMA(KP575140729, T2O, T2L);
|
||||
{
|
||||
E T2H, T2I, T2B, T2G;
|
||||
T2H = FNMS(KP575140729, T2A, T2z);
|
||||
T2I = FMA(KP968287244, T2F, T2C);
|
||||
io[WS(os, 4)] = FNMS(KP520028571, T2I, T2H);
|
||||
io[WS(os, 10)] = FMA(KP520028571, T2I, T2H);
|
||||
T2B = FMA(KP575140729, T2A, T2z);
|
||||
T2G = FNMS(KP968287244, T2F, T2C);
|
||||
io[WS(os, 3)] = FNMS(KP520028571, T2G, T2B);
|
||||
io[WS(os, 9)] = FMA(KP520028571, T2G, T2B);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA variant: 13 and "n1_13" are the size and name;
 * { 62, 0, 114, 0 } repeats the "62 additions, 0 multiplications, 114 fused
 * multiply/add" figures from this variant's header comment. The meaning of
 * the fourth count, GENUS and the trailing zeros is set by kdft_desc, which
 * is declared outside this file. */
static const kdft_desc desc = { 13, "n1_13", { 62, 0, 114, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point (FMA branch of the #if): passes the n1_13 kernel
 * and its descriptor to X(kdft_register) for planner p. X() is a macro
 * defined elsewhere (presumably a symbol-name prefixer -- confirm). */
void X(codelet_n1_13) (planner *p) { X(kdft_register) (p, n1_13, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 176 FP additions, 68 FP multiplications,
|
||||
* (or, 138 additions, 30 multiplications, 38 fused multiply/add),
|
||||
* 71 stack variables, 20 constants, and 52 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_13, non-FMA variant (the #else branch): size-13 kernel emitted by
 * gen_notw without -fma (see the "Generated by" line above).
 *
 * ri, ii  real / imaginary input arrays, read at WS(is, k) for k = 0..12
 * ro, io  real / imaginary output arrays, written at WS(os, k) for k = 0..12
 * v       number of transforms; after each one ri/ii advance by ivs and
 *         ro/io advance by ovs
 *
 * Within one iteration all 26 input values are loaded before the first
 * output is stored. Unlike the FMA variant, the imaginary outputs 1..12 are
 * stored before the real ones.
 *
 * NOTE(review): DK, E, FMA/FMS/FNMS, WS and MAKE_VOLATILE_STRIDE come from
 * headers outside this file; their exact definitions are not visible here.
 */
static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named numeric constants used below (e.g. KP2_000000000 = 2,
 * KP1_732050807 = sqrt(3), KP866025403 = sqrt(3)/2, KP083333333 = 1/12). */
DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
|
||||
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
|
||||
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
|
||||
DK(KP075902986, +0.075902986037193865983102897245103540356428373);
|
||||
DK(KP132983124, +0.132983124607418643793760531921092974399165133);
|
||||
DK(KP258260390, +0.258260390311744861420450644284508567852516811);
|
||||
DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
|
||||
DK(KP300238635, +0.300238635966332641462884626667381504676006424);
|
||||
DK(KP011599105, +0.011599105605768290721655456654083252189827041);
|
||||
DK(KP156891391, +0.156891391051584611046832726756003269660212636);
|
||||
DK(KP256247671, +0.256247671582936600958684654061725059144125175);
|
||||
DK(KP174138601, +0.174138601152135905005660794929264742616964676);
|
||||
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
|
||||
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
|
||||
DK(KP113854479, +0.113854479055790798974654345867655310534642560);
|
||||
DK(KP265966249, +0.265966249214837287587521063842185948798330267);
|
||||
DK(KP387390585, +0.387390585467617292130675966426762851778775217);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One pass per transform; the increment clause steps all four pointers. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
|
||||
E T1, T1q, Tt, Tu, To, T22, T20, T24, TF, TH, TA, TI, T1X, T25, T2a;
|
||||
E T2d, T18, T1n, T2k, T2n, T1l, T1r, T1f, T1o, T2h, T2m;
|
||||
/* Element 0 of the real and imaginary inputs. */
T1 = ri[0];
|
||||
T1q = ii[0];
|
||||
/* Load real inputs 1..12 and reduce them to the sum/difference terms
 * (To, T22, T20, T24, T1X, T25, Tt, Tu, TA, TF, TH, TI) used later. */
{
|
||||
E Tf, Tp, Tb, TC, Tx, T6, TB, Tw, Ti, Tq, Tl, Tr, Tm, Ts, Td;
|
||||
E Te, Tc, Tn;
|
||||
Td = ri[WS(is, 8)];
|
||||
Te = ri[WS(is, 5)];
|
||||
Tf = Td + Te;
|
||||
Tp = Td - Te;
|
||||
{
|
||||
E T7, T8, T9, Ta;
|
||||
T7 = ri[WS(is, 12)];
|
||||
T8 = ri[WS(is, 10)];
|
||||
T9 = ri[WS(is, 4)];
|
||||
Ta = T8 + T9;
|
||||
Tb = T7 + Ta;
|
||||
TC = T8 - T9;
|
||||
Tx = FNMS(KP500000000, Ta, T7);
|
||||
}
|
||||
{
|
||||
E T2, T3, T4, T5;
|
||||
T2 = ri[WS(is, 1)];
|
||||
T3 = ri[WS(is, 3)];
|
||||
T4 = ri[WS(is, 9)];
|
||||
T5 = T3 + T4;
|
||||
T6 = T2 + T5;
|
||||
TB = T3 - T4;
|
||||
Tw = FNMS(KP500000000, T5, T2);
|
||||
}
|
||||
{
|
||||
E Tg, Th, Tj, Tk;
|
||||
Tg = ri[WS(is, 11)];
|
||||
Th = ri[WS(is, 6)];
|
||||
Ti = Tg + Th;
|
||||
Tq = Tg - Th;
|
||||
Tj = ri[WS(is, 7)];
|
||||
Tk = ri[WS(is, 2)];
|
||||
Tl = Tj + Tk;
|
||||
Tr = Tj - Tk;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
Ts = Tq + Tr;
|
||||
Tt = Tp + Ts;
|
||||
Tu = T6 - Tb;
|
||||
Tc = T6 + Tb;
|
||||
Tn = Tf + Tm;
|
||||
/* To is the sum of real inputs 1..12. */
To = Tc + Tn;
|
||||
T22 = KP300462606 * (Tc - Tn);
|
||||
{
|
||||
E T1Y, T1Z, TD, TE;
|
||||
T1Y = TB + TC;
|
||||
T1Z = Tq - Tr;
|
||||
T20 = T1Y - T1Z;
|
||||
T24 = T1Y + T1Z;
|
||||
TD = KP866025403 * (TB - TC);
|
||||
TE = FNMS(KP500000000, Ts, Tp);
|
||||
TF = TD - TE;
|
||||
TH = TD + TE;
|
||||
}
|
||||
{
|
||||
E Ty, Tz, T1V, T1W;
|
||||
Ty = Tw - Tx;
|
||||
Tz = KP866025403 * (Ti - Tl);
|
||||
TA = Ty + Tz;
|
||||
TI = Ty - Tz;
|
||||
T1V = Tw + Tx;
|
||||
T1W = FNMS(KP500000000, Tm, Tf);
|
||||
T1X = T1V - T1W;
|
||||
T25 = T1V + T1W;
|
||||
}
|
||||
}
|
||||
/* Same reduction for imaginary inputs 1..12, producing T1r, T1l, T1f,
 * T1o, T18, T1n, T2a, T2d, T2h, T2k, T2m, T2n. */
{
|
||||
E TZ, T2b, TV, T1i, T1a, TQ, T1h, T19, T12, T1d, T15, T1c, T16, T2c, TX;
|
||||
E TY, TW, T17;
|
||||
TX = ii[WS(is, 8)];
|
||||
TY = ii[WS(is, 5)];
|
||||
TZ = TX + TY;
|
||||
T2b = TX - TY;
|
||||
{
|
||||
E TR, TS, TT, TU;
|
||||
TR = ii[WS(is, 12)];
|
||||
TS = ii[WS(is, 10)];
|
||||
TT = ii[WS(is, 4)];
|
||||
TU = TS + TT;
|
||||
TV = FNMS(KP500000000, TU, TR);
|
||||
T1i = TR + TU;
|
||||
T1a = TS - TT;
|
||||
}
|
||||
{
|
||||
E TM, TN, TO, TP;
|
||||
TM = ii[WS(is, 1)];
|
||||
TN = ii[WS(is, 3)];
|
||||
TO = ii[WS(is, 9)];
|
||||
TP = TN + TO;
|
||||
TQ = FNMS(KP500000000, TP, TM);
|
||||
T1h = TM + TP;
|
||||
T19 = TN - TO;
|
||||
}
|
||||
{
|
||||
E T10, T11, T13, T14;
|
||||
T10 = ii[WS(is, 11)];
|
||||
T11 = ii[WS(is, 6)];
|
||||
T12 = T10 + T11;
|
||||
T1d = T10 - T11;
|
||||
T13 = ii[WS(is, 7)];
|
||||
T14 = ii[WS(is, 2)];
|
||||
T15 = T13 + T14;
|
||||
T1c = T13 - T14;
|
||||
}
|
||||
T16 = T12 + T15;
|
||||
T2c = T1d + T1c;
|
||||
T2a = T1h - T1i;
|
||||
T2d = T2b + T2c;
|
||||
TW = TQ + TV;
|
||||
T17 = FNMS(KP500000000, T16, TZ);
|
||||
T18 = TW - T17;
|
||||
T1n = TW + T17;
|
||||
{
|
||||
E T2i, T2j, T1j, T1k;
|
||||
T2i = TQ - TV;
|
||||
T2j = KP866025403 * (T15 - T12);
|
||||
T2k = T2i + T2j;
|
||||
T2n = T2i - T2j;
|
||||
T1j = T1h + T1i;
|
||||
T1k = TZ + T16;
|
||||
T1l = KP300462606 * (T1j - T1k);
|
||||
/* T1r is the sum of imaginary inputs 1..12. */
T1r = T1j + T1k;
|
||||
}
|
||||
{
|
||||
E T1b, T1e, T2f, T2g;
|
||||
T1b = T19 + T1a;
|
||||
T1e = T1c - T1d;
|
||||
T1f = T1b + T1e;
|
||||
T1o = T1e - T1b;
|
||||
T2f = FNMS(KP500000000, T2c, T2b);
|
||||
T2g = KP866025403 * (T1a - T19);
|
||||
T2h = T2f - T2g;
|
||||
T2m = T2g + T2f;
|
||||
}
|
||||
}
|
||||
/* Output 0 is the plain sum of all 13 inputs (real and imaginary parts). */
ro[0] = T1 + To;
|
||||
io[0] = T1q + T1r;
|
||||
/* Imaginary outputs 1..12, stored in pairs (1,12), (5,8), (4,10),
 * (3,9), (6,11), (2,7). */
{
|
||||
E T1D, T1N, T1y, T1x, T1E, T1O, Tv, TK, T1J, T1Q, T1m, T1R, T1t, T1I, TG;
|
||||
E TJ;
|
||||
{
|
||||
E T1B, T1C, T1v, T1w;
|
||||
T1B = FMA(KP387390585, T1f, KP265966249 * T18);
|
||||
T1C = FMA(KP113854479, T1o, KP503537032 * T1n);
|
||||
T1D = T1B + T1C;
|
||||
T1N = T1C - T1B;
|
||||
T1y = FMA(KP575140729, Tu, KP174138601 * Tt);
|
||||
T1v = FNMS(KP156891391, TH, KP256247671 * TI);
|
||||
T1w = FMA(KP011599105, TF, KP300238635 * TA);
|
||||
T1x = T1v - T1w;
|
||||
T1E = T1y + T1x;
|
||||
T1O = KP1_732050807 * (T1v + T1w);
|
||||
}
|
||||
Tv = FNMS(KP174138601, Tu, KP575140729 * Tt);
|
||||
TG = FNMS(KP300238635, TF, KP011599105 * TA);
|
||||
TJ = FMA(KP256247671, TH, KP156891391 * TI);
|
||||
TK = TG - TJ;
|
||||
T1J = KP1_732050807 * (TJ + TG);
|
||||
T1Q = Tv - TK;
|
||||
{
|
||||
E T1g, T1H, T1p, T1s, T1G;
|
||||
T1g = FNMS(KP132983124, T1f, KP258260390 * T18);
|
||||
T1H = T1l - T1g;
|
||||
T1p = FNMS(KP251768516, T1o, KP075902986 * T1n);
|
||||
T1s = FNMS(KP083333333, T1r, T1q);
|
||||
T1G = T1s - T1p;
|
||||
T1m = FMA(KP2_000000000, T1g, T1l);
|
||||
T1R = T1H + T1G;
|
||||
T1t = FMA(KP2_000000000, T1p, T1s);
|
||||
T1I = T1G - T1H;
|
||||
}
|
||||
{
|
||||
E TL, T1u, T1P, T1S;
|
||||
TL = FMA(KP2_000000000, TK, Tv);
|
||||
T1u = T1m + T1t;
|
||||
io[WS(os, 1)] = TL + T1u;
|
||||
io[WS(os, 12)] = T1u - TL;
|
||||
{
|
||||
E T1z, T1A, T1T, T1U;
|
||||
T1z = FMS(KP2_000000000, T1x, T1y);
|
||||
T1A = T1t - T1m;
|
||||
io[WS(os, 5)] = T1z + T1A;
|
||||
io[WS(os, 8)] = T1A - T1z;
|
||||
T1T = T1R - T1Q;
|
||||
T1U = T1O + T1N;
|
||||
io[WS(os, 4)] = T1T - T1U;
|
||||
io[WS(os, 10)] = T1U + T1T;
|
||||
}
|
||||
T1P = T1N - T1O;
|
||||
T1S = T1Q + T1R;
|
||||
io[WS(os, 3)] = T1P + T1S;
|
||||
io[WS(os, 9)] = T1S - T1P;
|
||||
{
|
||||
E T1L, T1M, T1F, T1K;
|
||||
T1L = T1J + T1I;
|
||||
T1M = T1E + T1D;
|
||||
io[WS(os, 6)] = T1L - T1M;
|
||||
io[WS(os, 11)] = T1M + T1L;
|
||||
T1F = T1D - T1E;
|
||||
T1K = T1I - T1J;
|
||||
io[WS(os, 2)] = T1F + T1K;
|
||||
io[WS(os, 7)] = T1K - T1F;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Real outputs 1..12, stored in pairs (12,1), (10,4), (5,8), (11,6),
 * (7,2), (3,9). */
{
|
||||
E T2y, T2I, T2J, T2K, T2B, T2L, T2e, T2p, T2u, T2G, T23, T2F, T28, T2t, T2l;
|
||||
E T2o;
|
||||
{
|
||||
E T2w, T2x, T2z, T2A;
|
||||
T2w = FMA(KP387390585, T20, KP265966249 * T1X);
|
||||
T2x = FNMS(KP503537032, T25, KP113854479 * T24);
|
||||
T2y = T2w + T2x;
|
||||
T2I = T2w - T2x;
|
||||
T2J = FMA(KP575140729, T2a, KP174138601 * T2d);
|
||||
T2z = FNMS(KP300238635, T2n, KP011599105 * T2m);
|
||||
T2A = FNMS(KP156891391, T2h, KP256247671 * T2k);
|
||||
T2K = T2z + T2A;
|
||||
T2B = KP1_732050807 * (T2z - T2A);
|
||||
T2L = T2J + T2K;
|
||||
}
|
||||
T2e = FNMS(KP575140729, T2d, KP174138601 * T2a);
|
||||
T2l = FMA(KP256247671, T2h, KP156891391 * T2k);
|
||||
T2o = FMA(KP300238635, T2m, KP011599105 * T2n);
|
||||
T2p = T2l - T2o;
|
||||
T2u = T2e - T2p;
|
||||
T2G = KP1_732050807 * (T2o + T2l);
|
||||
{
|
||||
E T21, T2r, T26, T27, T2s;
|
||||
T21 = FNMS(KP132983124, T20, KP258260390 * T1X);
|
||||
T2r = T22 - T21;
|
||||
T26 = FMA(KP251768516, T24, KP075902986 * T25);
|
||||
T27 = FNMS(KP083333333, To, T1);
|
||||
T2s = T27 - T26;
|
||||
T23 = FMA(KP2_000000000, T21, T22);
|
||||
T2F = T2s - T2r;
|
||||
T28 = FMA(KP2_000000000, T26, T27);
|
||||
T2t = T2r + T2s;
|
||||
}
|
||||
{
|
||||
E T29, T2q, T2N, T2O;
|
||||
T29 = T23 + T28;
|
||||
T2q = FMA(KP2_000000000, T2p, T2e);
|
||||
ro[WS(os, 12)] = T29 - T2q;
|
||||
ro[WS(os, 1)] = T29 + T2q;
|
||||
{
|
||||
E T2v, T2C, T2P, T2Q;
|
||||
T2v = T2t - T2u;
|
||||
T2C = T2y - T2B;
|
||||
ro[WS(os, 10)] = T2v - T2C;
|
||||
ro[WS(os, 4)] = T2v + T2C;
|
||||
T2P = T28 - T23;
|
||||
T2Q = FMS(KP2_000000000, T2K, T2J);
|
||||
ro[WS(os, 5)] = T2P - T2Q;
|
||||
ro[WS(os, 8)] = T2P + T2Q;
|
||||
}
|
||||
T2N = T2F - T2G;
|
||||
T2O = T2L - T2I;
|
||||
ro[WS(os, 11)] = T2N - T2O;
|
||||
ro[WS(os, 6)] = T2N + T2O;
|
||||
{
|
||||
E T2H, T2M, T2D, T2E;
|
||||
T2H = T2F + T2G;
|
||||
T2M = T2I + T2L;
|
||||
ro[WS(os, 7)] = T2H - T2M;
|
||||
ro[WS(os, 2)] = T2H + T2M;
|
||||
T2D = T2t + T2u;
|
||||
T2E = T2y + T2B;
|
||||
ro[WS(os, 3)] = T2D - T2E;
|
||||
ro[WS(os, 9)] = T2D + T2E;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA variant: 13 and "n1_13" are the size and name;
 * { 138, 30, 38, 0 } repeats the "138 additions, 30 multiplications, 38 fused
 * multiply/add" figures from this variant's header comment. The meaning of
 * the fourth count, GENUS and the trailing zeros is set by kdft_desc, which
 * is declared outside this file. */
static const kdft_desc desc = { 13, "n1_13", { 138, 30, 38, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point (non-FMA branch of the #if): passes the n1_13
 * kernel and its descriptor to X(kdft_register) for planner p. X() is a
 * macro defined elsewhere (presumably a symbol-name prefixer -- confirm). */
void X(codelet_n1_13) (planner *p) { X(kdft_register) (p, n1_13, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,513 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 148 FP additions, 84 FP multiplications,
|
||||
* (or, 64 additions, 0 multiplications, 84 fused multiply/add),
|
||||
* 67 stack variables, 6 constants, and 56 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_14, FMA variant: size-14 kernel emitted by gen_notw (see the
 * "Generated by" line above: -fma ... -n 14).
 *
 * ri, ii  real / imaginary input arrays, read at WS(is, k) for k = 0..13
 * ro, io  real / imaginary output arrays, written at WS(os, k) for k = 0..13
 * v       number of transforms; after each one ri/ii advance by ivs and
 *         ro/io advance by ovs
 *
 * Inputs are combined in pairs whose indices differ by 7. The pair
 * differences feed the odd-numbered outputs and the pair sums feed the
 * even-numbered outputs. All inputs are loaded before the first output is
 * stored.
 *
 * NOTE(review): DK, E, FMA/FNMS, WS and MAKE_VOLATILE_STRIDE come from
 * headers outside this file; their exact definitions are not visible here.
 */
static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named numeric constants used by the output stages below. */
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
|
||||
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
|
||||
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
|
||||
{
|
||||
INT i;
|
||||
/* One pass per transform; the increment clause steps all four pointers. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
|
||||
E T3, Tp, T1b, T1x, T1i, T1L, T1M, T1j, T1k, T1K, Ta, To, Th, Tz, T14;
|
||||
E TZ, Ts, Ty, Tv, T1Z, T2c, T27, TI, T23, T24, TP, TW, T22, T1c, T1e;
|
||||
E T1d, T1f, T1s, T1n, T1A, T1G, T1D, T1H, T1U, T1P;
|
||||
/* Inputs 0 and 7: difference (T3, T1b) and sum (Tp, T1x). */
{
|
||||
E T1, T2, T19, T1a;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 7)];
|
||||
T3 = T1 - T2;
|
||||
Tp = T1 + T2;
|
||||
T19 = ii[0];
|
||||
T1a = ii[WS(is, 7)];
|
||||
T1b = T19 - T1a;
|
||||
T1x = T19 + T1a;
|
||||
}
|
||||
/* Remaining real inputs, taken as the pairs (2,9), (12,5), (8,1),
 * (6,13), (10,3), (4,11): per-pair difference and sum, then the
 * combinations consumed by the output stages. */
{
|
||||
E T6, Tq, T9, Tr, Tn, Tx, Tk, Tw, Tg, Tu, Td, Tt;
|
||||
{
|
||||
E T4, T5, Ti, Tj;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 9)];
|
||||
T6 = T4 - T5;
|
||||
Tq = T4 + T5;
|
||||
{
|
||||
E T7, T8, Tl, Tm;
|
||||
T7 = ri[WS(is, 12)];
|
||||
T8 = ri[WS(is, 5)];
|
||||
T9 = T7 - T8;
|
||||
Tr = T7 + T8;
|
||||
Tl = ri[WS(is, 8)];
|
||||
Tm = ri[WS(is, 1)];
|
||||
Tn = Tl - Tm;
|
||||
Tx = Tl + Tm;
|
||||
}
|
||||
Ti = ri[WS(is, 6)];
|
||||
Tj = ri[WS(is, 13)];
|
||||
Tk = Ti - Tj;
|
||||
Tw = Ti + Tj;
|
||||
{
|
||||
E Te, Tf, Tb, Tc;
|
||||
Te = ri[WS(is, 10)];
|
||||
Tf = ri[WS(is, 3)];
|
||||
Tg = Te - Tf;
|
||||
Tu = Te + Tf;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 11)];
|
||||
Td = Tb - Tc;
|
||||
Tt = Tb + Tc;
|
||||
}
|
||||
}
|
||||
T1i = Tn - Tk;
|
||||
T1L = Tt - Tu;
|
||||
T1M = Tr - Tq;
|
||||
T1j = Tg - Td;
|
||||
T1k = T9 - T6;
|
||||
T1K = Tw - Tx;
|
||||
Ta = T6 + T9;
|
||||
To = Tk + Tn;
|
||||
Th = Td + Tg;
|
||||
Tz = FNMS(KP356895867, Th, Ta);
|
||||
T14 = FNMS(KP356895867, To, Th);
|
||||
TZ = FNMS(KP356895867, Ta, To);
|
||||
Ts = Tq + Tr;
|
||||
Ty = Tw + Tx;
|
||||
Tv = Tt + Tu;
|
||||
T1Z = FNMS(KP356895867, Ts, Ty);
|
||||
T2c = FNMS(KP356895867, Ty, Tv);
|
||||
T27 = FNMS(KP356895867, Tv, Ts);
|
||||
}
|
||||
/* Same pairing and combinations for the imaginary inputs. */
{
|
||||
E TE, T1B, TH, T1C, TV, T1F, TS, T1E, TO, T1z, TL, T1y;
|
||||
{
|
||||
E TC, TD, TQ, TR;
|
||||
TC = ii[WS(is, 4)];
|
||||
TD = ii[WS(is, 11)];
|
||||
TE = TC - TD;
|
||||
T1B = TC + TD;
|
||||
{
|
||||
E TF, TG, TT, TU;
|
||||
TF = ii[WS(is, 10)];
|
||||
TG = ii[WS(is, 3)];
|
||||
TH = TF - TG;
|
||||
T1C = TF + TG;
|
||||
TT = ii[WS(is, 8)];
|
||||
TU = ii[WS(is, 1)];
|
||||
TV = TT - TU;
|
||||
T1F = TT + TU;
|
||||
}
|
||||
TQ = ii[WS(is, 6)];
|
||||
TR = ii[WS(is, 13)];
|
||||
TS = TQ - TR;
|
||||
T1E = TQ + TR;
|
||||
{
|
||||
E TM, TN, TJ, TK;
|
||||
TM = ii[WS(is, 12)];
|
||||
TN = ii[WS(is, 5)];
|
||||
TO = TM - TN;
|
||||
T1z = TM + TN;
|
||||
TJ = ii[WS(is, 2)];
|
||||
TK = ii[WS(is, 9)];
|
||||
TL = TJ - TK;
|
||||
T1y = TJ + TK;
|
||||
}
|
||||
}
|
||||
TI = TE - TH;
|
||||
T23 = T1F - T1E;
|
||||
T24 = T1C - T1B;
|
||||
TP = TL - TO;
|
||||
TW = TS - TV;
|
||||
T22 = T1y - T1z;
|
||||
T1c = TL + TO;
|
||||
T1e = TS + TV;
|
||||
T1d = TE + TH;
|
||||
T1f = FNMS(KP356895867, T1e, T1d);
|
||||
T1s = FNMS(KP356895867, T1d, T1c);
|
||||
T1n = FNMS(KP356895867, T1c, T1e);
|
||||
T1A = T1y + T1z;
|
||||
T1G = T1E + T1F;
|
||||
T1D = T1B + T1C;
|
||||
T1H = FNMS(KP356895867, T1G, T1D);
|
||||
T1U = FNMS(KP356895867, T1D, T1A);
|
||||
T1P = FNMS(KP356895867, T1A, T1G);
|
||||
}
|
||||
/* Output 7 is the sum of all pair differences; output 0 is the sum of
 * all pair sums, i.e. of all 14 inputs. */
ro[WS(os, 7)] = T3 + Ta + Th + To;
|
||||
io[WS(os, 7)] = T1b + T1c + T1d + T1e;
|
||||
ro[0] = Tp + Ts + Tv + Ty;
|
||||
io[0] = T1x + T1A + T1D + T1G;
|
||||
/* Odd-numbered outputs, built from the pair differences:
 * real (13,1), imaginary (1,13), real (5,9), imaginary (5,9),
 * real (11,3), imaginary (3,11). */
{
|
||||
E TB, TY, TA, TX;
|
||||
TA = FNMS(KP692021471, Tz, To);
|
||||
TB = FNMS(KP900968867, TA, T3);
|
||||
TX = FMA(KP554958132, TW, TP);
|
||||
TY = FMA(KP801937735, TX, TI);
|
||||
ro[WS(os, 13)] = FNMS(KP974927912, TY, TB);
|
||||
ro[WS(os, 1)] = FMA(KP974927912, TY, TB);
|
||||
}
|
||||
{
|
||||
E T1u, T1w, T1t, T1v;
|
||||
T1t = FNMS(KP692021471, T1s, T1e);
|
||||
T1u = FNMS(KP900968867, T1t, T1b);
|
||||
T1v = FMA(KP554958132, T1i, T1k);
|
||||
T1w = FMA(KP801937735, T1v, T1j);
|
||||
io[WS(os, 1)] = FMA(KP974927912, T1w, T1u);
|
||||
io[WS(os, 13)] = FNMS(KP974927912, T1w, T1u);
|
||||
}
|
||||
{
|
||||
E T11, T13, T10, T12;
|
||||
T10 = FNMS(KP692021471, TZ, Th);
|
||||
T11 = FNMS(KP900968867, T10, T3);
|
||||
T12 = FMA(KP554958132, TI, TW);
|
||||
T13 = FNMS(KP801937735, T12, TP);
|
||||
ro[WS(os, 5)] = FNMS(KP974927912, T13, T11);
|
||||
ro[WS(os, 9)] = FMA(KP974927912, T13, T11);
|
||||
}
|
||||
{
|
||||
E T1p, T1r, T1o, T1q;
|
||||
T1o = FNMS(KP692021471, T1n, T1d);
|
||||
T1p = FNMS(KP900968867, T1o, T1b);
|
||||
T1q = FMA(KP554958132, T1j, T1i);
|
||||
T1r = FNMS(KP801937735, T1q, T1k);
|
||||
io[WS(os, 5)] = FNMS(KP974927912, T1r, T1p);
|
||||
io[WS(os, 9)] = FMA(KP974927912, T1r, T1p);
|
||||
}
|
||||
{
|
||||
E T16, T18, T15, T17;
|
||||
T15 = FNMS(KP692021471, T14, Ta);
|
||||
T16 = FNMS(KP900968867, T15, T3);
|
||||
T17 = FNMS(KP554958132, TP, TI);
|
||||
T18 = FNMS(KP801937735, T17, TW);
|
||||
ro[WS(os, 11)] = FNMS(KP974927912, T18, T16);
|
||||
ro[WS(os, 3)] = FMA(KP974927912, T18, T16);
|
||||
}
|
||||
{
|
||||
E T1h, T1m, T1g, T1l;
|
||||
T1g = FNMS(KP692021471, T1f, T1c);
|
||||
T1h = FNMS(KP900968867, T1g, T1b);
|
||||
T1l = FNMS(KP554958132, T1k, T1j);
|
||||
T1m = FNMS(KP801937735, T1l, T1i);
|
||||
io[WS(os, 3)] = FMA(KP974927912, T1m, T1h);
|
||||
io[WS(os, 11)] = FNMS(KP974927912, T1m, T1h);
|
||||
}
|
||||
/* Even-numbered outputs, built from the pair sums:
 * imaginary (4,10), real (10,4), imaginary (2,12), real (12,2),
 * imaginary (6,8), real (8,6). */
{
|
||||
E T1J, T1O, T1I, T1N;
|
||||
T1I = FNMS(KP692021471, T1H, T1A);
|
||||
T1J = FNMS(KP900968867, T1I, T1x);
|
||||
T1N = FMA(KP554958132, T1M, T1L);
|
||||
T1O = FNMS(KP801937735, T1N, T1K);
|
||||
io[WS(os, 4)] = FMA(KP974927912, T1O, T1J);
|
||||
io[WS(os, 10)] = FNMS(KP974927912, T1O, T1J);
|
||||
}
|
||||
{
|
||||
E T2e, T2g, T2d, T2f;
|
||||
T2d = FNMS(KP692021471, T2c, Ts);
|
||||
T2e = FNMS(KP900968867, T2d, Tp);
|
||||
T2f = FMA(KP554958132, T22, T24);
|
||||
T2g = FNMS(KP801937735, T2f, T23);
|
||||
ro[WS(os, 10)] = FNMS(KP974927912, T2g, T2e);
|
||||
ro[WS(os, 4)] = FMA(KP974927912, T2g, T2e);
|
||||
}
|
||||
{
|
||||
E T1R, T1T, T1Q, T1S;
|
||||
T1Q = FNMS(KP692021471, T1P, T1D);
|
||||
T1R = FNMS(KP900968867, T1Q, T1x);
|
||||
T1S = FMA(KP554958132, T1L, T1K);
|
||||
T1T = FMA(KP801937735, T1S, T1M);
|
||||
io[WS(os, 2)] = FMA(KP974927912, T1T, T1R);
|
||||
io[WS(os, 12)] = FNMS(KP974927912, T1T, T1R);
|
||||
}
|
||||
{
|
||||
E T21, T26, T20, T25;
|
||||
T20 = FNMS(KP692021471, T1Z, Tv);
|
||||
T21 = FNMS(KP900968867, T20, Tp);
|
||||
T25 = FMA(KP554958132, T24, T23);
|
||||
T26 = FMA(KP801937735, T25, T22);
|
||||
ro[WS(os, 12)] = FNMS(KP974927912, T26, T21);
|
||||
ro[WS(os, 2)] = FMA(KP974927912, T26, T21);
|
||||
}
|
||||
{
|
||||
E T1W, T1Y, T1V, T1X;
|
||||
T1V = FNMS(KP692021471, T1U, T1G);
|
||||
T1W = FNMS(KP900968867, T1V, T1x);
|
||||
T1X = FNMS(KP554958132, T1K, T1M);
|
||||
T1Y = FNMS(KP801937735, T1X, T1L);
|
||||
io[WS(os, 6)] = FMA(KP974927912, T1Y, T1W);
|
||||
io[WS(os, 8)] = FNMS(KP974927912, T1Y, T1W);
|
||||
}
|
||||
{
|
||||
E T29, T2b, T28, T2a;
|
||||
T28 = FNMS(KP692021471, T27, Ty);
|
||||
T29 = FNMS(KP900968867, T28, Tp);
|
||||
T2a = FNMS(KP554958132, T23, T22);
|
||||
T2b = FNMS(KP801937735, T2a, T24);
|
||||
ro[WS(os, 8)] = FNMS(KP974927912, T2b, T29);
|
||||
ro[WS(os, 6)] = FMA(KP974927912, T2b, T29);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA variant: 14 and "n1_14" are the size and name;
 * { 64, 0, 84, 0 } repeats the "64 additions, 0 multiplications, 84 fused
 * multiply/add" figures from this variant's header comment. The meaning of
 * the fourth count, GENUS and the trailing zeros is set by kdft_desc, which
 * is declared outside this file. */
static const kdft_desc desc = { 14, "n1_14", { 64, 0, 84, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point (FMA branch of the #if): passes the n1_14 kernel
 * and its descriptor to X(kdft_register) for planner p. X() is a macro
 * defined elsewhere (presumably a symbol-name prefixer -- confirm). */
void X(codelet_n1_14) (planner *p) { X(kdft_register) (p, n1_14, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 148 FP additions, 72 FP multiplications,
|
||||
* (or, 100 additions, 24 multiplications, 48 fused multiply/add),
|
||||
* 43 stack variables, 6 constants, and 56 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_14 -- straight-line size-14 DFT codelet emitted by genfft's gen_notw
 * (see the "Generated by" line above).  This is the #else variant, i.e. the
 * one generated without -fma.
 *
 *   ri, ii    input arrays; each iteration reads elements 0 and WS(is, 1..13)
 *   ro, io    output arrays; each iteration writes elements 0 and WS(os, 1..13)
 *   is, os    strides handed to WS() for input / output indexing
 *   v         number of loop iterations (one 14-point transform per iteration)
 *   ivs, ovs  amounts added to the input / output pointers after each iteration
 *
 * NOTE(review): ri/ii and ro/io presumably carry the real and imaginary parts
 * of the complex data -- confirm against dft/codelet-dft.h.
 * Do not reorder the arithmetic: this file is generated ("DO NOT EDIT").
 */
static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants used by the butterflies below, declared through DK(). */
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
|
||||
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
|
||||
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; all four pointers advance by ivs / ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
|
||||
E T3, Tp, T16, T1f, Ta, T1q, Ts, T10, TG, T1z, T19, T1i, Th, T1s, Tv;
|
||||
E T12, TU, T1B, T17, T1o, To, T1r, Ty, T11, TN, T1A, T18, T1l;
|
||||
/* Inputs 0 and 7: difference and sum for both ri and ii. */
{
|
||||
E T1, T2, T14, T15;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 7)];
|
||||
T3 = T1 - T2;
|
||||
Tp = T1 + T2;
|
||||
T14 = ii[0];
|
||||
T15 = ii[WS(is, 7)];
|
||||
T16 = T14 - T15;
|
||||
T1f = T14 + T15;
|
||||
}
|
||||
/* ri inputs 2, 9, 12, 5. */
{
|
||||
E T6, Tq, T9, Tr;
|
||||
{
|
||||
E T4, T5, T7, T8;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 9)];
|
||||
T6 = T4 - T5;
|
||||
Tq = T4 + T5;
|
||||
T7 = ri[WS(is, 12)];
|
||||
T8 = ri[WS(is, 5)];
|
||||
T9 = T7 - T8;
|
||||
Tr = T7 + T8;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
T1q = Tr - Tq;
|
||||
Ts = Tq + Tr;
|
||||
T10 = T9 - T6;
|
||||
}
|
||||
/* ii inputs 2, 9, 12, 5. */
{
|
||||
E TC, T1g, TF, T1h;
|
||||
{
|
||||
E TA, TB, TD, TE;
|
||||
TA = ii[WS(is, 2)];
|
||||
TB = ii[WS(is, 9)];
|
||||
TC = TA - TB;
|
||||
T1g = TA + TB;
|
||||
TD = ii[WS(is, 12)];
|
||||
TE = ii[WS(is, 5)];
|
||||
TF = TD - TE;
|
||||
T1h = TD + TE;
|
||||
}
|
||||
TG = TC - TF;
|
||||
T1z = T1g - T1h;
|
||||
T19 = TC + TF;
|
||||
T1i = T1g + T1h;
|
||||
}
|
||||
/* ri inputs 4, 11, 10, 3. */
{
|
||||
E Td, Tt, Tg, Tu;
|
||||
{
|
||||
E Tb, Tc, Te, Tf;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 11)];
|
||||
Td = Tb - Tc;
|
||||
Tt = Tb + Tc;
|
||||
Te = ri[WS(is, 10)];
|
||||
Tf = ri[WS(is, 3)];
|
||||
Tg = Te - Tf;
|
||||
Tu = Te + Tf;
|
||||
}
|
||||
Th = Td + Tg;
|
||||
T1s = Tt - Tu;
|
||||
Tv = Tt + Tu;
|
||||
T12 = Tg - Td;
|
||||
}
|
||||
/* ii inputs 4, 11, 10, 3. */
{
|
||||
E TQ, T1m, TT, T1n;
|
||||
{
|
||||
E TO, TP, TR, TS;
|
||||
TO = ii[WS(is, 4)];
|
||||
TP = ii[WS(is, 11)];
|
||||
TQ = TO - TP;
|
||||
T1m = TO + TP;
|
||||
TR = ii[WS(is, 10)];
|
||||
TS = ii[WS(is, 3)];
|
||||
TT = TR - TS;
|
||||
T1n = TR + TS;
|
||||
}
|
||||
TU = TQ - TT;
|
||||
T1B = T1n - T1m;
|
||||
T17 = TQ + TT;
|
||||
T1o = T1m + T1n;
|
||||
}
|
||||
/* ri inputs 6, 13, 8, 1. */
{
|
||||
E Tk, Tw, Tn, Tx;
|
||||
{
|
||||
E Ti, Tj, Tl, Tm;
|
||||
Ti = ri[WS(is, 6)];
|
||||
Tj = ri[WS(is, 13)];
|
||||
Tk = Ti - Tj;
|
||||
Tw = Ti + Tj;
|
||||
Tl = ri[WS(is, 8)];
|
||||
Tm = ri[WS(is, 1)];
|
||||
Tn = Tl - Tm;
|
||||
Tx = Tl + Tm;
|
||||
}
|
||||
To = Tk + Tn;
|
||||
T1r = Tw - Tx;
|
||||
Ty = Tw + Tx;
|
||||
T11 = Tn - Tk;
|
||||
}
|
||||
/* ii inputs 6, 13, 8, 1. */
{
|
||||
E TJ, T1j, TM, T1k;
|
||||
{
|
||||
E TH, TI, TK, TL;
|
||||
TH = ii[WS(is, 6)];
|
||||
TI = ii[WS(is, 13)];
|
||||
TJ = TH - TI;
|
||||
T1j = TH + TI;
|
||||
TK = ii[WS(is, 8)];
|
||||
TL = ii[WS(is, 1)];
|
||||
TM = TK - TL;
|
||||
T1k = TK + TL;
|
||||
}
|
||||
TN = TJ - TM;
|
||||
T1A = T1k - T1j;
|
||||
T18 = TJ + TM;
|
||||
T1l = T1j + T1k;
|
||||
}
|
||||
/* Outputs 7 and 0 are plain sums of the partial results above. */
ro[WS(os, 7)] = T3 + Ta + Th + To;
|
||||
io[WS(os, 7)] = T16 + T19 + T17 + T18;
|
||||
ro[0] = Tp + Ts + Tv + Ty;
|
||||
io[0] = T1f + T1i + T1o + T1l;
|
||||
/* Outputs 5 and 9. */
{
|
||||
E TV, Tz, T1e, T1d;
|
||||
TV = FNMS(KP781831482, TN, KP974927912 * TG) - (KP433883739 * TU);
|
||||
Tz = FMA(KP623489801, To, T3) + FNMA(KP900968867, Th, KP222520933 * Ta);
|
||||
ro[WS(os, 5)] = Tz - TV;
|
||||
ro[WS(os, 9)] = Tz + TV;
|
||||
T1e = FNMS(KP781831482, T11, KP974927912 * T10) - (KP433883739 * T12);
|
||||
T1d = FMA(KP623489801, T18, T16) + FNMA(KP900968867, T17, KP222520933 * T19);
|
||||
io[WS(os, 5)] = T1d - T1e;
|
||||
io[WS(os, 9)] = T1e + T1d;
|
||||
}
|
||||
/* Outputs 13 and 1. */
{
|
||||
E TX, TW, T1b, T1c;
|
||||
TX = FMA(KP781831482, TG, KP974927912 * TU) + (KP433883739 * TN);
|
||||
TW = FMA(KP623489801, Ta, T3) + FNMA(KP900968867, To, KP222520933 * Th);
|
||||
ro[WS(os, 13)] = TW - TX;
|
||||
ro[WS(os, 1)] = TW + TX;
|
||||
T1b = FMA(KP781831482, T10, KP974927912 * T12) + (KP433883739 * T11);
|
||||
T1c = FMA(KP623489801, T19, T16) + FNMA(KP900968867, T18, KP222520933 * T17);
|
||||
io[WS(os, 1)] = T1b + T1c;
|
||||
io[WS(os, 13)] = T1c - T1b;
|
||||
}
|
||||
/* Outputs 11 and 3. */
{
|
||||
E TZ, TY, T13, T1a;
|
||||
TZ = FMA(KP433883739, TG, KP974927912 * TN) - (KP781831482 * TU);
|
||||
TY = FMA(KP623489801, Th, T3) + FNMA(KP222520933, To, KP900968867 * Ta);
|
||||
ro[WS(os, 11)] = TY - TZ;
|
||||
ro[WS(os, 3)] = TY + TZ;
|
||||
T13 = FMA(KP433883739, T10, KP974927912 * T11) - (KP781831482 * T12);
|
||||
T1a = FMA(KP623489801, T17, T16) + FNMA(KP222520933, T18, KP900968867 * T19);
|
||||
io[WS(os, 3)] = T13 + T1a;
|
||||
io[WS(os, 11)] = T1a - T13;
|
||||
}
|
||||
/* Outputs 6 and 8. */
{
|
||||
E T1t, T1p, T1C, T1y;
|
||||
T1t = FNMS(KP433883739, T1r, KP781831482 * T1q) - (KP974927912 * T1s);
|
||||
T1p = FMA(KP623489801, T1i, T1f) + FNMA(KP900968867, T1l, KP222520933 * T1o);
|
||||
io[WS(os, 6)] = T1p - T1t;
|
||||
io[WS(os, 8)] = T1t + T1p;
|
||||
T1C = FNMS(KP433883739, T1A, KP781831482 * T1z) - (KP974927912 * T1B);
|
||||
T1y = FMA(KP623489801, Ts, Tp) + FNMA(KP900968867, Ty, KP222520933 * Tv);
|
||||
ro[WS(os, 6)] = T1y - T1C;
|
||||
ro[WS(os, 8)] = T1y + T1C;
|
||||
}
|
||||
/* Outputs 4 and 10. */
{
|
||||
E T1v, T1u, T1E, T1D;
|
||||
T1v = FMA(KP433883739, T1q, KP781831482 * T1s) - (KP974927912 * T1r);
|
||||
T1u = FMA(KP623489801, T1o, T1f) + FNMA(KP222520933, T1l, KP900968867 * T1i);
|
||||
io[WS(os, 4)] = T1u - T1v;
|
||||
io[WS(os, 10)] = T1v + T1u;
|
||||
T1E = FMA(KP433883739, T1z, KP781831482 * T1B) - (KP974927912 * T1A);
|
||||
T1D = FMA(KP623489801, Tv, Tp) + FNMA(KP222520933, Ty, KP900968867 * Ts);
|
||||
ro[WS(os, 4)] = T1D - T1E;
|
||||
ro[WS(os, 10)] = T1D + T1E;
|
||||
}
|
||||
/* Outputs 2 and 12. */
{
|
||||
E T1w, T1x, T1G, T1F;
|
||||
T1w = FMA(KP974927912, T1q, KP433883739 * T1s) + (KP781831482 * T1r);
|
||||
T1x = FMA(KP623489801, T1l, T1f) + FNMA(KP900968867, T1o, KP222520933 * T1i);
|
||||
io[WS(os, 2)] = T1w + T1x;
|
||||
io[WS(os, 12)] = T1x - T1w;
|
||||
T1G = FMA(KP974927912, T1z, KP433883739 * T1B) + (KP781831482 * T1A);
|
||||
T1F = FMA(KP623489801, Ty, Tp) + FNMA(KP900968867, Tv, KP222520933 * Ts);
|
||||
ro[WS(os, 12)] = T1F - T1G;
|
||||
ro[WS(os, 2)] = T1F + T1G;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 14, "n1_14", { 100, 24, 48, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_14) (planner *p) { X(kdft_register) (p, n1_14, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,554 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 156 FP additions, 84 FP multiplications,
|
||||
* (or, 72 additions, 0 multiplications, 84 fused multiply/add),
|
||||
* 69 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_15 -- straight-line size-15 DFT codelet emitted by genfft's gen_notw
 * with -fma (see the "Generated by" line above).  Compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 *   ri, ii    input arrays; each iteration reads elements 0 and WS(is, 1..14)
 *   ro, io    output arrays; each iteration writes elements 0 and WS(os, 1..14)
 *   is, os    strides handed to WS() for input / output indexing
 *   v         number of loop iterations (one 15-point transform per iteration)
 *   ivs, ovs  amounts added to the input / output pointers after each iteration
 *
 * NOTE(review): ri/ii and ro/io presumably carry the real and imaginary parts
 * of the complex data -- confirm against dft/codelet-dft.h.
 * Do not reorder the arithmetic: this file is generated ("DO NOT EDIT").
 */
static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants used by the butterflies below, declared through DK(). */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; all four pointers advance by ivs / ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
|
||||
E T5, T2l, Tx, TV, T1z, T1X, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
|
||||
E T1O, T1P, T1Z, T1l, T1q, T1B, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
|
||||
E T2f, T2g, T2m, T1R, T1S, T1Y, T1a, T1f, T1A, TW, TX, TY;
|
||||
/* Inputs 0, 5, 10. */
{
|
||||
E T1, T1v, T4, T1y, Tw, T1w, Tt, T1x;
|
||||
T1 = ri[0];
|
||||
T1v = ii[0];
|
||||
{
|
||||
E T2, T3, Tu, Tv;
|
||||
T2 = ri[WS(is, 5)];
|
||||
T3 = ri[WS(is, 10)];
|
||||
T4 = T2 + T3;
|
||||
T1y = T3 - T2;
|
||||
Tu = ii[WS(is, 5)];
|
||||
Tv = ii[WS(is, 10)];
|
||||
Tw = Tu - Tv;
|
||||
T1w = Tu + Tv;
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
T2l = T1v + T1w;
|
||||
Tt = FNMS(KP500000000, T4, T1);
|
||||
Tx = FNMS(KP866025403, Tw, Tt);
|
||||
TV = FMA(KP866025403, Tw, Tt);
|
||||
T1x = FNMS(KP500000000, T1w, T1v);
|
||||
T1z = FMA(KP866025403, T1y, T1x);
|
||||
T1X = FNMS(KP866025403, T1y, T1x);
|
||||
}
|
||||
/* Inputs 6, 11, 1 and 9, 14, 4. */
{
|
||||
E Th, Tk, TJ, T1k, T1h, T1i, TM, T1j, Tm, Tp, TO, T1p, T1m, T1n, TR;
|
||||
E T1o;
|
||||
{
|
||||
E Ti, Tj, TK, TL;
|
||||
Th = ri[WS(is, 6)];
|
||||
Ti = ri[WS(is, 11)];
|
||||
Tj = ri[WS(is, 1)];
|
||||
Tk = Ti + Tj;
|
||||
TJ = FNMS(KP500000000, Tk, Th);
|
||||
T1k = Tj - Ti;
|
||||
T1h = ii[WS(is, 6)];
|
||||
TK = ii[WS(is, 11)];
|
||||
TL = ii[WS(is, 1)];
|
||||
T1i = TK + TL;
|
||||
TM = TK - TL;
|
||||
T1j = FNMS(KP500000000, T1i, T1h);
|
||||
}
|
||||
{
|
||||
E Tn, To, TP, TQ;
|
||||
Tm = ri[WS(is, 9)];
|
||||
Tn = ri[WS(is, 14)];
|
||||
To = ri[WS(is, 4)];
|
||||
Tp = Tn + To;
|
||||
TO = FNMS(KP500000000, Tp, Tm);
|
||||
T1p = To - Tn;
|
||||
T1m = ii[WS(is, 9)];
|
||||
TP = ii[WS(is, 14)];
|
||||
TQ = ii[WS(is, 4)];
|
||||
T1n = TP + TQ;
|
||||
TR = TP - TQ;
|
||||
T1o = FNMS(KP500000000, T1n, T1m);
|
||||
}
|
||||
Tl = Th + Tk;
|
||||
Tq = Tm + Tp;
|
||||
Tr = Tl + Tq;
|
||||
TN = FNMS(KP866025403, TM, TJ);
|
||||
TS = FNMS(KP866025403, TR, TO);
|
||||
TT = TN + TS;
|
||||
T2c = T1h + T1i;
|
||||
T2d = T1m + T1n;
|
||||
T2n = T2c + T2d;
|
||||
T1O = FNMS(KP866025403, T1k, T1j);
|
||||
T1P = FNMS(KP866025403, T1p, T1o);
|
||||
T1Z = T1O + T1P;
|
||||
T1l = FMA(KP866025403, T1k, T1j);
|
||||
T1q = FMA(KP866025403, T1p, T1o);
|
||||
T1B = T1l + T1q;
|
||||
TZ = FMA(KP866025403, TM, TJ);
|
||||
T10 = FMA(KP866025403, TR, TO);
|
||||
T11 = TZ + T10;
|
||||
}
|
||||
/* Inputs 3, 8, 13 and 12, 2, 7. */
{
|
||||
E T6, T9, Ty, T19, T16, T17, TB, T18, Tb, Te, TD, T1e, T1b, T1c, TG;
|
||||
E T1d;
|
||||
{
|
||||
E T7, T8, Tz, TA;
|
||||
T6 = ri[WS(is, 3)];
|
||||
T7 = ri[WS(is, 8)];
|
||||
T8 = ri[WS(is, 13)];
|
||||
T9 = T7 + T8;
|
||||
Ty = FNMS(KP500000000, T9, T6);
|
||||
T19 = T8 - T7;
|
||||
T16 = ii[WS(is, 3)];
|
||||
Tz = ii[WS(is, 8)];
|
||||
TA = ii[WS(is, 13)];
|
||||
T17 = Tz + TA;
|
||||
TB = Tz - TA;
|
||||
T18 = FNMS(KP500000000, T17, T16);
|
||||
}
|
||||
{
|
||||
E Tc, Td, TE, TF;
|
||||
Tb = ri[WS(is, 12)];
|
||||
Tc = ri[WS(is, 2)];
|
||||
Td = ri[WS(is, 7)];
|
||||
Te = Tc + Td;
|
||||
TD = FNMS(KP500000000, Te, Tb);
|
||||
T1e = Td - Tc;
|
||||
T1b = ii[WS(is, 12)];
|
||||
TE = ii[WS(is, 2)];
|
||||
TF = ii[WS(is, 7)];
|
||||
T1c = TE + TF;
|
||||
TG = TE - TF;
|
||||
T1d = FNMS(KP500000000, T1c, T1b);
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
Tf = Tb + Te;
|
||||
Tg = Ta + Tf;
|
||||
TC = FNMS(KP866025403, TB, Ty);
|
||||
TH = FNMS(KP866025403, TG, TD);
|
||||
TI = TC + TH;
|
||||
T2f = T16 + T17;
|
||||
T2g = T1b + T1c;
|
||||
T2m = T2f + T2g;
|
||||
T1R = FNMS(KP866025403, T19, T18);
|
||||
T1S = FNMS(KP866025403, T1e, T1d);
|
||||
T1Y = T1R + T1S;
|
||||
T1a = FMA(KP866025403, T19, T18);
|
||||
T1f = FMA(KP866025403, T1e, T1d);
|
||||
T1A = T1a + T1f;
|
||||
TW = FMA(KP866025403, TB, Ty);
|
||||
TX = FMA(KP866025403, TG, TD);
|
||||
TY = TW + TX;
|
||||
}
|
||||
/* ro outputs 0, 9, 6, 12, 3. */
{
|
||||
E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
|
||||
T2a = Tg - Tr;
|
||||
Ts = Tg + Tr;
|
||||
T29 = FNMS(KP250000000, Ts, T5);
|
||||
T2e = T2c - T2d;
|
||||
T2h = T2f - T2g;
|
||||
T2i = FNMS(KP618033988, T2h, T2e);
|
||||
T2k = FMA(KP618033988, T2e, T2h);
|
||||
ro[0] = T5 + Ts;
|
||||
T2j = FMA(KP559016994, T2a, T29);
|
||||
ro[WS(os, 9)] = FNMS(KP951056516, T2k, T2j);
|
||||
ro[WS(os, 6)] = FMA(KP951056516, T2k, T2j);
|
||||
T2b = FNMS(KP559016994, T2a, T29);
|
||||
ro[WS(os, 12)] = FNMS(KP951056516, T2i, T2b);
|
||||
ro[WS(os, 3)] = FMA(KP951056516, T2i, T2b);
|
||||
}
|
||||
/* io outputs 0, 6, 9, 3, 12. */
{
|
||||
E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
|
||||
T2q = T2m - T2n;
|
||||
T2o = T2m + T2n;
|
||||
T2p = FNMS(KP250000000, T2o, T2l);
|
||||
T2s = Tl - Tq;
|
||||
T2t = Ta - Tf;
|
||||
T2u = FNMS(KP618033988, T2t, T2s);
|
||||
T2w = FMA(KP618033988, T2s, T2t);
|
||||
io[0] = T2l + T2o;
|
||||
T2v = FMA(KP559016994, T2q, T2p);
|
||||
io[WS(os, 6)] = FNMS(KP951056516, T2w, T2v);
|
||||
io[WS(os, 9)] = FMA(KP951056516, T2w, T2v);
|
||||
T2r = FNMS(KP559016994, T2q, T2p);
|
||||
io[WS(os, 3)] = FNMS(KP951056516, T2u, T2r);
|
||||
io[WS(os, 12)] = FMA(KP951056516, T2u, T2r);
|
||||
}
|
||||
/* ro outputs 5, 14, 11, 2, 8. */
{
|
||||
E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
|
||||
T1M = TI - TT;
|
||||
TU = TI + TT;
|
||||
T1L = FNMS(KP250000000, TU, Tx);
|
||||
T1Q = T1O - T1P;
|
||||
T1T = T1R - T1S;
|
||||
T1U = FNMS(KP618033988, T1T, T1Q);
|
||||
T1W = FMA(KP618033988, T1Q, T1T);
|
||||
ro[WS(os, 5)] = Tx + TU;
|
||||
T1V = FMA(KP559016994, T1M, T1L);
|
||||
ro[WS(os, 14)] = FNMS(KP951056516, T1W, T1V);
|
||||
ro[WS(os, 11)] = FMA(KP951056516, T1W, T1V);
|
||||
T1N = FNMS(KP559016994, T1M, T1L);
|
||||
ro[WS(os, 2)] = FNMS(KP951056516, T1U, T1N);
|
||||
ro[WS(os, 8)] = FMA(KP951056516, T1U, T1N);
|
||||
}
|
||||
/* io outputs 5, 11, 14, 2, 8. */
{
|
||||
E T22, T20, T21, T26, T28, T24, T25, T27, T23;
|
||||
T22 = T1Y - T1Z;
|
||||
T20 = T1Y + T1Z;
|
||||
T21 = FNMS(KP250000000, T20, T1X);
|
||||
T24 = TN - TS;
|
||||
T25 = TC - TH;
|
||||
T26 = FNMS(KP618033988, T25, T24);
|
||||
T28 = FMA(KP618033988, T24, T25);
|
||||
io[WS(os, 5)] = T1X + T20;
|
||||
T27 = FMA(KP559016994, T22, T21);
|
||||
io[WS(os, 11)] = FNMS(KP951056516, T28, T27);
|
||||
io[WS(os, 14)] = FMA(KP951056516, T28, T27);
|
||||
T23 = FNMS(KP559016994, T22, T21);
|
||||
io[WS(os, 2)] = FMA(KP951056516, T26, T23);
|
||||
io[WS(os, 8)] = FNMS(KP951056516, T26, T23);
|
||||
}
|
||||
/* io outputs 10, 7, 13, 1, 4. */
{
|
||||
E T1E, T1C, T1D, T1I, T1K, T1G, T1H, T1J, T1F;
|
||||
T1E = T1A - T1B;
|
||||
T1C = T1A + T1B;
|
||||
T1D = FNMS(KP250000000, T1C, T1z);
|
||||
T1G = TW - TX;
|
||||
T1H = TZ - T10;
|
||||
T1I = FMA(KP618033988, T1H, T1G);
|
||||
T1K = FNMS(KP618033988, T1G, T1H);
|
||||
io[WS(os, 10)] = T1z + T1C;
|
||||
T1J = FNMS(KP559016994, T1E, T1D);
|
||||
io[WS(os, 7)] = FMA(KP951056516, T1K, T1J);
|
||||
io[WS(os, 13)] = FNMS(KP951056516, T1K, T1J);
|
||||
T1F = FMA(KP559016994, T1E, T1D);
|
||||
io[WS(os, 1)] = FNMS(KP951056516, T1I, T1F);
|
||||
io[WS(os, 4)] = FMA(KP951056516, T1I, T1F);
|
||||
}
|
||||
/* ro outputs 10, 7, 13, 4, 1. */
{
|
||||
E T14, T12, T13, T1s, T1u, T1g, T1r, T1t, T15;
|
||||
T14 = TY - T11;
|
||||
T12 = TY + T11;
|
||||
T13 = FNMS(KP250000000, T12, TV);
|
||||
T1g = T1a - T1f;
|
||||
T1r = T1l - T1q;
|
||||
T1s = FMA(KP618033988, T1r, T1g);
|
||||
T1u = FNMS(KP618033988, T1g, T1r);
|
||||
ro[WS(os, 10)] = TV + T12;
|
||||
T1t = FNMS(KP559016994, T14, T13);
|
||||
ro[WS(os, 7)] = FNMS(KP951056516, T1u, T1t);
|
||||
ro[WS(os, 13)] = FMA(KP951056516, T1u, T1t);
|
||||
T15 = FMA(KP559016994, T14, T13);
|
||||
ro[WS(os, 4)] = FNMS(KP951056516, T1s, T15);
|
||||
ro[WS(os, 1)] = FMA(KP951056516, T1s, T15);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 15, "n1_15", { 72, 0, 84, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_15) (planner *p) { X(kdft_register) (p, n1_15, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 156 FP additions, 56 FP multiplications,
|
||||
* (or, 128 additions, 28 multiplications, 28 fused multiply/add),
|
||||
* 69 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_15 -- straight-line size-15 DFT codelet emitted by genfft's gen_notw
 * (see the "Generated by" line above).  This is the #else variant, i.e. the
 * one generated without -fma.
 *
 *   ri, ii    input arrays; each iteration reads elements 0 and WS(is, 1..14)
 *   ro, io    output arrays; each iteration writes elements 0 and WS(os, 1..14)
 *   is, os    strides handed to WS() for input / output indexing
 *   v         number of loop iterations (one 15-point transform per iteration)
 *   ivs, ovs  amounts added to the input / output pointers after each iteration
 *
 * NOTE(review): ri/ii and ro/io presumably carry the real and imaginary parts
 * of the complex data -- confirm against dft/codelet-dft.h.
 * Do not reorder the arithmetic: this file is generated ("DO NOT EDIT").
 */
static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants used by the butterflies below, declared through DK(). */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; all four pointers advance by ivs / ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
|
||||
E T5, T2l, Tx, TV, T1C, T20, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
|
||||
E T1O, T1P, T22, T1l, T1q, T1w, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
|
||||
E T2f, T2g, T2m, T1R, T1S, T21, T1a, T1f, T1v, TW, TX, TY;
|
||||
/* Inputs 0, 5, 10. */
{
|
||||
E T1, T1z, T4, T1y, Tw, T1A, Tt, T1B;
|
||||
T1 = ri[0];
|
||||
T1z = ii[0];
|
||||
{
|
||||
E T2, T3, Tu, Tv;
|
||||
T2 = ri[WS(is, 5)];
|
||||
T3 = ri[WS(is, 10)];
|
||||
T4 = T2 + T3;
|
||||
T1y = KP866025403 * (T3 - T2);
|
||||
Tu = ii[WS(is, 5)];
|
||||
Tv = ii[WS(is, 10)];
|
||||
Tw = KP866025403 * (Tu - Tv);
|
||||
T1A = Tu + Tv;
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
T2l = T1z + T1A;
|
||||
Tt = FNMS(KP500000000, T4, T1);
|
||||
Tx = Tt - Tw;
|
||||
TV = Tt + Tw;
|
||||
T1B = FNMS(KP500000000, T1A, T1z);
|
||||
T1C = T1y + T1B;
|
||||
T20 = T1B - T1y;
|
||||
}
|
||||
/* Inputs 6, 11, 1 and 9, 14, 4. */
{
|
||||
E Th, Tk, TJ, T1h, T1i, T1j, TM, T1k, Tm, Tp, TO, T1m, T1n, T1o, TR;
|
||||
E T1p;
|
||||
{
|
||||
E Ti, Tj, TK, TL;
|
||||
Th = ri[WS(is, 6)];
|
||||
Ti = ri[WS(is, 11)];
|
||||
Tj = ri[WS(is, 1)];
|
||||
Tk = Ti + Tj;
|
||||
TJ = FNMS(KP500000000, Tk, Th);
|
||||
T1h = KP866025403 * (Tj - Ti);
|
||||
T1i = ii[WS(is, 6)];
|
||||
TK = ii[WS(is, 11)];
|
||||
TL = ii[WS(is, 1)];
|
||||
T1j = TK + TL;
|
||||
TM = KP866025403 * (TK - TL);
|
||||
T1k = FNMS(KP500000000, T1j, T1i);
|
||||
}
|
||||
{
|
||||
E Tn, To, TP, TQ;
|
||||
Tm = ri[WS(is, 9)];
|
||||
Tn = ri[WS(is, 14)];
|
||||
To = ri[WS(is, 4)];
|
||||
Tp = Tn + To;
|
||||
TO = FNMS(KP500000000, Tp, Tm);
|
||||
T1m = KP866025403 * (To - Tn);
|
||||
T1n = ii[WS(is, 9)];
|
||||
TP = ii[WS(is, 14)];
|
||||
TQ = ii[WS(is, 4)];
|
||||
T1o = TP + TQ;
|
||||
TR = KP866025403 * (TP - TQ);
|
||||
T1p = FNMS(KP500000000, T1o, T1n);
|
||||
}
|
||||
Tl = Th + Tk;
|
||||
Tq = Tm + Tp;
|
||||
Tr = Tl + Tq;
|
||||
TN = TJ - TM;
|
||||
TS = TO - TR;
|
||||
TT = TN + TS;
|
||||
T2c = T1i + T1j;
|
||||
T2d = T1n + T1o;
|
||||
T2n = T2c + T2d;
|
||||
T1O = T1k - T1h;
|
||||
T1P = T1p - T1m;
|
||||
T22 = T1O + T1P;
|
||||
T1l = T1h + T1k;
|
||||
T1q = T1m + T1p;
|
||||
T1w = T1l + T1q;
|
||||
TZ = TJ + TM;
|
||||
T10 = TO + TR;
|
||||
T11 = TZ + T10;
|
||||
}
|
||||
/* Inputs 3, 8, 13 and 12, 2, 7. */
{
|
||||
E T6, T9, Ty, T16, T17, T18, TB, T19, Tb, Te, TD, T1b, T1c, T1d, TG;
|
||||
E T1e;
|
||||
{
|
||||
E T7, T8, Tz, TA;
|
||||
T6 = ri[WS(is, 3)];
|
||||
T7 = ri[WS(is, 8)];
|
||||
T8 = ri[WS(is, 13)];
|
||||
T9 = T7 + T8;
|
||||
Ty = FNMS(KP500000000, T9, T6);
|
||||
T16 = KP866025403 * (T8 - T7);
|
||||
T17 = ii[WS(is, 3)];
|
||||
Tz = ii[WS(is, 8)];
|
||||
TA = ii[WS(is, 13)];
|
||||
T18 = Tz + TA;
|
||||
TB = KP866025403 * (Tz - TA);
|
||||
T19 = FNMS(KP500000000, T18, T17);
|
||||
}
|
||||
{
|
||||
E Tc, Td, TE, TF;
|
||||
Tb = ri[WS(is, 12)];
|
||||
Tc = ri[WS(is, 2)];
|
||||
Td = ri[WS(is, 7)];
|
||||
Te = Tc + Td;
|
||||
TD = FNMS(KP500000000, Te, Tb);
|
||||
T1b = KP866025403 * (Td - Tc);
|
||||
T1c = ii[WS(is, 12)];
|
||||
TE = ii[WS(is, 2)];
|
||||
TF = ii[WS(is, 7)];
|
||||
T1d = TE + TF;
|
||||
TG = KP866025403 * (TE - TF);
|
||||
T1e = FNMS(KP500000000, T1d, T1c);
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
Tf = Tb + Te;
|
||||
Tg = Ta + Tf;
|
||||
TC = Ty - TB;
|
||||
TH = TD - TG;
|
||||
TI = TC + TH;
|
||||
T2f = T17 + T18;
|
||||
T2g = T1c + T1d;
|
||||
T2m = T2f + T2g;
|
||||
T1R = T19 - T16;
|
||||
T1S = T1e - T1b;
|
||||
T21 = T1R + T1S;
|
||||
T1a = T16 + T19;
|
||||
T1f = T1b + T1e;
|
||||
T1v = T1a + T1f;
|
||||
TW = Ty + TB;
|
||||
TX = TD + TG;
|
||||
TY = TW + TX;
|
||||
}
|
||||
/* ro outputs 0, 9, 6, 12, 3. */
{
|
||||
E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
|
||||
T2a = KP559016994 * (Tg - Tr);
|
||||
Ts = Tg + Tr;
|
||||
T29 = FNMS(KP250000000, Ts, T5);
|
||||
T2e = T2c - T2d;
|
||||
T2h = T2f - T2g;
|
||||
T2i = FNMS(KP587785252, T2h, KP951056516 * T2e);
|
||||
T2k = FMA(KP951056516, T2h, KP587785252 * T2e);
|
||||
ro[0] = T5 + Ts;
|
||||
T2j = T2a + T29;
|
||||
ro[WS(os, 9)] = T2j - T2k;
|
||||
ro[WS(os, 6)] = T2j + T2k;
|
||||
T2b = T29 - T2a;
|
||||
ro[WS(os, 12)] = T2b - T2i;
|
||||
ro[WS(os, 3)] = T2b + T2i;
|
||||
}
|
||||
/* io outputs 0, 6, 9, 3, 12. */
{
|
||||
E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
|
||||
T2q = KP559016994 * (T2m - T2n);
|
||||
T2o = T2m + T2n;
|
||||
T2p = FNMS(KP250000000, T2o, T2l);
|
||||
T2s = Tl - Tq;
|
||||
T2t = Ta - Tf;
|
||||
T2u = FNMS(KP587785252, T2t, KP951056516 * T2s);
|
||||
T2w = FMA(KP951056516, T2t, KP587785252 * T2s);
|
||||
io[0] = T2l + T2o;
|
||||
T2v = T2q + T2p;
|
||||
io[WS(os, 6)] = T2v - T2w;
|
||||
io[WS(os, 9)] = T2w + T2v;
|
||||
T2r = T2p - T2q;
|
||||
io[WS(os, 3)] = T2r - T2u;
|
||||
io[WS(os, 12)] = T2u + T2r;
|
||||
}
|
||||
/* ro outputs 5, 14, 11, 2, 8. */
{
|
||||
E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
|
||||
T1M = KP559016994 * (TI - TT);
|
||||
TU = TI + TT;
|
||||
T1L = FNMS(KP250000000, TU, Tx);
|
||||
T1Q = T1O - T1P;
|
||||
T1T = T1R - T1S;
|
||||
T1U = FNMS(KP587785252, T1T, KP951056516 * T1Q);
|
||||
T1W = FMA(KP951056516, T1T, KP587785252 * T1Q);
|
||||
ro[WS(os, 5)] = Tx + TU;
|
||||
T1V = T1M + T1L;
|
||||
ro[WS(os, 14)] = T1V - T1W;
|
||||
ro[WS(os, 11)] = T1V + T1W;
|
||||
T1N = T1L - T1M;
|
||||
ro[WS(os, 2)] = T1N - T1U;
|
||||
ro[WS(os, 8)] = T1N + T1U;
|
||||
}
|
||||
/* io outputs 5, 11, 14, 2, 8. */
{
|
||||
E T25, T23, T24, T1Z, T28, T1X, T1Y, T27, T26;
|
||||
T25 = KP559016994 * (T21 - T22);
|
||||
T23 = T21 + T22;
|
||||
T24 = FNMS(KP250000000, T23, T20);
|
||||
T1X = TN - TS;
|
||||
T1Y = TC - TH;
|
||||
T1Z = FNMS(KP587785252, T1Y, KP951056516 * T1X);
|
||||
T28 = FMA(KP951056516, T1Y, KP587785252 * T1X);
|
||||
io[WS(os, 5)] = T20 + T23;
|
||||
T27 = T25 + T24;
|
||||
io[WS(os, 11)] = T27 - T28;
|
||||
io[WS(os, 14)] = T28 + T27;
|
||||
T26 = T24 - T25;
|
||||
io[WS(os, 2)] = T1Z + T26;
|
||||
io[WS(os, 8)] = T26 - T1Z;
|
||||
}
|
||||
/* io outputs 10, 7, 13, 1, 4. */
{
|
||||
E T1x, T1D, T1E, T1I, T1J, T1G, T1H, T1K, T1F;
|
||||
T1x = KP559016994 * (T1v - T1w);
|
||||
T1D = T1v + T1w;
|
||||
T1E = FNMS(KP250000000, T1D, T1C);
|
||||
T1G = TW - TX;
|
||||
T1H = TZ - T10;
|
||||
T1I = FMA(KP951056516, T1G, KP587785252 * T1H);
|
||||
T1J = FNMS(KP587785252, T1G, KP951056516 * T1H);
|
||||
io[WS(os, 10)] = T1C + T1D;
|
||||
T1K = T1E - T1x;
|
||||
io[WS(os, 7)] = T1J + T1K;
|
||||
io[WS(os, 13)] = T1K - T1J;
|
||||
T1F = T1x + T1E;
|
||||
io[WS(os, 1)] = T1F - T1I;
|
||||
io[WS(os, 4)] = T1I + T1F;
|
||||
}
|
||||
/* ro outputs 10, 7, 13, 4, 1. */
{
|
||||
E T13, T12, T14, T1s, T1u, T1g, T1r, T1t, T15;
|
||||
T13 = KP559016994 * (TY - T11);
|
||||
T12 = TY + T11;
|
||||
T14 = FNMS(KP250000000, T12, TV);
|
||||
T1g = T1a - T1f;
|
||||
T1r = T1l - T1q;
|
||||
T1s = FMA(KP951056516, T1g, KP587785252 * T1r);
|
||||
T1u = FNMS(KP587785252, T1g, KP951056516 * T1r);
|
||||
ro[WS(os, 10)] = TV + T12;
|
||||
T1t = T14 - T13;
|
||||
ro[WS(os, 7)] = T1t - T1u;
|
||||
ro[WS(os, 13)] = T1t + T1u;
|
||||
T15 = T13 + T14;
|
||||
ro[WS(os, 4)] = T15 - T1s;
|
||||
ro[WS(os, 1)] = T15 + T1s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 15, "n1_15", { 128, 28, 28, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_15) (planner *p) { X(kdft_register) (p, n1_15, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,560 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:25 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 144 FP additions, 40 FP multiplications,
|
||||
* (or, 104 additions, 0 multiplications, 40 fused multiply/add),
|
||||
* 50 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_16 -- straight-line size-16 DFT codelet emitted by genfft's gen_notw
 * with -fma (see the "Generated by" line above).  Compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 *   ri, ii    input arrays; each iteration reads elements 0 and WS(is, 1..15)
 *   ro, io    output arrays; each iteration writes elements 0 and WS(os, 1..15)
 *   is, os    strides handed to WS() for input / output indexing
 *   v         number of loop iterations (one 16-point transform per iteration)
 *   ivs, ovs  amounts added to the input / output pointers after each iteration
 *
 * NOTE(review): ri/ii and ro/io presumably carry the real and imaginary parts
 * of the complex data -- confirm against dft/codelet-dft.h.
 * Do not reorder the arithmetic: this file is generated ("DO NOT EDIT").
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants used by the butterflies below, declared through DK(). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; all four pointers advance by ivs / ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
|
||||
E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
|
||||
E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
|
||||
E T1U, T1A;
|
||||
/* Inputs 0, 8, 4, 12. */
{
|
||||
E T3, TL, Ty, T1k, T6, T1j, TB, TM;
|
||||
{
|
||||
E T1, T2, Tw, Tx;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 8)];
|
||||
T3 = T1 + T2;
|
||||
TL = T1 - T2;
|
||||
Tw = ii[0];
|
||||
Tx = ii[WS(is, 8)];
|
||||
Ty = Tw + Tx;
|
||||
T1k = Tw - Tx;
|
||||
}
|
||||
{
|
||||
E T4, T5, Tz, TA;
|
||||
T4 = ri[WS(is, 4)];
|
||||
T5 = ri[WS(is, 12)];
|
||||
T6 = T4 + T5;
|
||||
T1j = T4 - T5;
|
||||
Tz = ii[WS(is, 4)];
|
||||
TA = ii[WS(is, 12)];
|
||||
TB = Tz + TA;
|
||||
TM = Tz - TA;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1R = T3 - T6;
|
||||
T25 = Ty - TB;
|
||||
TC = Ty + TB;
|
||||
TN = TL - TM;
|
||||
T1x = TL + TM;
|
||||
T1H = T1k - T1j;
|
||||
T1l = T1j + T1k;
|
||||
}
|
||||
/* Inputs 15, 7, 3, 11. */
{
|
||||
E Tp, T1c, T1a, T20, Ts, T17, T1f, T21;
|
||||
{
|
||||
E Tn, To, T18, T19;
|
||||
Tn = ri[WS(is, 15)];
|
||||
To = ri[WS(is, 7)];
|
||||
Tp = Tn + To;
|
||||
T1c = Tn - To;
|
||||
T18 = ii[WS(is, 15)];
|
||||
T19 = ii[WS(is, 7)];
|
||||
T1a = T18 - T19;
|
||||
T20 = T18 + T19;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T1d, T1e;
|
||||
Tq = ri[WS(is, 3)];
|
||||
Tr = ri[WS(is, 11)];
|
||||
Ts = Tq + Tr;
|
||||
T17 = Tq - Tr;
|
||||
T1d = ii[WS(is, 3)];
|
||||
T1e = ii[WS(is, 11)];
|
||||
T1f = T1d - T1e;
|
||||
T21 = T1d + T1e;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T22 = T20 - T21;
|
||||
T2h = T20 + T21;
|
||||
T1b = T17 + T1a;
|
||||
T1g = T1c - T1f;
|
||||
T1E = T1a - T17;
|
||||
T1Z = Tp - Ts;
|
||||
T1D = T1c + T1f;
|
||||
}
|
||||
/* Inputs 2, 10, 14, 6. */
{
|
||||
E Ta, TP, TF, TO, Td, TR, TI, TS;
|
||||
{
|
||||
E T8, T9, TD, TE;
|
||||
T8 = ri[WS(is, 2)];
|
||||
T9 = ri[WS(is, 10)];
|
||||
Ta = T8 + T9;
|
||||
TP = T8 - T9;
|
||||
TD = ii[WS(is, 2)];
|
||||
TE = ii[WS(is, 10)];
|
||||
TF = TD + TE;
|
||||
TO = TD - TE;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TG, TH;
|
||||
Tb = ri[WS(is, 14)];
|
||||
Tc = ri[WS(is, 6)];
|
||||
Td = Tb + Tc;
|
||||
TR = Tb - Tc;
|
||||
TG = ii[WS(is, 14)];
|
||||
TH = ii[WS(is, 6)];
|
||||
TI = TG + TH;
|
||||
TS = TG - TH;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T1S = TF - TI;
|
||||
T26 = Td - Ta;
|
||||
TJ = TF + TI;
|
||||
TQ = TO - TP;
|
||||
T1m = TR - TS;
|
||||
T1n = TP + TO;
|
||||
TT = TR + TS;
|
||||
}
|
||||
/* Inputs 1, 9, 5, 13. */
{
|
||||
E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
|
||||
{
|
||||
E Tg, Th, TX, TY;
|
||||
Tg = ri[WS(is, 1)];
|
||||
Th = ri[WS(is, 9)];
|
||||
Ti = Tg + Th;
|
||||
T11 = Tg - Th;
|
||||
TX = ii[WS(is, 1)];
|
||||
TY = ii[WS(is, 9)];
|
||||
TZ = TX - TY;
|
||||
T1V = TX + TY;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, T12, T13;
|
||||
Tj = ri[WS(is, 5)];
|
||||
Tk = ri[WS(is, 13)];
|
||||
Tl = Tj + Tk;
|
||||
TW = Tj - Tk;
|
||||
T12 = ii[WS(is, 5)];
|
||||
T13 = ii[WS(is, 13)];
|
||||
T14 = T12 - T13;
|
||||
T1W = T12 + T13;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T1X = T1V - T1W;
|
||||
T2g = T1V + T1W;
|
||||
T10 = TW + TZ;
|
||||
T15 = T11 - T14;
|
||||
T1B = TZ - TW;
|
||||
T1U = Ti - Tl;
|
||||
T1A = T11 + T14;
|
||||
}
|
||||
/* Outputs 8 and 0. */
{
|
||||
E Tf, Tu, T2j, T2k;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
ro[WS(os, 8)] = Tf - Tu;
|
||||
ro[0] = Tf + Tu;
|
||||
T2j = TC + TJ;
|
||||
T2k = T2g + T2h;
|
||||
io[WS(os, 8)] = T2j - T2k;
|
||||
io[0] = T2j + T2k;
|
||||
}
|
||||
/* Outputs 4 and 12. */
{
|
||||
E Tv, TK, T2f, T2i;
|
||||
Tv = Tt - Tm;
|
||||
TK = TC - TJ;
|
||||
io[WS(os, 4)] = Tv + TK;
|
||||
io[WS(os, 12)] = TK - Tv;
|
||||
T2f = T7 - Te;
|
||||
T2i = T2g - T2h;
|
||||
ro[WS(os, 12)] = T2f - T2i;
|
||||
ro[WS(os, 4)] = T2f + T2i;
|
||||
}
|
||||
/* ro outputs 10, 2 and io outputs 6, 14. */
{
|
||||
E T1T, T27, T24, T28, T1Y, T23;
|
||||
T1T = T1R + T1S;
|
||||
T27 = T25 - T26;
|
||||
T1Y = T1U + T1X;
|
||||
T23 = T1Z - T22;
|
||||
T24 = T1Y + T23;
|
||||
T28 = T23 - T1Y;
|
||||
ro[WS(os, 10)] = FNMS(KP707106781, T24, T1T);
|
||||
io[WS(os, 6)] = FMA(KP707106781, T28, T27);
|
||||
ro[WS(os, 2)] = FMA(KP707106781, T24, T1T);
|
||||
io[WS(os, 14)] = FNMS(KP707106781, T28, T27);
|
||||
}
|
||||
/* ro outputs 14, 6 and io outputs 2, 10. */
{
|
||||
E T29, T2d, T2c, T2e, T2a, T2b;
|
||||
T29 = T1R - T1S;
|
||||
T2d = T26 + T25;
|
||||
T2a = T1X - T1U;
|
||||
T2b = T1Z + T22;
|
||||
T2c = T2a - T2b;
|
||||
T2e = T2a + T2b;
|
||||
ro[WS(os, 14)] = FNMS(KP707106781, T2c, T29);
|
||||
io[WS(os, 2)] = FMA(KP707106781, T2e, T2d);
|
||||
ro[WS(os, 6)] = FMA(KP707106781, T2c, T29);
|
||||
io[WS(os, 10)] = FNMS(KP707106781, T2e, T2d);
|
||||
}
|
||||
/* Outputs 11, 3, 7, 15. */
{
|
||||
E TV, T1v, T1p, T1r, T1i, T1q, T1u, T1w, TU, T1o;
|
||||
TU = TQ - TT;
|
||||
TV = FMA(KP707106781, TU, TN);
|
||||
T1v = FNMS(KP707106781, TU, TN);
|
||||
T1o = T1m - T1n;
|
||||
T1p = FNMS(KP707106781, T1o, T1l);
|
||||
T1r = FMA(KP707106781, T1o, T1l);
|
||||
{
|
||||
E T16, T1h, T1s, T1t;
|
||||
T16 = FMA(KP414213562, T15, T10);
|
||||
T1h = FNMS(KP414213562, T1g, T1b);
|
||||
T1i = T16 - T1h;
|
||||
T1q = T16 + T1h;
|
||||
T1s = FMA(KP414213562, T1b, T1g);
|
||||
T1t = FNMS(KP414213562, T10, T15);
|
||||
T1u = T1s - T1t;
|
||||
T1w = T1t + T1s;
|
||||
}
|
||||
ro[WS(os, 11)] = FNMS(KP923879532, T1i, TV);
|
||||
io[WS(os, 11)] = FNMS(KP923879532, T1u, T1r);
|
||||
ro[WS(os, 3)] = FMA(KP923879532, T1i, TV);
|
||||
io[WS(os, 3)] = FMA(KP923879532, T1u, T1r);
|
||||
io[WS(os, 7)] = FNMS(KP923879532, T1q, T1p);
|
||||
ro[WS(os, 7)] = FNMS(KP923879532, T1w, T1v);
|
||||
io[WS(os, 15)] = FMA(KP923879532, T1q, T1p);
|
||||
ro[WS(os, 15)] = FMA(KP923879532, T1w, T1v);
|
||||
}
|
||||
/* Outputs 9, 1, 13, 5. */
{
|
||||
E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
|
||||
T1y = T1n + T1m;
|
||||
T1z = FMA(KP707106781, T1y, T1x);
|
||||
T1L = FNMS(KP707106781, T1y, T1x);
|
||||
T1I = TQ + TT;
|
||||
T1J = FNMS(KP707106781, T1I, T1H);
|
||||
T1P = FMA(KP707106781, T1I, T1H);
|
||||
{
|
||||
E T1C, T1F, T1M, T1N;
|
||||
T1C = FMA(KP414213562, T1B, T1A);
|
||||
T1F = FNMS(KP414213562, T1E, T1D);
|
||||
T1G = T1C + T1F;
|
||||
T1K = T1F - T1C;
|
||||
T1M = FNMS(KP414213562, T1A, T1B);
|
||||
T1N = FMA(KP414213562, T1D, T1E);
|
||||
T1O = T1M - T1N;
|
||||
T1Q = T1M + T1N;
|
||||
}
|
||||
ro[WS(os, 9)] = FNMS(KP923879532, T1G, T1z);
|
||||
io[WS(os, 9)] = FNMS(KP923879532, T1Q, T1P);
|
||||
ro[WS(os, 1)] = FMA(KP923879532, T1G, T1z);
|
||||
io[WS(os, 1)] = FMA(KP923879532, T1Q, T1P);
|
||||
io[WS(os, 13)] = FNMS(KP923879532, T1K, T1J);
|
||||
ro[WS(os, 13)] = FNMS(KP923879532, T1O, T1L);
|
||||
io[WS(os, 5)] = FMA(KP923879532, T1K, T1J);
|
||||
ro[WS(os, 5)] = FMA(KP923879532, T1O, T1L);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_register) together with n1_16.
 * 16 matches the size in the codelet name; { 104, 0, 40, 0 } are presumably
 * the add / multiply / fused multiply-add counts reported in this variant's
 * generator comment -- verify.  The meaning of the remaining zero fields is
 * not visible here; see the kdft_desc declaration.
 */
static const kdft_desc desc = { 16, "n1_16", { 104, 0, 40, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the n1_16 kernel and its descriptor
 * to X(kdft_register) for planner p.
 */
void X(codelet_n1_16)(planner *p)
{
     X(kdft_register)(p, n1_16, &desc);
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 144 FP additions, 24 FP multiplications,
|
||||
* (or, 136 additions, 16 multiplications, 8 fused multiply/add),
|
||||
* 50 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_16: 16-point transform kernel, non-FMA variant (emitted by gen_notw
 * with -n 16, see the generator comment above).
 *
 * ri, ii   - inputs; elements 0 and WS(is, 1) .. WS(is, 15) of each are read
 *            (presumably the real and imaginary parts -- confirm in n.h).
 * ro, io   - outputs; elements 0 and WS(os, 1) .. WS(os, 15) of each are
 *            written.
 * is, os   - strides passed to WS() to index the inputs and outputs.
 * v        - number of loop iterations performed per call.
 * ivs, ovs - amounts added to ri/ii and to ro/io after every iteration.
 *
 * Within one iteration every input element is loaded before the first
 * output element is stored.
 * Constants: KP707106781 ~ sqrt(2)/2, KP923879532 ~ cos(pi/8),
 * KP382683432 ~ sin(pi/8).
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
|
||||
E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
|
||||
E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
|
||||
E T1U, T1A;
|
||||
{
|
||||
E T3, TL, Ty, T1k, T6, T1j, TB, TM;
|
||||
{
|
||||
E T1, T2, Tw, Tx;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 8)];
|
||||
T3 = T1 + T2;
|
||||
TL = T1 - T2;
|
||||
Tw = ii[0];
|
||||
Tx = ii[WS(is, 8)];
|
||||
Ty = Tw + Tx;
|
||||
T1k = Tw - Tx;
|
||||
}
|
||||
{
|
||||
E T4, T5, Tz, TA;
|
||||
T4 = ri[WS(is, 4)];
|
||||
T5 = ri[WS(is, 12)];
|
||||
T6 = T4 + T5;
|
||||
T1j = T4 - T5;
|
||||
Tz = ii[WS(is, 4)];
|
||||
TA = ii[WS(is, 12)];
|
||||
TB = Tz + TA;
|
||||
TM = Tz - TA;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1R = T3 - T6;
|
||||
T25 = Ty - TB;
|
||||
TC = Ty + TB;
|
||||
TN = TL - TM;
|
||||
T1x = TL + TM;
|
||||
T1H = T1k - T1j;
|
||||
T1l = T1j + T1k;
|
||||
}
|
||||
{
|
||||
E Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
|
||||
{
|
||||
E Tn, To, T1d, T1e;
|
||||
Tn = ri[WS(is, 15)];
|
||||
To = ri[WS(is, 7)];
|
||||
Tp = Tn + To;
|
||||
T17 = Tn - To;
|
||||
T1d = ii[WS(is, 15)];
|
||||
T1e = ii[WS(is, 7)];
|
||||
T1f = T1d - T1e;
|
||||
T20 = T1d + T1e;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T18, T19;
|
||||
Tq = ri[WS(is, 3)];
|
||||
Tr = ri[WS(is, 11)];
|
||||
Ts = Tq + Tr;
|
||||
T1c = Tq - Tr;
|
||||
T18 = ii[WS(is, 3)];
|
||||
T19 = ii[WS(is, 11)];
|
||||
T1a = T18 - T19;
|
||||
T21 = T18 + T19;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T22 = T20 - T21;
|
||||
T2h = T20 + T21;
|
||||
T1b = T17 - T1a;
|
||||
T1g = T1c + T1f;
|
||||
T1E = T1f - T1c;
|
||||
T1Z = Tp - Ts;
|
||||
T1D = T17 + T1a;
|
||||
}
|
||||
{
|
||||
E Ta, TP, TF, TO, Td, TR, TI, TS;
|
||||
{
|
||||
E T8, T9, TD, TE;
|
||||
T8 = ri[WS(is, 2)];
|
||||
T9 = ri[WS(is, 10)];
|
||||
Ta = T8 + T9;
|
||||
TP = T8 - T9;
|
||||
TD = ii[WS(is, 2)];
|
||||
TE = ii[WS(is, 10)];
|
||||
TF = TD + TE;
|
||||
TO = TD - TE;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TG, TH;
|
||||
Tb = ri[WS(is, 14)];
|
||||
Tc = ri[WS(is, 6)];
|
||||
Td = Tb + Tc;
|
||||
TR = Tb - Tc;
|
||||
TG = ii[WS(is, 14)];
|
||||
TH = ii[WS(is, 6)];
|
||||
TI = TG + TH;
|
||||
TS = TG - TH;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T1S = TF - TI;
|
||||
T26 = Td - Ta;
|
||||
TJ = TF + TI;
|
||||
TQ = TO - TP;
|
||||
T1m = TR - TS;
|
||||
T1n = TP + TO;
|
||||
TT = TR + TS;
|
||||
}
|
||||
{
|
||||
E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
|
||||
{
|
||||
E Tg, Th, TX, TY;
|
||||
Tg = ri[WS(is, 1)];
|
||||
Th = ri[WS(is, 9)];
|
||||
Ti = Tg + Th;
|
||||
T11 = Tg - Th;
|
||||
TX = ii[WS(is, 1)];
|
||||
TY = ii[WS(is, 9)];
|
||||
TZ = TX - TY;
|
||||
T1V = TX + TY;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, T12, T13;
|
||||
Tj = ri[WS(is, 5)];
|
||||
Tk = ri[WS(is, 13)];
|
||||
Tl = Tj + Tk;
|
||||
TW = Tj - Tk;
|
||||
T12 = ii[WS(is, 5)];
|
||||
T13 = ii[WS(is, 13)];
|
||||
T14 = T12 - T13;
|
||||
T1W = T12 + T13;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T1X = T1V - T1W;
|
||||
T2g = T1V + T1W;
|
||||
T10 = TW + TZ;
|
||||
T15 = T11 - T14;
|
||||
T1B = T11 + T14;
|
||||
T1U = Ti - Tl;
|
||||
T1A = TZ - TW;
|
||||
}
|
||||
{
|
||||
E Tf, Tu, T2j, T2k;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
ro[WS(os, 8)] = Tf - Tu;
|
||||
ro[0] = Tf + Tu;
|
||||
T2j = TC + TJ;
|
||||
T2k = T2g + T2h;
|
||||
io[WS(os, 8)] = T2j - T2k;
|
||||
io[0] = T2j + T2k;
|
||||
}
|
||||
{
|
||||
E Tv, TK, T2f, T2i;
|
||||
Tv = Tt - Tm;
|
||||
TK = TC - TJ;
|
||||
io[WS(os, 4)] = Tv + TK;
|
||||
io[WS(os, 12)] = TK - Tv;
|
||||
T2f = T7 - Te;
|
||||
T2i = T2g - T2h;
|
||||
ro[WS(os, 12)] = T2f - T2i;
|
||||
ro[WS(os, 4)] = T2f + T2i;
|
||||
}
|
||||
{
|
||||
E T1T, T27, T24, T28, T1Y, T23;
|
||||
T1T = T1R + T1S;
|
||||
T27 = T25 - T26;
|
||||
T1Y = T1U + T1X;
|
||||
T23 = T1Z - T22;
|
||||
T24 = KP707106781 * (T1Y + T23);
|
||||
T28 = KP707106781 * (T23 - T1Y);
|
||||
ro[WS(os, 10)] = T1T - T24;
|
||||
io[WS(os, 6)] = T27 + T28;
|
||||
ro[WS(os, 2)] = T1T + T24;
|
||||
io[WS(os, 14)] = T27 - T28;
|
||||
}
|
||||
{
|
||||
E T29, T2d, T2c, T2e, T2a, T2b;
|
||||
T29 = T1R - T1S;
|
||||
T2d = T26 + T25;
|
||||
T2a = T1X - T1U;
|
||||
T2b = T1Z + T22;
|
||||
T2c = KP707106781 * (T2a - T2b);
|
||||
T2e = KP707106781 * (T2a + T2b);
|
||||
ro[WS(os, 14)] = T29 - T2c;
|
||||
io[WS(os, 2)] = T2d + T2e;
|
||||
ro[WS(os, 6)] = T29 + T2c;
|
||||
io[WS(os, 10)] = T2d - T2e;
|
||||
}
|
||||
{
|
||||
E TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
|
||||
TU = KP707106781 * (TQ - TT);
|
||||
TV = TN + TU;
|
||||
T1r = TN - TU;
|
||||
T1o = KP707106781 * (T1m - T1n);
|
||||
T1p = T1l - T1o;
|
||||
T1v = T1l + T1o;
|
||||
{
|
||||
E T16, T1h, T1s, T1t;
|
||||
T16 = FMA(KP923879532, T10, KP382683432 * T15);
|
||||
T1h = FNMS(KP923879532, T1g, KP382683432 * T1b);
|
||||
T1i = T16 + T1h;
|
||||
T1q = T1h - T16;
|
||||
T1s = FNMS(KP923879532, T15, KP382683432 * T10);
|
||||
T1t = FMA(KP382683432, T1g, KP923879532 * T1b);
|
||||
T1u = T1s - T1t;
|
||||
T1w = T1s + T1t;
|
||||
}
|
||||
ro[WS(os, 11)] = TV - T1i;
|
||||
io[WS(os, 11)] = T1v - T1w;
|
||||
ro[WS(os, 3)] = TV + T1i;
|
||||
io[WS(os, 3)] = T1v + T1w;
|
||||
io[WS(os, 15)] = T1p - T1q;
|
||||
ro[WS(os, 15)] = T1r - T1u;
|
||||
io[WS(os, 7)] = T1p + T1q;
|
||||
ro[WS(os, 7)] = T1r + T1u;
|
||||
}
|
||||
{
|
||||
E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
|
||||
T1y = KP707106781 * (T1n + T1m);
|
||||
T1z = T1x + T1y;
|
||||
T1L = T1x - T1y;
|
||||
T1I = KP707106781 * (TQ + TT);
|
||||
T1J = T1H - T1I;
|
||||
T1P = T1H + T1I;
|
||||
{
|
||||
E T1C, T1F, T1M, T1N;
|
||||
T1C = FMA(KP382683432, T1A, KP923879532 * T1B);
|
||||
T1F = FNMS(KP382683432, T1E, KP923879532 * T1D);
|
||||
T1G = T1C + T1F;
|
||||
T1K = T1F - T1C;
|
||||
T1M = FNMS(KP382683432, T1B, KP923879532 * T1A);
|
||||
T1N = FMA(KP923879532, T1E, KP382683432 * T1D);
|
||||
T1O = T1M - T1N;
|
||||
T1Q = T1M + T1N;
|
||||
}
|
||||
ro[WS(os, 9)] = T1z - T1G;
|
||||
io[WS(os, 9)] = T1P - T1Q;
|
||||
ro[WS(os, 1)] = T1z + T1G;
|
||||
io[WS(os, 1)] = T1P + T1Q;
|
||||
io[WS(os, 13)] = T1J - T1K;
|
||||
ro[WS(os, 13)] = T1L - T1O;
|
||||
io[WS(os, 5)] = T1J + T1K;
|
||||
ro[WS(os, 5)] = T1L + T1O;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_register) together with n1_16.
 * 16 matches the generator's -n 16 argument; { 136, 16, 8, ... } repeats the
 * 136 additions / 16 multiplications / 8 fused multiply-adds listed in the
 * generator comment above.  The meaning of the fourth count and of the
 * trailing zero fields is not visible here; see the kdft_desc declaration.
 */
static const kdft_desc desc = { 16, "n1_16", { 136, 16, 8, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the n1_16 kernel and its descriptor
 * to X(kdft_register) for planner p.
 */
void X(codelet_n1_16)(planner *p)
{
     X(kdft_register)(p, n1_16, &desc);
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 4 FP additions, 0 FP multiplications,
|
||||
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 5 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_2: 2-point transform kernel (FMA-preferring build).
 *
 * For each of the v iterations, and separately for the ri -> ro and
 * ii -> io pairs, the sum of the two inputs is stored at index 0 and
 * their difference (first minus second) at WS(os, 1).  After every
 * iteration ri/ii advance by ivs and ro/io advance by ovs.
 */
static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT i;

     for (i = v; i > 0;
          --i, ri += ivs, ii += ivs, ro += ovs, io += ovs,
          MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
          E a0, a1, b0, b1;

          /* both ri elements are loaded before either ro element is stored */
          a0 = ri[0];
          a1 = ri[WS(is, 1)];
          ro[WS(os, 1)] = a0 - a1;
          ro[0] = a0 + a1;

          /* same butterfly on the ii -> io pair */
          b0 = ii[0];
          b1 = ii[WS(is, 1)];
          io[WS(os, 1)] = b0 - b1;
          io[0] = b0 + b1;
     }
}
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_register) together with n1_2.
 * 2 matches the generator's -n 2 argument; { 4, 0, 0, ... } repeats the
 * 4 additions / 0 multiplications / 0 fused multiply-adds listed in the
 * generator comment above.  The meaning of the fourth count and of the
 * trailing zero fields is not visible here; see the kdft_desc declaration.
 */
static const kdft_desc desc = { 2, "n1_2", { 4, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the n1_2 kernel and its descriptor
 * to X(kdft_register) for planner p.
 */
void X(codelet_n1_2)(planner *p)
{
     X(kdft_register)(p, n1_2, &desc);
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 4 FP additions, 0 FP multiplications,
|
||||
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 5 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_2: 2-point transform kernel (build without FMA preference).
 *
 * For each of the v iterations, and separately for the ri -> ro and
 * ii -> io pairs, the sum of the two inputs is stored at index 0 and
 * their difference (first minus second) at WS(os, 1).  After every
 * iteration ri/ii advance by ivs and ro/io advance by ovs.
 */
static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT i;

     for (i = v; i > 0;
          --i, ri += ivs, ii += ivs, ro += ovs, io += ovs,
          MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
          E a0, a1, b0, b1;

          /* both ri elements are loaded before either ro element is stored */
          a0 = ri[0];
          a1 = ri[WS(is, 1)];
          ro[WS(os, 1)] = a0 - a1;
          ro[0] = a0 + a1;

          /* same butterfly on the ii -> io pair */
          b0 = ii[0];
          b1 = ii[WS(is, 1)];
          io[WS(os, 1)] = b0 - b1;
          io[0] = b0 + b1;
     }
}
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_register) together with n1_2.
 * 2 matches the generator's -n 2 argument; { 4, 0, 0, ... } repeats the
 * 4 additions / 0 multiplications / 0 fused multiply-adds listed in the
 * generator comment above.  The meaning of the fourth count and of the
 * trailing zero fields is not visible here; see the kdft_desc declaration.
 */
static const kdft_desc desc = { 2, "n1_2", { 4, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the n1_2 kernel and its descriptor
 * to X(kdft_register) for planner p.
 */
void X(codelet_n1_2)(planner *p)
{
     X(kdft_register)(p, n1_2, &desc);
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,718 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 208 FP additions, 72 FP multiplications,
|
||||
* (or, 136 additions, 0 multiplications, 72 fused multiply/add),
|
||||
* 81 stack variables, 4 constants, and 80 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_20: 20-point transform kernel, FMA variant (emitted by gen_notw with
 * -fma -n 20, see the generator comment above).
 *
 * ri, ii   - inputs; elements 0 and WS(is, 1) .. WS(is, 19) of each are read
 *            (presumably the real and imaginary parts -- confirm in n.h).
 * ro, io   - outputs; elements 0 and WS(os, 1) .. WS(os, 19) of each are
 *            written.
 * is, os   - strides passed to WS() to index the inputs and outputs.
 * v        - number of loop iterations performed per call.
 * ivs, ovs - amounts added to ri/ii and to ro/io after every iteration.
 *
 * Within one iteration every input element is loaded before the first
 * output element is stored.  Every multiplication by a constant appears
 * inside an FMA()/FNMS() invocation.
 * Constants: KP951056516 ~ sin(2*pi/5), KP559016994 ~ sqrt(5)/4,
 * KP618033988 ~ (sqrt(5) - 1)/2, KP250000000 = 1/4.
 */
static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
|
||||
E T7, T2N, T3b, TD, TP, T1R, T2f, T1d, Tt, TA, TB, T2w, T2z, T2P, T35;
|
||||
E T36, T3d, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1T, T29, T2a, T2h, T1h;
|
||||
E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2O, T32, T33, T3c, TE, TF, TG, TU;
|
||||
E TZ, T10, T1D, T1I, T1S, T26, T27, T2g, T1e, T1f, T1g;
|
||||
{
|
||||
E T3, T1N, TN, T2L, T6, TO, T1Q, T2M;
|
||||
{
|
||||
E T1, T2, TL, TM;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 10)];
|
||||
T3 = T1 + T2;
|
||||
T1N = T1 - T2;
|
||||
TL = ii[0];
|
||||
TM = ii[WS(is, 10)];
|
||||
TN = TL - TM;
|
||||
T2L = TL + TM;
|
||||
}
|
||||
{
|
||||
E T4, T5, T1O, T1P;
|
||||
T4 = ri[WS(is, 5)];
|
||||
T5 = ri[WS(is, 15)];
|
||||
T6 = T4 + T5;
|
||||
TO = T4 - T5;
|
||||
T1O = ii[WS(is, 5)];
|
||||
T1P = ii[WS(is, 15)];
|
||||
T1Q = T1O - T1P;
|
||||
T2M = T1O + T1P;
|
||||
}
|
||||
T7 = T3 - T6;
|
||||
T2N = T2L - T2M;
|
||||
T3b = T2L + T2M;
|
||||
TD = T3 + T6;
|
||||
TP = TN - TO;
|
||||
T1R = T1N - T1Q;
|
||||
T2f = T1N + T1Q;
|
||||
T1d = TO + TN;
|
||||
}
|
||||
{
|
||||
E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
|
||||
E T2y;
|
||||
{
|
||||
E Tn, To, T11, T12;
|
||||
Tn = ri[WS(is, 8)];
|
||||
To = ri[WS(is, 18)];
|
||||
Tp = Tn + To;
|
||||
T1o = Tn - To;
|
||||
T11 = ii[WS(is, 8)];
|
||||
T12 = ii[WS(is, 18)];
|
||||
T13 = T11 - T12;
|
||||
T2u = T11 + T12;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T1p, T1q;
|
||||
Tq = ri[WS(is, 13)];
|
||||
Tr = ri[WS(is, 3)];
|
||||
Ts = Tq + Tr;
|
||||
T14 = Tq - Tr;
|
||||
T1p = ii[WS(is, 13)];
|
||||
T1q = ii[WS(is, 3)];
|
||||
T1r = T1p - T1q;
|
||||
T2v = T1p + T1q;
|
||||
}
|
||||
{
|
||||
E Tu, Tv, T16, T17;
|
||||
Tu = ri[WS(is, 12)];
|
||||
Tv = ri[WS(is, 2)];
|
||||
Tw = Tu + Tv;
|
||||
T1t = Tu - Tv;
|
||||
T16 = ii[WS(is, 12)];
|
||||
T17 = ii[WS(is, 2)];
|
||||
T18 = T16 - T17;
|
||||
T2x = T16 + T17;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, T1u, T1v;
|
||||
Tx = ri[WS(is, 17)];
|
||||
Ty = ri[WS(is, 7)];
|
||||
Tz = Tx + Ty;
|
||||
T19 = Tx - Ty;
|
||||
T1u = ii[WS(is, 17)];
|
||||
T1v = ii[WS(is, 7)];
|
||||
T1w = T1u - T1v;
|
||||
T2y = T1u + T1v;
|
||||
}
|
||||
Tt = Tp - Ts;
|
||||
TA = Tw - Tz;
|
||||
TB = Tt + TA;
|
||||
T2w = T2u - T2v;
|
||||
T2z = T2x - T2y;
|
||||
T2P = T2w + T2z;
|
||||
T35 = T2u + T2v;
|
||||
T36 = T2x + T2y;
|
||||
T3d = T35 + T36;
|
||||
TH = Tp + Ts;
|
||||
TI = Tw + Tz;
|
||||
TJ = TH + TI;
|
||||
T15 = T13 - T14;
|
||||
T1a = T18 - T19;
|
||||
T1b = T15 + T1a;
|
||||
T1s = T1o - T1r;
|
||||
T1x = T1t - T1w;
|
||||
T1T = T1s + T1x;
|
||||
T29 = T1o + T1r;
|
||||
T2a = T1t + T1w;
|
||||
T2h = T29 + T2a;
|
||||
T1h = T14 + T13;
|
||||
T1i = T19 + T18;
|
||||
T1j = T1h + T1i;
|
||||
}
|
||||
{
|
||||
E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
|
||||
E T2F;
|
||||
{
|
||||
E T8, T9, TQ, TR;
|
||||
T8 = ri[WS(is, 4)];
|
||||
T9 = ri[WS(is, 14)];
|
||||
Ta = T8 + T9;
|
||||
T1z = T8 - T9;
|
||||
TQ = ii[WS(is, 4)];
|
||||
TR = ii[WS(is, 14)];
|
||||
TS = TQ - TR;
|
||||
T2B = TQ + TR;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, T1A, T1B;
|
||||
Tb = ri[WS(is, 9)];
|
||||
Tc = ri[WS(is, 19)];
|
||||
Td = Tb + Tc;
|
||||
TT = Tb - Tc;
|
||||
T1A = ii[WS(is, 9)];
|
||||
T1B = ii[WS(is, 19)];
|
||||
T1C = T1A - T1B;
|
||||
T2C = T1A + T1B;
|
||||
}
|
||||
{
|
||||
E Tf, Tg, TV, TW;
|
||||
Tf = ri[WS(is, 16)];
|
||||
Tg = ri[WS(is, 6)];
|
||||
Th = Tf + Tg;
|
||||
T1E = Tf - Tg;
|
||||
TV = ii[WS(is, 16)];
|
||||
TW = ii[WS(is, 6)];
|
||||
TX = TV - TW;
|
||||
T2E = TV + TW;
|
||||
}
|
||||
{
|
||||
E Ti, Tj, T1F, T1G;
|
||||
Ti = ri[WS(is, 1)];
|
||||
Tj = ri[WS(is, 11)];
|
||||
Tk = Ti + Tj;
|
||||
TY = Ti - Tj;
|
||||
T1F = ii[WS(is, 1)];
|
||||
T1G = ii[WS(is, 11)];
|
||||
T1H = T1F - T1G;
|
||||
T2F = T1F + T1G;
|
||||
}
|
||||
Te = Ta - Td;
|
||||
Tl = Th - Tk;
|
||||
Tm = Te + Tl;
|
||||
T2D = T2B - T2C;
|
||||
T2G = T2E - T2F;
|
||||
T2O = T2D + T2G;
|
||||
T32 = T2B + T2C;
|
||||
T33 = T2E + T2F;
|
||||
T3c = T32 + T33;
|
||||
TE = Ta + Td;
|
||||
TF = Th + Tk;
|
||||
TG = TE + TF;
|
||||
TU = TS - TT;
|
||||
TZ = TX - TY;
|
||||
T10 = TU + TZ;
|
||||
T1D = T1z - T1C;
|
||||
T1I = T1E - T1H;
|
||||
T1S = T1D + T1I;
|
||||
T26 = T1z + T1C;
|
||||
T27 = T1E + T1H;
|
||||
T2g = T26 + T27;
|
||||
T1e = TT + TS;
|
||||
T1f = TY + TX;
|
||||
T1g = T1e + T1f;
|
||||
}
|
||||
{
|
||||
E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
|
||||
T2s = Tm - TB;
|
||||
TC = Tm + TB;
|
||||
T2r = FNMS(KP250000000, TC, T7);
|
||||
T2A = T2w - T2z;
|
||||
T2H = T2D - T2G;
|
||||
T2I = FNMS(KP618033988, T2H, T2A);
|
||||
T2K = FMA(KP618033988, T2A, T2H);
|
||||
ro[WS(os, 10)] = T7 + TC;
|
||||
T2J = FMA(KP559016994, T2s, T2r);
|
||||
ro[WS(os, 14)] = FNMS(KP951056516, T2K, T2J);
|
||||
ro[WS(os, 6)] = FMA(KP951056516, T2K, T2J);
|
||||
T2t = FNMS(KP559016994, T2s, T2r);
|
||||
ro[WS(os, 2)] = FNMS(KP951056516, T2I, T2t);
|
||||
ro[WS(os, 18)] = FMA(KP951056516, T2I, T2t);
|
||||
}
|
||||
{
|
||||
E T2S, T2Q, T2R, T2W, T2Y, T2U, T2V, T2X, T2T;
|
||||
T2S = T2O - T2P;
|
||||
T2Q = T2O + T2P;
|
||||
T2R = FNMS(KP250000000, T2Q, T2N);
|
||||
T2U = Tt - TA;
|
||||
T2V = Te - Tl;
|
||||
T2W = FNMS(KP618033988, T2V, T2U);
|
||||
T2Y = FMA(KP618033988, T2U, T2V);
|
||||
io[WS(os, 10)] = T2N + T2Q;
|
||||
T2X = FMA(KP559016994, T2S, T2R);
|
||||
io[WS(os, 6)] = FNMS(KP951056516, T2Y, T2X);
|
||||
io[WS(os, 14)] = FMA(KP951056516, T2Y, T2X);
|
||||
T2T = FNMS(KP559016994, T2S, T2R);
|
||||
io[WS(os, 2)] = FMA(KP951056516, T2W, T2T);
|
||||
io[WS(os, 18)] = FNMS(KP951056516, T2W, T2T);
|
||||
}
|
||||
{
|
||||
E T30, TK, T2Z, T38, T3a, T34, T37, T39, T31;
|
||||
T30 = TG - TJ;
|
||||
TK = TG + TJ;
|
||||
T2Z = FNMS(KP250000000, TK, TD);
|
||||
T34 = T32 - T33;
|
||||
T37 = T35 - T36;
|
||||
T38 = FMA(KP618033988, T37, T34);
|
||||
T3a = FNMS(KP618033988, T34, T37);
|
||||
ro[0] = TD + TK;
|
||||
T39 = FNMS(KP559016994, T30, T2Z);
|
||||
ro[WS(os, 12)] = FNMS(KP951056516, T3a, T39);
|
||||
ro[WS(os, 8)] = FMA(KP951056516, T3a, T39);
|
||||
T31 = FMA(KP559016994, T30, T2Z);
|
||||
ro[WS(os, 4)] = FNMS(KP951056516, T38, T31);
|
||||
ro[WS(os, 16)] = FMA(KP951056516, T38, T31);
|
||||
}
|
||||
{
|
||||
E T3g, T3e, T3f, T3k, T3m, T3i, T3j, T3l, T3h;
|
||||
T3g = T3c - T3d;
|
||||
T3e = T3c + T3d;
|
||||
T3f = FNMS(KP250000000, T3e, T3b);
|
||||
T3i = TE - TF;
|
||||
T3j = TH - TI;
|
||||
T3k = FMA(KP618033988, T3j, T3i);
|
||||
T3m = FNMS(KP618033988, T3i, T3j);
|
||||
io[0] = T3b + T3e;
|
||||
T3l = FNMS(KP559016994, T3g, T3f);
|
||||
io[WS(os, 8)] = FNMS(KP951056516, T3m, T3l);
|
||||
io[WS(os, 12)] = FMA(KP951056516, T3m, T3l);
|
||||
T3h = FMA(KP559016994, T3g, T3f);
|
||||
io[WS(os, 4)] = FMA(KP951056516, T3k, T3h);
|
||||
io[WS(os, 16)] = FNMS(KP951056516, T3k, T3h);
|
||||
}
|
||||
{
|
||||
E T24, T1c, T23, T2c, T2e, T28, T2b, T2d, T25;
|
||||
T24 = T10 - T1b;
|
||||
T1c = T10 + T1b;
|
||||
T23 = FNMS(KP250000000, T1c, TP);
|
||||
T28 = T26 - T27;
|
||||
T2b = T29 - T2a;
|
||||
T2c = FMA(KP618033988, T2b, T28);
|
||||
T2e = FNMS(KP618033988, T28, T2b);
|
||||
io[WS(os, 5)] = TP + T1c;
|
||||
T2d = FNMS(KP559016994, T24, T23);
|
||||
io[WS(os, 13)] = FNMS(KP951056516, T2e, T2d);
|
||||
io[WS(os, 17)] = FMA(KP951056516, T2e, T2d);
|
||||
T25 = FMA(KP559016994, T24, T23);
|
||||
io[WS(os, 1)] = FNMS(KP951056516, T2c, T25);
|
||||
io[WS(os, 9)] = FMA(KP951056516, T2c, T25);
|
||||
}
|
||||
{
|
||||
E T2k, T2i, T2j, T2o, T2q, T2m, T2n, T2p, T2l;
|
||||
T2k = T2g - T2h;
|
||||
T2i = T2g + T2h;
|
||||
T2j = FNMS(KP250000000, T2i, T2f);
|
||||
T2m = TU - TZ;
|
||||
T2n = T15 - T1a;
|
||||
T2o = FMA(KP618033988, T2n, T2m);
|
||||
T2q = FNMS(KP618033988, T2m, T2n);
|
||||
ro[WS(os, 5)] = T2f + T2i;
|
||||
T2p = FNMS(KP559016994, T2k, T2j);
|
||||
ro[WS(os, 13)] = FMA(KP951056516, T2q, T2p);
|
||||
ro[WS(os, 17)] = FNMS(KP951056516, T2q, T2p);
|
||||
T2l = FMA(KP559016994, T2k, T2j);
|
||||
ro[WS(os, 1)] = FMA(KP951056516, T2o, T2l);
|
||||
ro[WS(os, 9)] = FNMS(KP951056516, T2o, T2l);
|
||||
}
|
||||
{
|
||||
E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
|
||||
T1m = T1g - T1j;
|
||||
T1k = T1g + T1j;
|
||||
T1l = FNMS(KP250000000, T1k, T1d);
|
||||
T1y = T1s - T1x;
|
||||
T1J = T1D - T1I;
|
||||
T1K = FNMS(KP618033988, T1J, T1y);
|
||||
T1M = FMA(KP618033988, T1y, T1J);
|
||||
io[WS(os, 15)] = T1d + T1k;
|
||||
T1L = FMA(KP559016994, T1m, T1l);
|
||||
io[WS(os, 11)] = FNMS(KP951056516, T1M, T1L);
|
||||
io[WS(os, 19)] = FMA(KP951056516, T1M, T1L);
|
||||
T1n = FNMS(KP559016994, T1m, T1l);
|
||||
io[WS(os, 3)] = FNMS(KP951056516, T1K, T1n);
|
||||
io[WS(os, 7)] = FMA(KP951056516, T1K, T1n);
|
||||
}
|
||||
{
|
||||
E T1W, T1U, T1V, T20, T22, T1Y, T1Z, T21, T1X;
|
||||
T1W = T1S - T1T;
|
||||
T1U = T1S + T1T;
|
||||
T1V = FNMS(KP250000000, T1U, T1R);
|
||||
T1Y = T1h - T1i;
|
||||
T1Z = T1e - T1f;
|
||||
T20 = FNMS(KP618033988, T1Z, T1Y);
|
||||
T22 = FMA(KP618033988, T1Y, T1Z);
|
||||
ro[WS(os, 15)] = T1R + T1U;
|
||||
T21 = FMA(KP559016994, T1W, T1V);
|
||||
ro[WS(os, 11)] = FMA(KP951056516, T22, T21);
|
||||
ro[WS(os, 19)] = FNMS(KP951056516, T22, T21);
|
||||
T1X = FNMS(KP559016994, T1W, T1V);
|
||||
ro[WS(os, 3)] = FMA(KP951056516, T20, T1X);
|
||||
ro[WS(os, 7)] = FNMS(KP951056516, T20, T1X);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_register) together with n1_20.
 * 20 matches the generator's -n 20 argument; { 136, 0, 72, ... } repeats the
 * 136 additions / 0 multiplications / 72 fused multiply-adds listed in the
 * generator comment above.  The meaning of the fourth count and of the
 * trailing zero fields is not visible here; see the kdft_desc declaration.
 */
static const kdft_desc desc = { 20, "n1_20", { 136, 0, 72, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the n1_20 kernel and its descriptor
 * to X(kdft_register) for planner p.
 */
void X(codelet_n1_20)(planner *p)
{
     X(kdft_register)(p, n1_20, &desc);
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 208 FP additions, 48 FP multiplications,
|
||||
* (or, 184 additions, 24 multiplications, 24 fused multiply/add),
|
||||
* 81 stack variables, 4 constants, and 80 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_20: 20-point transform kernel, non-FMA variant (emitted by gen_notw
 * with -n 20, see the generator comment above).
 *
 * ri, ii   - inputs; elements 0 and WS(is, 1) .. WS(is, 19) of each are read
 *            (presumably the real and imaginary parts -- confirm in n.h).
 * ro, io   - outputs; elements 0 and WS(os, 1) .. WS(os, 19) of each are
 *            written.
 * is, os   - strides passed to WS() to index the inputs and outputs.
 * v        - number of loop iterations performed per call.
 * ivs, ovs - amounts added to ri/ii and to ro/io after every iteration.
 *
 * Within one iteration every input element is loaded before the first
 * output element is stored.
 * Constants: KP587785252 ~ sin(pi/5), KP951056516 ~ sin(2*pi/5),
 * KP250000000 = 1/4, KP559016994 ~ sqrt(5)/4.
 */
static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
|
||||
E T7, T2Q, T3h, TD, TP, T1U, T2l, T1d, Tt, TA, TB, T2w, T2z, T2S, T35;
|
||||
E T36, T3f, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1W, T29, T2a, T2j, T1h;
|
||||
E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2R, T32, T33, T3e, TE, TF, TG, TU;
|
||||
E TZ, T10, T1D, T1I, T1V, T26, T27, T2i, T1e, T1f, T1g;
|
||||
{
|
||||
E T3, T1Q, TN, T2O, T6, TO, T1T, T2P;
|
||||
{
|
||||
E T1, T2, TL, TM;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 10)];
|
||||
T3 = T1 + T2;
|
||||
T1Q = T1 - T2;
|
||||
TL = ii[0];
|
||||
TM = ii[WS(is, 10)];
|
||||
TN = TL - TM;
|
||||
T2O = TL + TM;
|
||||
}
|
||||
{
|
||||
E T4, T5, T1R, T1S;
|
||||
T4 = ri[WS(is, 5)];
|
||||
T5 = ri[WS(is, 15)];
|
||||
T6 = T4 + T5;
|
||||
TO = T4 - T5;
|
||||
T1R = ii[WS(is, 5)];
|
||||
T1S = ii[WS(is, 15)];
|
||||
T1T = T1R - T1S;
|
||||
T2P = T1R + T1S;
|
||||
}
|
||||
T7 = T3 - T6;
|
||||
T2Q = T2O - T2P;
|
||||
T3h = T2O + T2P;
|
||||
TD = T3 + T6;
|
||||
TP = TN - TO;
|
||||
T1U = T1Q - T1T;
|
||||
T2l = T1Q + T1T;
|
||||
T1d = TO + TN;
|
||||
}
|
||||
{
|
||||
E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
|
||||
E T2y;
|
||||
{
|
||||
E Tn, To, T11, T12;
|
||||
Tn = ri[WS(is, 8)];
|
||||
To = ri[WS(is, 18)];
|
||||
Tp = Tn + To;
|
||||
T1o = Tn - To;
|
||||
T11 = ii[WS(is, 8)];
|
||||
T12 = ii[WS(is, 18)];
|
||||
T13 = T11 - T12;
|
||||
T2u = T11 + T12;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T1p, T1q;
|
||||
Tq = ri[WS(is, 13)];
|
||||
Tr = ri[WS(is, 3)];
|
||||
Ts = Tq + Tr;
|
||||
T14 = Tq - Tr;
|
||||
T1p = ii[WS(is, 13)];
|
||||
T1q = ii[WS(is, 3)];
|
||||
T1r = T1p - T1q;
|
||||
T2v = T1p + T1q;
|
||||
}
|
||||
{
|
||||
E Tu, Tv, T16, T17;
|
||||
Tu = ri[WS(is, 12)];
|
||||
Tv = ri[WS(is, 2)];
|
||||
Tw = Tu + Tv;
|
||||
T1t = Tu - Tv;
|
||||
T16 = ii[WS(is, 12)];
|
||||
T17 = ii[WS(is, 2)];
|
||||
T18 = T16 - T17;
|
||||
T2x = T16 + T17;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, T1u, T1v;
|
||||
Tx = ri[WS(is, 17)];
|
||||
Ty = ri[WS(is, 7)];
|
||||
Tz = Tx + Ty;
|
||||
T19 = Tx - Ty;
|
||||
T1u = ii[WS(is, 17)];
|
||||
T1v = ii[WS(is, 7)];
|
||||
T1w = T1u - T1v;
|
||||
T2y = T1u + T1v;
|
||||
}
|
||||
Tt = Tp - Ts;
|
||||
TA = Tw - Tz;
|
||||
TB = Tt + TA;
|
||||
T2w = T2u - T2v;
|
||||
T2z = T2x - T2y;
|
||||
T2S = T2w + T2z;
|
||||
T35 = T2u + T2v;
|
||||
T36 = T2x + T2y;
|
||||
T3f = T35 + T36;
|
||||
TH = Tp + Ts;
|
||||
TI = Tw + Tz;
|
||||
TJ = TH + TI;
|
||||
T15 = T13 - T14;
|
||||
T1a = T18 - T19;
|
||||
T1b = T15 + T1a;
|
||||
T1s = T1o - T1r;
|
||||
T1x = T1t - T1w;
|
||||
T1W = T1s + T1x;
|
||||
T29 = T1o + T1r;
|
||||
T2a = T1t + T1w;
|
||||
T2j = T29 + T2a;
|
||||
T1h = T14 + T13;
|
||||
T1i = T19 + T18;
|
||||
T1j = T1h + T1i;
|
||||
}
|
||||
{
|
||||
E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
|
||||
E T2F;
|
||||
{
|
||||
E T8, T9, TQ, TR;
|
||||
T8 = ri[WS(is, 4)];
|
||||
T9 = ri[WS(is, 14)];
|
||||
Ta = T8 + T9;
|
||||
T1z = T8 - T9;
|
||||
TQ = ii[WS(is, 4)];
|
||||
TR = ii[WS(is, 14)];
|
||||
TS = TQ - TR;
|
||||
T2B = TQ + TR;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, T1A, T1B;
|
||||
Tb = ri[WS(is, 9)];
|
||||
Tc = ri[WS(is, 19)];
|
||||
Td = Tb + Tc;
|
||||
TT = Tb - Tc;
|
||||
T1A = ii[WS(is, 9)];
|
||||
T1B = ii[WS(is, 19)];
|
||||
T1C = T1A - T1B;
|
||||
T2C = T1A + T1B;
|
||||
}
|
||||
{
|
||||
E Tf, Tg, TV, TW;
|
||||
Tf = ri[WS(is, 16)];
|
||||
Tg = ri[WS(is, 6)];
|
||||
Th = Tf + Tg;
|
||||
T1E = Tf - Tg;
|
||||
TV = ii[WS(is, 16)];
|
||||
TW = ii[WS(is, 6)];
|
||||
TX = TV - TW;
|
||||
T2E = TV + TW;
|
||||
}
|
||||
{
|
||||
E Ti, Tj, T1F, T1G;
|
||||
Ti = ri[WS(is, 1)];
|
||||
Tj = ri[WS(is, 11)];
|
||||
Tk = Ti + Tj;
|
||||
TY = Ti - Tj;
|
||||
T1F = ii[WS(is, 1)];
|
||||
T1G = ii[WS(is, 11)];
|
||||
T1H = T1F - T1G;
|
||||
T2F = T1F + T1G;
|
||||
}
|
||||
Te = Ta - Td;
|
||||
Tl = Th - Tk;
|
||||
Tm = Te + Tl;
|
||||
T2D = T2B - T2C;
|
||||
T2G = T2E - T2F;
|
||||
T2R = T2D + T2G;
|
||||
T32 = T2B + T2C;
|
||||
T33 = T2E + T2F;
|
||||
T3e = T32 + T33;
|
||||
TE = Ta + Td;
|
||||
TF = Th + Tk;
|
||||
TG = TE + TF;
|
||||
TU = TS - TT;
|
||||
TZ = TX - TY;
|
||||
T10 = TU + TZ;
|
||||
T1D = T1z - T1C;
|
||||
T1I = T1E - T1H;
|
||||
T1V = T1D + T1I;
|
||||
T26 = T1z + T1C;
|
||||
T27 = T1E + T1H;
|
||||
T2i = T26 + T27;
|
||||
T1e = TT + TS;
|
||||
T1f = TY + TX;
|
||||
T1g = T1e + T1f;
|
||||
}
|
||||
{
|
||||
E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
|
||||
T2s = KP559016994 * (Tm - TB);
|
||||
TC = Tm + TB;
|
||||
T2r = FNMS(KP250000000, TC, T7);
|
||||
T2A = T2w - T2z;
|
||||
T2H = T2D - T2G;
|
||||
T2I = FNMS(KP587785252, T2H, KP951056516 * T2A);
|
||||
T2K = FMA(KP951056516, T2H, KP587785252 * T2A);
|
||||
ro[WS(os, 10)] = T7 + TC;
|
||||
T2J = T2s + T2r;
|
||||
ro[WS(os, 14)] = T2J - T2K;
|
||||
ro[WS(os, 6)] = T2J + T2K;
|
||||
T2t = T2r - T2s;
|
||||
ro[WS(os, 2)] = T2t - T2I;
|
||||
ro[WS(os, 18)] = T2t + T2I;
|
||||
}
|
||||
{
|
||||
E T2V, T2T, T2U, T2N, T2Y, T2L, T2M, T2X, T2W;
|
||||
T2V = KP559016994 * (T2R - T2S);
|
||||
T2T = T2R + T2S;
|
||||
T2U = FNMS(KP250000000, T2T, T2Q);
|
||||
T2L = Tt - TA;
|
||||
T2M = Te - Tl;
|
||||
T2N = FNMS(KP587785252, T2M, KP951056516 * T2L);
|
||||
T2Y = FMA(KP951056516, T2M, KP587785252 * T2L);
|
||||
io[WS(os, 10)] = T2Q + T2T;
|
||||
T2X = T2V + T2U;
|
||||
io[WS(os, 6)] = T2X - T2Y;
|
||||
io[WS(os, 14)] = T2Y + T2X;
|
||||
T2W = T2U - T2V;
|
||||
io[WS(os, 2)] = T2N + T2W;
|
||||
io[WS(os, 18)] = T2W - T2N;
|
||||
}
|
||||
{
|
||||
E T2Z, TK, T30, T38, T3a, T34, T37, T39, T31;
|
||||
T2Z = KP559016994 * (TG - TJ);
|
||||
TK = TG + TJ;
|
||||
T30 = FNMS(KP250000000, TK, TD);
|
||||
T34 = T32 - T33;
|
||||
T37 = T35 - T36;
|
||||
T38 = FMA(KP951056516, T34, KP587785252 * T37);
|
||||
T3a = FNMS(KP587785252, T34, KP951056516 * T37);
|
||||
ro[0] = TD + TK;
|
||||
T39 = T30 - T2Z;
|
||||
ro[WS(os, 12)] = T39 - T3a;
|
||||
ro[WS(os, 8)] = T39 + T3a;
|
||||
T31 = T2Z + T30;
|
||||
ro[WS(os, 4)] = T31 - T38;
|
||||
ro[WS(os, 16)] = T31 + T38;
|
||||
}
|
||||
{
|
||||
E T3g, T3i, T3j, T3d, T3m, T3b, T3c, T3l, T3k;
|
||||
T3g = KP559016994 * (T3e - T3f);
|
||||
T3i = T3e + T3f;
|
||||
T3j = FNMS(KP250000000, T3i, T3h);
|
||||
T3b = TE - TF;
|
||||
T3c = TH - TI;
|
||||
T3d = FMA(KP951056516, T3b, KP587785252 * T3c);
|
||||
T3m = FNMS(KP587785252, T3b, KP951056516 * T3c);
|
||||
io[0] = T3h + T3i;
|
||||
T3l = T3j - T3g;
|
||||
io[WS(os, 8)] = T3l - T3m;
|
||||
io[WS(os, 12)] = T3m + T3l;
|
||||
T3k = T3g + T3j;
|
||||
io[WS(os, 4)] = T3d + T3k;
|
||||
io[WS(os, 16)] = T3k - T3d;
|
||||
}
|
||||
{
|
||||
E T23, T1c, T24, T2c, T2e, T28, T2b, T2d, T25;
|
||||
T23 = KP559016994 * (T10 - T1b);
|
||||
T1c = T10 + T1b;
|
||||
T24 = FNMS(KP250000000, T1c, TP);
|
||||
T28 = T26 - T27;
|
||||
T2b = T29 - T2a;
|
||||
T2c = FMA(KP951056516, T28, KP587785252 * T2b);
|
||||
T2e = FNMS(KP587785252, T28, KP951056516 * T2b);
|
||||
io[WS(os, 5)] = TP + T1c;
|
||||
T2d = T24 - T23;
|
||||
io[WS(os, 13)] = T2d - T2e;
|
||||
io[WS(os, 17)] = T2d + T2e;
|
||||
T25 = T23 + T24;
|
||||
io[WS(os, 1)] = T25 - T2c;
|
||||
io[WS(os, 9)] = T25 + T2c;
|
||||
}
|
||||
{
|
||||
E T2k, T2m, T2n, T2h, T2p, T2f, T2g, T2q, T2o;
|
||||
T2k = KP559016994 * (T2i - T2j);
|
||||
T2m = T2i + T2j;
|
||||
T2n = FNMS(KP250000000, T2m, T2l);
|
||||
T2f = TU - TZ;
|
||||
T2g = T15 - T1a;
|
||||
T2h = FMA(KP951056516, T2f, KP587785252 * T2g);
|
||||
T2p = FNMS(KP587785252, T2f, KP951056516 * T2g);
|
||||
ro[WS(os, 5)] = T2l + T2m;
|
||||
T2q = T2n - T2k;
|
||||
ro[WS(os, 13)] = T2p + T2q;
|
||||
ro[WS(os, 17)] = T2q - T2p;
|
||||
T2o = T2k + T2n;
|
||||
ro[WS(os, 1)] = T2h + T2o;
|
||||
ro[WS(os, 9)] = T2o - T2h;
|
||||
}
|
||||
{
|
||||
E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
|
||||
T1m = KP559016994 * (T1g - T1j);
|
||||
T1k = T1g + T1j;
|
||||
T1l = FNMS(KP250000000, T1k, T1d);
|
||||
T1y = T1s - T1x;
|
||||
T1J = T1D - T1I;
|
||||
T1K = FNMS(KP587785252, T1J, KP951056516 * T1y);
|
||||
T1M = FMA(KP951056516, T1J, KP587785252 * T1y);
|
||||
io[WS(os, 15)] = T1d + T1k;
|
||||
T1L = T1m + T1l;
|
||||
io[WS(os, 11)] = T1L - T1M;
|
||||
io[WS(os, 19)] = T1L + T1M;
|
||||
T1n = T1l - T1m;
|
||||
io[WS(os, 3)] = T1n - T1K;
|
||||
io[WS(os, 7)] = T1n + T1K;
|
||||
}
|
||||
{
|
||||
E T1Z, T1X, T1Y, T1P, T21, T1N, T1O, T22, T20;
|
||||
T1Z = KP559016994 * (T1V - T1W);
|
||||
T1X = T1V + T1W;
|
||||
T1Y = FNMS(KP250000000, T1X, T1U);
|
||||
T1N = T1h - T1i;
|
||||
T1O = T1e - T1f;
|
||||
T1P = FNMS(KP587785252, T1O, KP951056516 * T1N);
|
||||
T21 = FMA(KP951056516, T1O, KP587785252 * T1N);
|
||||
ro[WS(os, 15)] = T1U + T1X;
|
||||
T22 = T1Z + T1Y;
|
||||
ro[WS(os, 11)] = T21 + T22;
|
||||
ro[WS(os, 19)] = T22 - T21;
|
||||
T20 = T1Y - T1Z;
|
||||
ro[WS(os, 3)] = T1P + T20;
|
||||
ro[WS(os, 7)] = T20 - T1P;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 20, "n1_20", { 184, 24, 24, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_20) (planner *p) { X(kdft_register) (p, n1_20, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 6 FP multiplications,
|
||||
* (or, 6 additions, 0 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_3, FMA-preferring build: size-3 kernel run over `v` iterations.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Every input of an iteration is copied into a local before the first
 * store of that iteration is issued.
 * NOTE(review): reading the arithmetic as a 3-point DFT assumes
 * FMA(a,b,c) == a*b + c and FNMS(a,b,c) == c - a*b; both macros are
 * defined outside this file -- confirm.
 */
static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
               E re0, im0, re1, re2, im1, im2;
               E re_sum, re_diff, im_diff, im_sum, re_base, im_base;

               /* Gather the three complex inputs. */
               re0 = ri[0];
               im0 = ii[0];
               re1 = ri[WS(is, 1)];
               re2 = ri[WS(is, 2)];
               re_sum = re1 + re2;
               re_diff = re2 - re1;
               im1 = ii[WS(is, 1)];
               im2 = ii[WS(is, 2)];
               im_diff = im1 - im2;
               im_sum = im1 + im2;

               /* Output 0 is the plain sum of the inputs. */
               ro[0] = re0 + re_sum;
               io[0] = im0 + im_sum;

               /* Outputs 1 and 2 share a base term and differ in one sign. */
               re_base = FNMS(KP500000000, re_sum, re0);
               ro[WS(os, 2)] = FNMS(KP866025403, im_diff, re_base);
               ro[WS(os, 1)] = FMA(KP866025403, im_diff, re_base);

               im_base = FNMS(KP500000000, im_sum, im0);
               io[WS(os, 1)] = FMA(KP866025403, re_diff, im_base);
               io[WS(os, 2)] = FNMS(KP866025403, re_diff, im_base);
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 3, "n1_3", { 6, 0, 6, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_3) (planner *p) { X(kdft_register) (p, n1_3, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 4 FP multiplications,
|
||||
* (or, 10 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_3, non-FMA build (the #else branch of the ARCH_PREFERS_FMA /
 * ISA_EXTENSION_PREFERS_FMA test): size-3 kernel run over `v` iterations.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Unlike the FMA build, the two difference terms are multiplied by
 * KP866025403 as soon as they are formed, so the outputs need only
 * additions and subtractions afterwards.
 * NOTE(review): FNMS(a,b,c) is presumed to be c - a*b; it is defined
 * outside this file -- confirm.
 */
static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
               E re0, im0, re1, re2, im1, im2;
               E re_sum, re_diff_s, im_diff_s, im_sum, re_base, im_base;

               /* Gather the inputs; the differences are pre-scaled. */
               re0 = ri[0];
               im0 = ii[0];
               re1 = ri[WS(is, 1)];
               re2 = ri[WS(is, 2)];
               re_sum = re1 + re2;
               re_diff_s = KP866025403 * (re2 - re1);
               im1 = ii[WS(is, 1)];
               im2 = ii[WS(is, 2)];
               im_diff_s = KP866025403 * (im1 - im2);
               im_sum = im1 + im2;

               /* Output 0 is the plain sum of the inputs. */
               ro[0] = re0 + re_sum;
               io[0] = im0 + im_sum;

               /* Outputs 1 and 2 share a base term and differ in one sign. */
               re_base = FNMS(KP500000000, re_sum, re0);
               ro[WS(os, 2)] = re_base - im_diff_s;
               ro[WS(os, 1)] = re_base + im_diff_s;

               im_base = FNMS(KP500000000, im_sum, im0);
               io[WS(os, 1)] = re_diff_s + im_base;
               io[WS(os, 2)] = im_base - re_diff_s;
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 3, "n1_3", { 10, 2, 2, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_3) (planner *p) { X(kdft_register) (p, n1_3, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 0 FP multiplications,
|
||||
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_4, FMA-preferring build: size-4 kernel run over `v` iterations.
 * The kernel uses additions and subtractions only; no constants are needed.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Inputs 0/2 and 1/3 are combined pairwise into sums and differences, and
 * each output is one further sum or difference of those. All eight loads
 * of an iteration precede its first store.
 */
static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
               E re0, re2, im0, im2, re1, re3, im1, im3;
               E re_even_sum, re_even_diff, im_even_diff, im_even_sum;
               E re_odd_sum, re_odd_diff, im_odd_diff, im_odd_sum;

               /* Pair up inputs 0 and 2. */
               re0 = ri[0];
               re2 = ri[WS(is, 2)];
               re_even_sum = re0 + re2;
               re_even_diff = re0 - re2;
               im0 = ii[0];
               im2 = ii[WS(is, 2)];
               im_even_diff = im0 - im2;
               im_even_sum = im0 + im2;

               /* Pair up inputs 1 and 3. */
               re1 = ri[WS(is, 1)];
               re3 = ri[WS(is, 3)];
               re_odd_sum = re1 + re3;
               re_odd_diff = re1 - re3;
               im1 = ii[WS(is, 1)];
               im3 = ii[WS(is, 3)];
               im_odd_diff = im1 - im3;
               im_odd_sum = im1 + im3;

               /* Outputs 0 and 2 combine the two pair sums. */
               ro[WS(os, 2)] = re_even_sum - re_odd_sum;
               io[WS(os, 2)] = im_even_sum - im_odd_sum;
               ro[0] = re_even_sum + re_odd_sum;
               io[0] = im_even_sum + im_odd_sum;

               /* Outputs 1 and 3 cross real and imaginary differences. */
               io[WS(os, 1)] = im_even_diff - re_odd_diff;
               ro[WS(os, 1)] = re_even_diff + im_odd_diff;
               io[WS(os, 3)] = re_odd_diff + im_even_diff;
               ro[WS(os, 3)] = re_even_diff - im_odd_diff;
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 4, "n1_4", { 16, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_4) (planner *p) { X(kdft_register) (p, n1_4, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 0 FP multiplications,
|
||||
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_4, non-FMA build (the #else branch of the ARCH_PREFERS_FMA /
 * ISA_EXTENSION_PREFERS_FMA test): size-4 kernel run over `v` iterations.
 * The kernel uses additions and subtractions only; no constants are needed.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Inputs 0/2 and 1/3 are combined pairwise into sums and differences, and
 * each output is one further sum or difference of those. All eight loads
 * of an iteration precede its first store.
 */
static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
               E re0, re2, im0, im2, re1, re3, im1, im3;
               E re_even_sum, re_even_diff, im_even_diff, im_even_sum;
               E re_odd_sum, re_odd_diff, im_odd_diff, im_odd_sum;

               /* Pair up inputs 0 and 2. */
               re0 = ri[0];
               re2 = ri[WS(is, 2)];
               re_even_sum = re0 + re2;
               re_even_diff = re0 - re2;
               im0 = ii[0];
               im2 = ii[WS(is, 2)];
               im_even_diff = im0 - im2;
               im_even_sum = im0 + im2;

               /* Pair up inputs 1 and 3. */
               re1 = ri[WS(is, 1)];
               re3 = ri[WS(is, 3)];
               re_odd_sum = re1 + re3;
               re_odd_diff = re1 - re3;
               im1 = ii[WS(is, 1)];
               im3 = ii[WS(is, 3)];
               im_odd_diff = im1 - im3;
               im_odd_sum = im1 + im3;

               /* Outputs 0 and 2 combine the two pair sums. */
               ro[WS(os, 2)] = re_even_sum - re_odd_sum;
               io[WS(os, 2)] = im_even_sum - im_odd_sum;
               ro[0] = re_even_sum + re_odd_sum;
               io[0] = im_even_sum + im_odd_sum;

               /* Outputs 1 and 3 cross real and imaginary differences. */
               io[WS(os, 1)] = im_even_diff - re_odd_diff;
               ro[WS(os, 1)] = re_even_diff + im_odd_diff;
               io[WS(os, 3)] = re_odd_diff + im_even_diff;
               ro[WS(os, 3)] = re_even_diff - im_odd_diff;
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 4, "n1_4", { 16, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_4) (planner *p) { X(kdft_register) (p, n1_4, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,194 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 32 FP additions, 18 FP multiplications,
|
||||
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
|
||||
* 21 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_5, FMA-preferring build: size-5 kernel run over `v` iterations.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Inputs are paired as (1,4) and (2,3); the pair sums and differences are
 * then combined through chains of FMA/FNMS calls. All ten loads of an
 * iteration precede its first store.
 * NOTE(review): FMA(a,b,c) is presumed to be a*b + c and FNMS(a,b,c) to be
 * c - a*b; both are defined outside this file -- confirm.
 */
static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
               E re0, im0;
               E re1, re4, re2, re3, re14, re23;
               E im1, im4, im2, im3, im14, im23;
               E re_all, re_d23, re_pair_d, re_d14;
               E im_d14, im_pair_d, im_d23, im_all;

               re0 = ri[0];
               im0 = ii[0];

               /* Real parts: pair sums, their sum/difference, pair differences. */
               re1 = ri[WS(is, 1)];
               re4 = ri[WS(is, 4)];
               re14 = re1 + re4;
               re2 = ri[WS(is, 2)];
               re3 = ri[WS(is, 3)];
               re23 = re2 + re3;
               re_all = re14 + re23;
               re_d23 = re2 - re3;
               re_pair_d = re14 - re23;
               re_d14 = re1 - re4;

               /* Imaginary parts, same pairing. */
               im1 = ii[WS(is, 1)];
               im4 = ii[WS(is, 4)];
               im14 = im1 + im4;
               im2 = ii[WS(is, 2)];
               im3 = ii[WS(is, 3)];
               im23 = im2 + im3;
               im_d14 = im1 - im4;
               im_pair_d = im14 - im23;
               im_d23 = im2 - im3;
               im_all = im14 + im23;

               /* Output 0 is the plain sum of the inputs. */
               ro[0] = re0 + re_all;
               io[0] = im0 + im_all;

               /* Real outputs 1..4. */
               {
                    E rot_a, rot_b, mid, base_p, base_m;

                    rot_a = FMA(KP618033988, im_d23, im_d14);
                    rot_b = FNMS(KP618033988, im_d14, im_d23);
                    mid = FNMS(KP250000000, re_all, re0);
                    base_p = FMA(KP559016994, re_pair_d, mid);
                    base_m = FNMS(KP559016994, re_pair_d, mid);
                    ro[WS(os, 4)] = FNMS(KP951056516, rot_a, base_p);
                    ro[WS(os, 3)] = FMA(KP951056516, rot_b, base_m);
                    ro[WS(os, 1)] = FMA(KP951056516, rot_a, base_p);
                    ro[WS(os, 2)] = FNMS(KP951056516, rot_b, base_m);
               }

               /* Imaginary outputs 1..4. */
               {
                    E rot_a, rot_b, mid, base_p, base_m;

                    rot_a = FMA(KP618033988, re_d23, re_d14);
                    rot_b = FNMS(KP618033988, re_d14, re_d23);
                    mid = FNMS(KP250000000, im_all, im0);
                    base_p = FMA(KP559016994, im_pair_d, mid);
                    base_m = FNMS(KP559016994, im_pair_d, mid);
                    io[WS(os, 1)] = FNMS(KP951056516, rot_a, base_p);
                    io[WS(os, 3)] = FNMS(KP951056516, rot_b, base_m);
                    io[WS(os, 4)] = FMA(KP951056516, rot_a, base_p);
                    io[WS(os, 2)] = FMA(KP951056516, rot_b, base_m);
               }
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 5, "n1_5", { 14, 0, 18, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_5) (planner *p) { X(kdft_register) (p, n1_5, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 32 FP additions, 12 FP multiplications,
|
||||
* (or, 26 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 21 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_5, non-FMA build (the #else branch of the ARCH_PREFERS_FMA /
 * ISA_EXTENSION_PREFERS_FMA test): size-5 kernel run over `v` iterations.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Inputs are paired as (1,4) and (2,3). The difference of the two pair
 * sums is multiplied by KP559016994 as soon as it is formed. All ten loads
 * of an iteration precede its first store.
 * NOTE(review): FMA(a,b,c) is presumed to be a*b + c and FNMS(a,b,c) to be
 * c - a*b; both are defined outside this file -- confirm.
 */
static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
               E re0, im0;
               E re1, re4, re2, re3, re14, re23;
               E im1, im4, im2, im3, im14, im23;
               E re_all, re_d23, re_pair_s, re_d14;
               E im_d14, im_all, im_d23, im_pair_s;

               re0 = ri[0];
               im0 = ii[0];

               /* Real parts: pair sums, total, pre-scaled pair difference. */
               re1 = ri[WS(is, 1)];
               re4 = ri[WS(is, 4)];
               re14 = re1 + re4;
               re2 = ri[WS(is, 2)];
               re3 = ri[WS(is, 3)];
               re23 = re2 + re3;
               re_all = re14 + re23;
               re_d23 = re2 - re3;
               re_pair_s = KP559016994 * (re14 - re23);
               re_d14 = re1 - re4;

               /* Imaginary parts, same pairing. */
               im1 = ii[WS(is, 1)];
               im4 = ii[WS(is, 4)];
               im14 = im1 + im4;
               im2 = ii[WS(is, 2)];
               im3 = ii[WS(is, 3)];
               im23 = im2 + im3;
               im_d14 = im1 - im4;
               im_all = im14 + im23;
               im_d23 = im2 - im3;
               im_pair_s = KP559016994 * (im14 - im23);

               /* Output 0 is the plain sum of the inputs. */
               ro[0] = re0 + re_all;
               io[0] = im0 + im_all;

               /* Real outputs 1..4. */
               {
                    E rot_a, rot_b, mid, base_p, base_m;

                    rot_a = FMA(KP951056516, im_d14, KP587785252 * im_d23);
                    rot_b = FNMS(KP587785252, im_d14, KP951056516 * im_d23);
                    mid = FNMS(KP250000000, re_all, re0);
                    base_p = re_pair_s + mid;
                    base_m = mid - re_pair_s;
                    ro[WS(os, 4)] = base_p - rot_a;
                    ro[WS(os, 3)] = base_m + rot_b;
                    ro[WS(os, 1)] = base_p + rot_a;
                    ro[WS(os, 2)] = base_m - rot_b;
               }

               /* Imaginary outputs 1..4. */
               {
                    E rot_a, rot_b, mid, base_p, base_m;

                    rot_a = FMA(KP951056516, re_d14, KP587785252 * re_d23);
                    rot_b = FNMS(KP587785252, re_d14, KP951056516 * re_d23);
                    mid = FNMS(KP250000000, im_all, im0);
                    base_p = im_pair_s + mid;
                    base_m = mid - im_pair_s;
                    io[WS(os, 1)] = base_p - rot_a;
                    io[WS(os, 3)] = base_m - rot_b;
                    io[WS(os, 4)] = rot_a + base_p;
                    io[WS(os, 2)] = rot_b + base_m;
               }
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 5, "n1_5", { 26, 6, 6, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_5) (planner *p) { X(kdft_register) (p, n1_5, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,210 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 36 FP additions, 12 FP multiplications,
|
||||
* (or, 24 additions, 0 multiplications, 12 fused multiply/add),
|
||||
* 23 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_6, FMA-preferring build: size-6 kernel run over `v` iterations.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Inputs are paired as (0,3), (2,5) and (4,1). Pair differences feed the
 * odd-indexed outputs (1, 3, 5) and pair sums feed the even-indexed ones
 * (0, 2, 4). All twelve loads of an iteration precede its first store.
 * NOTE(review): FMA(a,b,c) is presumed to be a*b + c and FNMS(a,b,c) to be
 * c - a*b; both are defined outside this file -- confirm.
 */
static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               E re0, re3, im0, im3, re2, re5, re4, re1, im2, im5, im4, im1;
               E re03d, re03s, im03d, im03s;
               E re25d, re25s, re41d, re41s, re_dd, re_ss;
               E im25d, im25s, im41d, im41s, im_dd, im_ss;

               /* Pair (0,3). */
               re0 = ri[0];
               re3 = ri[WS(is, 3)];
               re03d = re0 - re3;
               re03s = re0 + re3;
               im0 = ii[0];
               im3 = ii[WS(is, 3)];
               im03d = im0 - im3;
               im03s = im0 + im3;

               /* Real parts of pairs (2,5) and (4,1). */
               re2 = ri[WS(is, 2)];
               re5 = ri[WS(is, 5)];
               re25d = re2 - re5;
               re25s = re2 + re5;
               re4 = ri[WS(is, 4)];
               re1 = ri[WS(is, 1)];
               re41d = re4 - re1;
               re41s = re4 + re1;
               re_dd = re25d + re41d;
               re_ss = re25s + re41s;

               /* Imaginary parts of pairs (2,5) and (4,1). */
               im2 = ii[WS(is, 2)];
               im5 = ii[WS(is, 5)];
               im25d = im2 - im5;
               im25s = im2 + im5;
               im4 = ii[WS(is, 4)];
               im1 = ii[WS(is, 1)];
               im41d = im4 - im1;
               im41s = im4 + im1;
               im_dd = im25d + im41d;
               im_ss = im25s + im41s;

               /* Outputs 3 and 0 need no multiplications. */
               ro[WS(os, 3)] = re03d + re_dd;
               io[WS(os, 3)] = im03d + im_dd;
               ro[0] = re03s + re_ss;
               io[0] = im03s + im_ss;

               /* Outputs 1 and 5, built from the pair differences. */
               {
                    E re_base, im_skew, im_base, re_skew;

                    re_base = FNMS(KP500000000, re_dd, re03d);
                    im_skew = im25d - im41d;
                    ro[WS(os, 5)] = FNMS(KP866025403, im_skew, re_base);
                    ro[WS(os, 1)] = FMA(KP866025403, im_skew, re_base);
                    im_base = FNMS(KP500000000, im_dd, im03d);
                    re_skew = re41d - re25d;
                    io[WS(os, 1)] = FMA(KP866025403, re_skew, im_base);
                    io[WS(os, 5)] = FNMS(KP866025403, re_skew, im_base);
               }

               /* Outputs 2 and 4, built from the pair sums. */
               {
                    E re_base, im_skew, im_base, re_skew;

                    re_base = FNMS(KP500000000, re_ss, re03s);
                    im_skew = im25s - im41s;
                    ro[WS(os, 2)] = FNMS(KP866025403, im_skew, re_base);
                    ro[WS(os, 4)] = FMA(KP866025403, im_skew, re_base);
                    im_base = FNMS(KP500000000, im_ss, im03s);
                    re_skew = re41s - re25s;
                    io[WS(os, 2)] = FNMS(KP866025403, re_skew, im_base);
                    io[WS(os, 4)] = FMA(KP866025403, re_skew, im_base);
               }
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 6, "n1_6", { 24, 0, 12, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_6) (planner *p) { X(kdft_register) (p, n1_6, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 36 FP additions, 8 FP multiplications,
|
||||
* (or, 32 additions, 4 multiplications, 4 fused multiply/add),
|
||||
* 23 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_6, non-FMA build (the #else branch of the ARCH_PREFERS_FMA /
 * ISA_EXTENSION_PREFERS_FMA test): size-6 kernel run over `v` iterations.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Inputs are paired as (0,3), (2,5) and (4,1). Pair differences feed the
 * odd-indexed outputs (1, 3, 5) and pair sums feed the even-indexed ones
 * (0, 2, 4). The skew terms are multiplied by KP866025403 when formed.
 * All twelve loads of an iteration precede its first store.
 * NOTE(review): FNMS(a,b,c) is presumed to be c - a*b; it is defined
 * outside this file -- confirm.
 */
static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               E re0, re3, im0, im3, re2, re5, re4, re1, im2, im5, im4, im1;
               E re03d, re03s, im03d, im03s;
               E re25d, re25s, re41d, re41s, re_dd, re_ss;
               E im25d, im25s, im41d, im41s, im_dd, im_ss;

               /* Pair (0,3). */
               re0 = ri[0];
               re3 = ri[WS(is, 3)];
               re03d = re0 - re3;
               re03s = re0 + re3;
               im0 = ii[0];
               im3 = ii[WS(is, 3)];
               im03d = im0 - im3;
               im03s = im0 + im3;

               /* Real parts of pairs (2,5) and (4,1). */
               re2 = ri[WS(is, 2)];
               re5 = ri[WS(is, 5)];
               re25d = re2 - re5;
               re25s = re2 + re5;
               re4 = ri[WS(is, 4)];
               re1 = ri[WS(is, 1)];
               re41d = re4 - re1;
               re41s = re4 + re1;
               re_dd = re25d + re41d;
               re_ss = re25s + re41s;

               /* Imaginary parts of pairs (2,5) and (4,1). */
               im2 = ii[WS(is, 2)];
               im5 = ii[WS(is, 5)];
               im25d = im2 - im5;
               im25s = im2 + im5;
               im4 = ii[WS(is, 4)];
               im1 = ii[WS(is, 1)];
               im41d = im4 - im1;
               im41s = im4 + im1;
               im_dd = im25d + im41d;
               im_ss = im25s + im41s;

               /* Outputs 3 and 0 need no multiplications. */
               ro[WS(os, 3)] = re03d + re_dd;
               io[WS(os, 3)] = im03d + im_dd;
               ro[0] = re03s + re_ss;
               io[0] = im03s + im_ss;

               /* Outputs 1 and 5, built from the pair differences. */
               {
                    E re_base, im_skew, re_skew, im_base;

                    re_base = FNMS(KP500000000, re_dd, re03d);
                    im_skew = KP866025403 * (im25d - im41d);
                    ro[WS(os, 5)] = re_base - im_skew;
                    ro[WS(os, 1)] = re_base + im_skew;
                    re_skew = KP866025403 * (re41d - re25d);
                    im_base = FNMS(KP500000000, im_dd, im03d);
                    io[WS(os, 1)] = re_skew + im_base;
                    io[WS(os, 5)] = im_base - re_skew;
               }

               /* Outputs 2 and 4, built from the pair sums. */
               {
                    E re_base, im_skew, im_base, re_skew;

                    re_base = FNMS(KP500000000, re_ss, re03s);
                    im_skew = KP866025403 * (im25s - im41s);
                    ro[WS(os, 2)] = re_base - im_skew;
                    ro[WS(os, 4)] = re_base + im_skew;
                    im_base = FNMS(KP500000000, im_ss, im03s);
                    re_skew = KP866025403 * (re41s - re25s);
                    io[WS(os, 2)] = im_base - re_skew;
                    io[WS(os, 4)] = re_skew + im_base;
               }
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 6, "n1_6", { 32, 4, 4, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_6) (planner *p) { X(kdft_register) (p, n1_6, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,249 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 60 FP additions, 42 FP multiplications,
|
||||
* (or, 18 additions, 0 multiplications, 42 fused multiply/add),
|
||||
* 41 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_7, FMA-preferring build: size-7 kernel run over `v` iterations.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Inputs are paired as (1,6), (2,5) and (3,4). Local naming:
 *   reXYs / imXYs  sum of inputs X and Y
 *   reXYd / imXYd  difference of inputs X and Y (sign as written below)
 *   sre_* / sim_*  first-level combinations of the pair sums
 *   dre_* / dim_*  first-level combinations of the pair differences
 * Outputs are produced in pairs (1,6), (2,5), (3,4); within a pair the two
 * results differ only in the FMA/FNMS applied last. All fourteen loads of
 * an iteration precede its first store.
 * NOTE(review): FMA(a,b,c) is presumed to be a*b + c and FNMS(a,b,c) to be
 * c - a*b; both are defined outside this file -- confirm.
 */
static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
               E re0, im0;
               E re1, re6, re3, re4, re2, re5;
               E im2, im5, im3, im4, im1, im6;
               E re16s, re16d, re34s, re34d, re25s, re25d;
               E im25d, im25s, im34d, im34s, im16d, im16s;
               E sre_a, sre_b, sre_c, dre_a, dre_b, dre_c;
               E dim_a, dim_b, dim_c, sim_a, sim_b, sim_c;

               re0 = ri[0];
               im0 = ii[0];

               /* Real parts: pair sums and differences. */
               re1 = ri[WS(is, 1)];
               re6 = ri[WS(is, 6)];
               re16s = re1 + re6;
               re16d = re6 - re1;
               re3 = ri[WS(is, 3)];
               re4 = ri[WS(is, 4)];
               re34s = re3 + re4;
               re34d = re4 - re3;
               re2 = ri[WS(is, 2)];
               re5 = ri[WS(is, 5)];
               re25s = re2 + re5;
               re25d = re5 - re2;

               /* First-level combinations of the real pairs. */
               sre_a = FNMS(KP356895867, re25s, re16s);
               sre_b = FNMS(KP356895867, re16s, re34s);
               dre_a = FMA(KP554958132, re34d, re16d);
               dre_b = FMA(KP554958132, re25d, re34d);
               dre_c = FNMS(KP554958132, re16d, re25d);
               sre_c = FNMS(KP356895867, re34s, re25s);

               /* Imaginary parts: pair sums and differences. */
               im2 = ii[WS(is, 2)];
               im5 = ii[WS(is, 5)];
               im25d = im2 - im5;
               im25s = im2 + im5;
               im3 = ii[WS(is, 3)];
               im4 = ii[WS(is, 4)];
               im34d = im3 - im4;
               im34s = im3 + im4;
               im1 = ii[WS(is, 1)];
               im6 = ii[WS(is, 6)];
               im16d = im1 - im6;
               im16s = im1 + im6;

               /* First-level combinations of the imaginary pairs. */
               dim_a = FMA(KP554958132, im34d, im16d);
               dim_b = FMA(KP554958132, im25d, im34d);
               sim_a = FNMS(KP356895867, im25s, im16s);
               sim_b = FNMS(KP356895867, im16s, im34s);
               sim_c = FNMS(KP356895867, im34s, im25s);
               dim_c = FNMS(KP554958132, im16d, im25d);

               /* Output 0 is the plain sum of the inputs. */
               ro[0] = re0 + re16s + re25s + re34s;
               io[0] = im0 + im16s + im25s + im34s;

               /* Outputs 1 and 6. */
               {
                    E re_anti, re_part, re_sym, im_anti, im_part, im_sym;

                    re_anti = FMA(KP801937735, dim_a, im25d);
                    re_part = FNMS(KP692021471, sre_a, re34s);
                    re_sym = FNMS(KP900968867, re_part, re0);
                    ro[WS(os, 6)] = FNMS(KP974927912, re_anti, re_sym);
                    ro[WS(os, 1)] = FMA(KP974927912, re_anti, re_sym);
                    im_anti = FMA(KP801937735, dre_a, re25d);
                    im_part = FNMS(KP692021471, sim_a, im34s);
                    im_sym = FNMS(KP900968867, im_part, im0);
                    io[WS(os, 1)] = FMA(KP974927912, im_anti, im_sym);
                    io[WS(os, 6)] = FNMS(KP974927912, im_anti, im_sym);
               }

               /* Outputs 2 and 5. */
               {
                    E re_anti, re_part, re_sym, im_anti, im_part, im_sym;

                    re_anti = FNMS(KP801937735, dim_b, im16d);
                    re_part = FNMS(KP692021471, sre_b, re25s);
                    re_sym = FNMS(KP900968867, re_part, re0);
                    ro[WS(os, 5)] = FNMS(KP974927912, re_anti, re_sym);
                    ro[WS(os, 2)] = FMA(KP974927912, re_anti, re_sym);
                    im_anti = FNMS(KP801937735, dre_b, re16d);
                    im_part = FNMS(KP692021471, sim_b, im25s);
                    im_sym = FNMS(KP900968867, im_part, im0);
                    io[WS(os, 2)] = FMA(KP974927912, im_anti, im_sym);
                    io[WS(os, 5)] = FNMS(KP974927912, im_anti, im_sym);
               }

               /* Outputs 3 and 4. */
               {
                    E re_anti, re_part, re_sym, im_anti, im_part, im_sym;

                    re_anti = FNMS(KP801937735, dim_c, im34d);
                    re_part = FNMS(KP692021471, sre_c, re16s);
                    re_sym = FNMS(KP900968867, re_part, re0);
                    ro[WS(os, 4)] = FNMS(KP974927912, re_anti, re_sym);
                    ro[WS(os, 3)] = FMA(KP974927912, re_anti, re_sym);
                    im_anti = FNMS(KP801937735, dre_c, re34d);
                    im_part = FNMS(KP692021471, sim_c, im16s);
                    im_sym = FNMS(KP900968867, im_part, im0);
                    io[WS(os, 3)] = FMA(KP974927912, im_anti, im_sym);
                    io[WS(os, 4)] = FNMS(KP974927912, im_anti, im_sym);
               }
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 7, "n1_7", { 18, 0, 42, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_7) (planner *p) { X(kdft_register) (p, n1_7, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 60 FP additions, 36 FP multiplications,
|
||||
* (or, 36 additions, 12 multiplications, 24 fused multiply/add),
|
||||
* 25 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_7, non-FMA build (the #else branch of the ARCH_PREFERS_FMA /
 * ISA_EXTENSION_PREFERS_FMA test): size-7 kernel run over `v` iterations.
 *
 *   ri, ii  real / imaginary inputs; element k is read at index WS(is, k)
 *   ro, io  real / imaginary outputs; element k is written at WS(os, k)
 *   v       iteration count
 *   ivs     added to ri and ii after every iteration
 *   ovs     added to ro and io after every iteration
 *
 * Inputs are paired as (1,6), (2,5) and (3,4); reXYs/imXYs hold pair sums
 * and reXYd/imXYd pair differences (sign as written below). Outputs are
 * produced in pairs (2,5), (1,6), (3,4), each as sym +/- anti. All
 * fourteen loads of an iteration precede its first store.
 * NOTE(review): FMA(a,b,c) is presumed to be a*b + c, FNMS(a,b,c) to be
 * c - a*b and FNMA(a,b,c) to be -(a*b) - c; all three are defined outside
 * this file -- confirm.
 */
static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE is a project macro evaluated once per step. */
          for (i = v; i > 0;
               i -= 1, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
               E re0, im0;
               E re1, re6, im1, im6, re2, re5, im2, im5, re3, re4, im3, im4;
               E re16s, re16d, im16d, im16s;
               E re25s, re25d, im25d, im25s;
               E re34s, re34d, im34d, im34s;

               re0 = ri[0];
               im0 = ii[0];

               /* Pair (1,6). */
               re1 = ri[WS(is, 1)];
               re6 = ri[WS(is, 6)];
               re16s = re1 + re6;
               re16d = re6 - re1;
               im1 = ii[WS(is, 1)];
               im6 = ii[WS(is, 6)];
               im16d = im1 - im6;
               im16s = im1 + im6;

               /* Pair (2,5). */
               re2 = ri[WS(is, 2)];
               re5 = ri[WS(is, 5)];
               re25s = re2 + re5;
               re25d = re5 - re2;
               im2 = ii[WS(is, 2)];
               im5 = ii[WS(is, 5)];
               im25d = im2 - im5;
               im25s = im2 + im5;

               /* Pair (3,4). */
               re3 = ri[WS(is, 3)];
               re4 = ri[WS(is, 4)];
               re34s = re3 + re4;
               re34d = re4 - re3;
               im3 = ii[WS(is, 3)];
               im4 = ii[WS(is, 4)];
               im34d = im3 - im4;
               im34s = im3 + im4;

               /* Output 0 is the plain sum of the inputs. */
               ro[0] = re0 + re16s + re25s + re34s;
               io[0] = im0 + im16s + im25s + im34s;

               /* Outputs 2 and 5. */
               {
                    E re_anti, re_sym, im_anti, im_sym;

                    re_anti = FNMS(KP781831482, im34d, KP974927912 * im16d) - (KP433883739 * im25d);
                    re_sym = FMA(KP623489801, re34s, re0) + FNMA(KP900968867, re25s, KP222520933 * re16s);
                    ro[WS(os, 5)] = re_sym - re_anti;
                    ro[WS(os, 2)] = re_sym + re_anti;
                    im_anti = FNMS(KP781831482, re34d, KP974927912 * re16d) - (KP433883739 * re25d);
                    im_sym = FMA(KP623489801, im34s, im0) + FNMA(KP900968867, im25s, KP222520933 * im16s);
                    io[WS(os, 2)] = im_anti + im_sym;
                    io[WS(os, 5)] = im_sym - im_anti;
               }

               /* Outputs 1 and 6. */
               {
                    E re_anti, re_sym, im_anti, im_sym;

                    re_anti = FMA(KP781831482, im16d, KP974927912 * im25d) + (KP433883739 * im34d);
                    re_sym = FMA(KP623489801, re16s, re0) + FNMA(KP900968867, re34s, KP222520933 * re25s);
                    ro[WS(os, 6)] = re_sym - re_anti;
                    ro[WS(os, 1)] = re_sym + re_anti;
                    im_anti = FMA(KP781831482, re16d, KP974927912 * re25d) + (KP433883739 * re34d);
                    im_sym = FMA(KP623489801, im16s, im0) + FNMA(KP900968867, im34s, KP222520933 * im25s);
                    io[WS(os, 1)] = im_anti + im_sym;
                    io[WS(os, 6)] = im_sym - im_anti;
               }

               /* Outputs 3 and 4. */
               {
                    E re_anti, re_sym, im_anti, im_sym;

                    re_anti = FMA(KP433883739, im16d, KP974927912 * im34d) - (KP781831482 * im25d);
                    re_sym = FMA(KP623489801, re25s, re0) + FNMA(KP222520933, re34s, KP900968867 * re16s);
                    ro[WS(os, 4)] = re_sym - re_anti;
                    ro[WS(os, 3)] = re_sym + re_anti;
                    im_anti = FMA(KP433883739, re16d, KP974927912 * re34d) - (KP781831482 * re25d);
                    im_sym = FMA(KP623489801, im25s, im0) + FNMA(KP222520933, im34s, KP900968867 * im16s);
                    io[WS(os, 3)] = im_anti + im_sym;
                    io[WS(os, 4)] = im_sym - im_anti;
               }
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 7, "n1_7", { 36, 12, 24, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_7) (planner *p) { X(kdft_register) (p, n1_7, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,266 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 52 FP additions, 8 FP multiplications,
|
||||
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
|
||||
* 28 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_8, FMA flavour (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined; see the "Generated by" comment above).
 *
 * Every loop iteration reads eight values from ri and eight from ii, at
 * offsets 0 and WS(is, 1..7), and writes eight values to ro and eight to io,
 * at offsets 0 and WS(os, 1..7).
 *
 *   ri, ii    input arrays (only read)
 *   ro, io    output arrays (only written)
 *   is, os    strides handed to WS() to form input / output offsets
 *   v         number of loop iterations
 *   ivs, ovs  added to the input / output pointers after each iteration
 *
 * NOTE(review): presumably ri/ii and ro/io carry the real and imaginary parts
 * of a length-8 complex DFT -- confirm against dft/codelet-dft.h.
 * NOTE(review): FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) is
 * presumably c - a*b (this matches the non-FMA variant of this kernel);
 * confirm against the macro definitions.
 */
static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* The literal is numerically sqrt(2)/2. */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* Counts i from v down to 1; MAKE_VOLATILE_STRIDE is a project macro whose effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
|
||||
E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
|
||||
E TG;
|
||||
/* Load all sixteen inputs and form sums/differences of elements whose indices differ by 4. */
{
|
||||
E T1, T2, Tj, Tk;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 4)];
|
||||
T3 = T1 + T2;
|
||||
Tn = T1 - T2;
|
||||
{
|
||||
E Tg, Th, T4, T5;
|
||||
Tg = ii[0];
|
||||
Th = ii[WS(is, 4)];
|
||||
Ti = Tg + Th;
|
||||
TC = Tg - Th;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 6)];
|
||||
T6 = T4 + T5;
|
||||
TB = T4 - T5;
|
||||
}
|
||||
Tj = ii[WS(is, 2)];
|
||||
Tk = ii[WS(is, 6)];
|
||||
Tl = Tj + Tk;
|
||||
To = Tj - Tk;
|
||||
/* Elements 7 and 3. */
{
|
||||
E Tb, Tc, Tv, Tw, Tx, Ty;
|
||||
Tb = ri[WS(is, 7)];
|
||||
Tc = ri[WS(is, 3)];
|
||||
Tv = Tb - Tc;
|
||||
Tw = ii[WS(is, 7)];
|
||||
Tx = ii[WS(is, 3)];
|
||||
Ty = Tw - Tx;
|
||||
Td = Tb + Tc;
|
||||
TN = Tw + Tx;
|
||||
Tz = Tv - Ty;
|
||||
TH = Tv + Ty;
|
||||
}
|
||||
/* Elements 1 and 5. */
{
|
||||
E T8, T9, Tq, Tr, Ts, Tt;
|
||||
T8 = ri[WS(is, 1)];
|
||||
T9 = ri[WS(is, 5)];
|
||||
Tq = T8 - T9;
|
||||
Tr = ii[WS(is, 1)];
|
||||
Ts = ii[WS(is, 5)];
|
||||
Tt = Tr - Ts;
|
||||
Ta = T8 + T9;
|
||||
TM = Tr + Ts;
|
||||
Tu = Tq + Tt;
|
||||
TG = Tt - Tq;
|
||||
}
|
||||
}
|
||||
/* Outputs 0 and 4: built from the sums only. */
{
|
||||
E T7, Te, TP, TQ;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
ro[WS(os, 4)] = T7 - Te;
|
||||
ro[0] = T7 + Te;
|
||||
TP = Ti + Tl;
|
||||
TQ = TM + TN;
|
||||
io[WS(os, 4)] = TP - TQ;
|
||||
io[0] = TP + TQ;
|
||||
}
|
||||
/* Outputs 2 and 6. */
{
|
||||
E Tf, Tm, TL, TO;
|
||||
Tf = Td - Ta;
|
||||
Tm = Ti - Tl;
|
||||
io[WS(os, 2)] = Tf + Tm;
|
||||
io[WS(os, 6)] = Tm - Tf;
|
||||
TL = T3 - T6;
|
||||
TO = TM - TN;
|
||||
ro[WS(os, 6)] = TL - TO;
|
||||
ro[WS(os, 2)] = TL + TO;
|
||||
}
|
||||
/* Outputs 1 and 5: the only place besides the next block where KP707106781 is used. */
{
|
||||
E Tp, TA, TJ, TK;
|
||||
Tp = Tn + To;
|
||||
TA = Tu + Tz;
|
||||
ro[WS(os, 5)] = FNMS(KP707106781, TA, Tp);
|
||||
ro[WS(os, 1)] = FMA(KP707106781, TA, Tp);
|
||||
TJ = TC - TB;
|
||||
TK = TG + TH;
|
||||
io[WS(os, 5)] = FNMS(KP707106781, TK, TJ);
|
||||
io[WS(os, 1)] = FMA(KP707106781, TK, TJ);
|
||||
}
|
||||
/* Outputs 3 and 7. */
{
|
||||
E TD, TE, TF, TI;
|
||||
TD = TB + TC;
|
||||
TE = Tz - Tu;
|
||||
io[WS(os, 7)] = FNMS(KP707106781, TE, TD);
|
||||
io[WS(os, 3)] = FMA(KP707106781, TE, TD);
|
||||
TF = Tn - To;
|
||||
TI = TG - TH;
|
||||
ro[WS(os, 7)] = FNMS(KP707106781, TI, TF);
|
||||
ro[WS(os, 3)] = FMA(KP707106781, TI, TF);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 8, "n1_8", { 44, 0, 8, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_8) (planner *p) { X(kdft_register) (p, n1_8, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 52 FP additions, 4 FP multiplications,
|
||||
* (or, 52 additions, 4 multiplications, 0 fused multiply/add),
|
||||
* 28 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_8, non-FMA flavour (the #else branch of the ARCH_PREFERS_FMA /
 * ISA_EXTENSION_PREFERS_FMA test; see the "Generated by" comment above).
 *
 * Every loop iteration reads eight values from ri and eight from ii, at
 * offsets 0 and WS(is, 1..7), and writes eight values to ro and eight to io,
 * at offsets 0 and WS(os, 1..7).  Only additions, subtractions and four
 * multiplications by KP707106781 are used.
 *
 *   ri, ii    input arrays (only read)
 *   ro, io    output arrays (only written)
 *   is, os    strides handed to WS() to form input / output offsets
 *   v         number of loop iterations
 *   ivs, ovs  added to the input / output pointers after each iteration
 *
 * NOTE(review): presumably ri/ii and ro/io carry the real and imaginary parts
 * of a length-8 complex DFT -- confirm against dft/codelet-dft.h.
 */
static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* The literal is numerically sqrt(2)/2. */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* Counts i from v down to 1; MAKE_VOLATILE_STRIDE is a project macro whose effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
|
||||
E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
|
||||
E TG;
|
||||
/* Load all sixteen inputs and form sums/differences of elements whose indices differ by 4. */
{
|
||||
E T1, T2, Tj, Tk;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 4)];
|
||||
T3 = T1 + T2;
|
||||
Tn = T1 - T2;
|
||||
{
|
||||
E Tg, Th, T4, T5;
|
||||
Tg = ii[0];
|
||||
Th = ii[WS(is, 4)];
|
||||
Ti = Tg + Th;
|
||||
TC = Tg - Th;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 6)];
|
||||
T6 = T4 + T5;
|
||||
TB = T4 - T5;
|
||||
}
|
||||
Tj = ii[WS(is, 2)];
|
||||
Tk = ii[WS(is, 6)];
|
||||
Tl = Tj + Tk;
|
||||
To = Tj - Tk;
|
||||
/* Elements 7 and 3. */
{
|
||||
E Tb, Tc, Tv, Tw, Tx, Ty;
|
||||
Tb = ri[WS(is, 7)];
|
||||
Tc = ri[WS(is, 3)];
|
||||
Tv = Tb - Tc;
|
||||
Tw = ii[WS(is, 7)];
|
||||
Tx = ii[WS(is, 3)];
|
||||
Ty = Tw - Tx;
|
||||
Td = Tb + Tc;
|
||||
TN = Tw + Tx;
|
||||
Tz = Tv - Ty;
|
||||
TH = Tv + Ty;
|
||||
}
|
||||
/* Elements 1 and 5. */
{
|
||||
E T8, T9, Tq, Tr, Ts, Tt;
|
||||
T8 = ri[WS(is, 1)];
|
||||
T9 = ri[WS(is, 5)];
|
||||
Tq = T8 - T9;
|
||||
Tr = ii[WS(is, 1)];
|
||||
Ts = ii[WS(is, 5)];
|
||||
Tt = Tr - Ts;
|
||||
Ta = T8 + T9;
|
||||
TM = Tr + Ts;
|
||||
Tu = Tq + Tt;
|
||||
TG = Tt - Tq;
|
||||
}
|
||||
}
|
||||
/* Outputs 0 and 4: built from the sums only. */
{
|
||||
E T7, Te, TP, TQ;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
ro[WS(os, 4)] = T7 - Te;
|
||||
ro[0] = T7 + Te;
|
||||
TP = Ti + Tl;
|
||||
TQ = TM + TN;
|
||||
io[WS(os, 4)] = TP - TQ;
|
||||
io[0] = TP + TQ;
|
||||
}
|
||||
/* Outputs 2 and 6. */
{
|
||||
E Tf, Tm, TL, TO;
|
||||
Tf = Td - Ta;
|
||||
Tm = Ti - Tl;
|
||||
io[WS(os, 2)] = Tf + Tm;
|
||||
io[WS(os, 6)] = Tm - Tf;
|
||||
TL = T3 - T6;
|
||||
TO = TM - TN;
|
||||
ro[WS(os, 6)] = TL - TO;
|
||||
ro[WS(os, 2)] = TL + TO;
|
||||
}
|
||||
/* Outputs 1 and 5: scaled terms TA and TK are added to / subtracted from Tp and TJ. */
{
|
||||
E Tp, TA, TJ, TK;
|
||||
Tp = Tn + To;
|
||||
TA = KP707106781 * (Tu + Tz);
|
||||
ro[WS(os, 5)] = Tp - TA;
|
||||
ro[WS(os, 1)] = Tp + TA;
|
||||
TJ = TC - TB;
|
||||
TK = KP707106781 * (TG + TH);
|
||||
io[WS(os, 5)] = TJ - TK;
|
||||
io[WS(os, 1)] = TJ + TK;
|
||||
}
|
||||
/* Outputs 3 and 7. */
{
|
||||
E TD, TE, TF, TI;
|
||||
TD = TB + TC;
|
||||
TE = KP707106781 * (Tz - Tu);
|
||||
io[WS(os, 7)] = TD - TE;
|
||||
io[WS(os, 3)] = TD + TE;
|
||||
TF = Tn - To;
|
||||
TI = KP707106781 * (TG - TH);
|
||||
ro[WS(os, 7)] = TF - TI;
|
||||
ro[WS(os, 3)] = TF + TI;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 8, "n1_8", { 52, 4, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_8) (planner *p) { X(kdft_register) (p, n1_8, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,360 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 80 FP additions, 56 FP multiplications,
|
||||
* (or, 24 additions, 0 multiplications, 56 fused multiply/add),
|
||||
* 41 stack variables, 10 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_9, FMA flavour (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined; see the "Generated by" comment above).
 *
 * Every loop iteration reads nine values from ri and nine from ii, at
 * offsets 0 and WS(is, 1..8), and writes nine values to ro and nine to io,
 * at offsets 0 and WS(os, 1..8).
 *
 *   ri, ii    input arrays (only read)
 *   ro, io    output arrays (only written)
 *   is, os    strides handed to WS() to form input / output offsets
 *   v         number of loop iterations
 *   ivs, ovs  added to the input / output pointers after each iteration
 *
 * NOTE(review): presumably ri/ii and ro/io carry the real and imaginary parts
 * of a length-9 complex DFT -- confirm against dft/codelet-dft.h.
 * NOTE(review): FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) is
 * presumably c - a*b -- confirm against the macro definitions.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Ten numeric constants used by the arithmetic below. */
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
|
||||
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
|
||||
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
|
||||
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
|
||||
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
|
||||
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
|
||||
/* The next literal is numerically sqrt(3)/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* Counts i from v down to 1; MAKE_VOLATILE_STRIDE is a project macro whose effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
|
||||
E T5, TL, Tm, Tl, T1f, TM, Ta, T1c, TF, TW, TI, TX, Tf, T1d, Ts;
|
||||
E TZ, Tx, T10;
|
||||
/* ri elements 0, 3, 6. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 3)];
|
||||
T3 = ri[WS(is, 6)];
|
||||
T4 = T2 + T3;
|
||||
T5 = T1 + T4;
|
||||
TL = FNMS(KP500000000, T4, T1);
|
||||
Tm = T3 - T2;
|
||||
}
|
||||
/* ii elements 0, 3, 6. */
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = ii[0];
|
||||
Ti = ii[WS(is, 3)];
|
||||
Tj = ii[WS(is, 6)];
|
||||
Tk = Ti + Tj;
|
||||
Tl = FNMS(KP500000000, Tk, Th);
|
||||
T1f = Th + Tk;
|
||||
TM = Ti - Tj;
|
||||
}
|
||||
/* Elements 1, 4, 7 of both inputs. */
{
|
||||
E T6, Tz, T9, TE, TC, TH, TD, TG;
|
||||
T6 = ri[WS(is, 1)];
|
||||
Tz = ii[WS(is, 1)];
|
||||
{
|
||||
E T7, T8, TA, TB;
|
||||
T7 = ri[WS(is, 4)];
|
||||
T8 = ri[WS(is, 7)];
|
||||
T9 = T7 + T8;
|
||||
TE = T7 - T8;
|
||||
TA = ii[WS(is, 4)];
|
||||
TB = ii[WS(is, 7)];
|
||||
TC = TA + TB;
|
||||
TH = TB - TA;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
T1c = Tz + TC;
|
||||
TD = FNMS(KP500000000, TC, Tz);
|
||||
TF = FNMS(KP866025403, TE, TD);
|
||||
TW = FMA(KP866025403, TE, TD);
|
||||
TG = FNMS(KP500000000, T9, T6);
|
||||
TI = FNMS(KP866025403, TH, TG);
|
||||
TX = FMA(KP866025403, TH, TG);
|
||||
}
|
||||
/* Elements 2, 5, 8 of both inputs. */
{
|
||||
E Tb, Tt, Te, Tw, Tr, Tu, To, Tv;
|
||||
Tb = ri[WS(is, 2)];
|
||||
Tt = ii[WS(is, 2)];
|
||||
{
|
||||
E Tc, Td, Tp, Tq;
|
||||
Tc = ri[WS(is, 5)];
|
||||
Td = ri[WS(is, 8)];
|
||||
Te = Tc + Td;
|
||||
Tw = Td - Tc;
|
||||
Tp = ii[WS(is, 5)];
|
||||
Tq = ii[WS(is, 8)];
|
||||
Tr = Tp - Tq;
|
||||
Tu = Tp + Tq;
|
||||
}
|
||||
Tf = Tb + Te;
|
||||
T1d = Tt + Tu;
|
||||
To = FNMS(KP500000000, Te, Tb);
|
||||
Ts = FMA(KP866025403, Tr, To);
|
||||
TZ = FNMS(KP866025403, Tr, To);
|
||||
Tv = FNMS(KP500000000, Tu, Tt);
|
||||
Tx = FMA(KP866025403, Tw, Tv);
|
||||
T10 = FNMS(KP866025403, Tw, Tv);
|
||||
}
|
||||
/* Outputs 0, 3 and 6: built from the three-element sums T5, Ta, Tf, T1f, T1c, T1d. */
{
|
||||
E T1e, Tg, T1b, T1i, T1g, T1h;
|
||||
T1e = T1c - T1d;
|
||||
Tg = Ta + Tf;
|
||||
T1b = FNMS(KP500000000, Tg, T5);
|
||||
ro[0] = T5 + Tg;
|
||||
ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b);
|
||||
ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b);
|
||||
T1i = Tf - Ta;
|
||||
T1g = T1c + T1d;
|
||||
T1h = FNMS(KP500000000, T1g, T1f);
|
||||
io[WS(os, 3)] = FMA(KP866025403, T1i, T1h);
|
||||
io[0] = T1f + T1g;
|
||||
io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h);
|
||||
}
|
||||
/* Outputs 1, 4 and 7. */
{
|
||||
E Tn, TN, TK, TS, TQ, TU, TR, TT;
|
||||
Tn = FMA(KP866025403, Tm, Tl);
|
||||
TN = FMA(KP866025403, TM, TL);
|
||||
{
|
||||
E Ty, TJ, TO, TP;
|
||||
Ty = FNMS(KP176326980, Tx, Ts);
|
||||
TJ = FNMS(KP839099631, TI, TF);
|
||||
TK = FNMS(KP777861913, TJ, Ty);
|
||||
TS = FMA(KP777861913, TJ, Ty);
|
||||
TO = FMA(KP176326980, Ts, Tx);
|
||||
TP = FMA(KP839099631, TF, TI);
|
||||
TQ = FMA(KP777861913, TP, TO);
|
||||
TU = FNMS(KP777861913, TP, TO);
|
||||
}
|
||||
io[WS(os, 1)] = FNMS(KP984807753, TK, Tn);
|
||||
ro[WS(os, 1)] = FMA(KP984807753, TQ, TN);
|
||||
TR = FNMS(KP492403876, TQ, TN);
|
||||
ro[WS(os, 4)] = FMA(KP852868531, TS, TR);
|
||||
ro[WS(os, 7)] = FNMS(KP852868531, TS, TR);
|
||||
TT = FMA(KP492403876, TK, Tn);
|
||||
io[WS(os, 7)] = FNMS(KP852868531, TU, TT);
|
||||
io[WS(os, 4)] = FMA(KP852868531, TU, TT);
|
||||
}
|
||||
/* Outputs 2, 5 and 8. */
{
|
||||
E TV, T17, T12, T1a, T16, T18, T13, T19;
|
||||
TV = FNMS(KP866025403, TM, TL);
|
||||
T17 = FNMS(KP866025403, Tm, Tl);
|
||||
{
|
||||
E TY, T11, T14, T15;
|
||||
TY = FMA(KP176326980, TX, TW);
|
||||
T11 = FNMS(KP363970234, T10, TZ);
|
||||
T12 = FNMS(KP954188894, T11, TY);
|
||||
T1a = FMA(KP954188894, T11, TY);
|
||||
T14 = FNMS(KP176326980, TW, TX);
|
||||
T15 = FMA(KP363970234, TZ, T10);
|
||||
T16 = FNMS(KP954188894, T15, T14);
|
||||
T18 = FMA(KP954188894, T15, T14);
|
||||
}
|
||||
ro[WS(os, 2)] = FMA(KP984807753, T12, TV);
|
||||
io[WS(os, 2)] = FNMS(KP984807753, T18, T17);
|
||||
T13 = FNMS(KP492403876, T12, TV);
|
||||
ro[WS(os, 5)] = FNMS(KP852868531, T16, T13);
|
||||
ro[WS(os, 8)] = FMA(KP852868531, T16, T13);
|
||||
T19 = FMA(KP492403876, T18, T17);
|
||||
io[WS(os, 5)] = FNMS(KP852868531, T1a, T19);
|
||||
io[WS(os, 8)] = FMA(KP852868531, T1a, T19);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 9, "n1_9", { 24, 0, 56, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_9) (planner *p) { X(kdft_register) (p, n1_9, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 80 FP additions, 40 FP multiplications,
|
||||
* (or, 60 additions, 20 multiplications, 20 fused multiply/add),
|
||||
* 39 stack variables, 8 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_9, non-FMA flavour (the #else branch of the ARCH_PREFERS_FMA /
 * ISA_EXTENSION_PREFERS_FMA test; see the "Generated by" comment above).
 *
 * Every loop iteration reads nine values from ri and nine from ii, at
 * offsets 0 and WS(is, 1..8), and writes nine values to ro and nine to io,
 * at offsets 0 and WS(os, 1..8).
 *
 *   ri, ii    input arrays (only read)
 *   ro, io    output arrays (only written)
 *   is, os    strides handed to WS() to form input / output offsets
 *   v         number of loop iterations
 *   ivs, ovs  added to the input / output pointers after each iteration
 *
 * NOTE(review): presumably ri/ii and ro/io carry the real and imaginary parts
 * of a length-9 complex DFT -- confirm against dft/codelet-dft.h.
 * NOTE(review): FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) is
 * presumably c - a*b -- confirm against the macro definitions.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numerically: cos 20 deg, sin 20 deg, cos 10 deg, sin 10 deg, sin 40 deg, cos 40 deg, 1/2, sqrt(3)/2. */
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
|
||||
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
|
||||
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
|
||||
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT i;
|
||||
/* Counts i from v down to 1; MAKE_VOLATILE_STRIDE is a project macro whose effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
|
||||
E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB;
|
||||
E T10, TG, TZ;
|
||||
/* ri elements 0, 3, 6. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 3)];
|
||||
T3 = ri[WS(is, 6)];
|
||||
T4 = T2 + T3;
|
||||
T5 = T1 + T4;
|
||||
TO = KP866025403 * (T3 - T2);
|
||||
Th = FNMS(KP500000000, T4, T1);
|
||||
}
|
||||
/* ii elements 0, 3, 6. */
{
|
||||
E TP, Ti, Tj, TQ;
|
||||
TP = ii[0];
|
||||
Ti = ii[WS(is, 3)];
|
||||
Tj = ii[WS(is, 6)];
|
||||
TQ = Ti + Tj;
|
||||
Tk = KP866025403 * (Ti - Tj);
|
||||
T1g = TP + TQ;
|
||||
TR = FNMS(KP500000000, TQ, TP);
|
||||
}
|
||||
/* Elements 1, 4, 7 of both inputs. */
{
|
||||
E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu;
|
||||
T6 = ri[WS(is, 1)];
|
||||
Ts = ii[WS(is, 1)];
|
||||
{
|
||||
E T7, T8, Tn, To;
|
||||
T7 = ri[WS(is, 4)];
|
||||
T8 = ri[WS(is, 7)];
|
||||
T9 = T7 + T8;
|
||||
Tr = KP866025403 * (T8 - T7);
|
||||
Tn = ii[WS(is, 4)];
|
||||
To = ii[WS(is, 7)];
|
||||
Tp = KP866025403 * (Tn - To);
|
||||
Tt = Tn + To;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
T1c = Ts + Tt;
|
||||
Tm = FNMS(KP500000000, T9, T6);
|
||||
Tq = Tm + Tp;
|
||||
TW = Tm - Tp;
|
||||
Tu = FNMS(KP500000000, Tt, Ts);
|
||||
Tv = Tr + Tu;
|
||||
TX = Tu - Tr;
|
||||
}
|
||||
/* Elements 2, 5, 8 of both inputs. */
{
|
||||
E Tb, TD, Te, TC, TA, TE, Tx, TF;
|
||||
Tb = ri[WS(is, 2)];
|
||||
TD = ii[WS(is, 2)];
|
||||
{
|
||||
E Tc, Td, Ty, Tz;
|
||||
Tc = ri[WS(is, 5)];
|
||||
Td = ri[WS(is, 8)];
|
||||
Te = Tc + Td;
|
||||
TC = KP866025403 * (Td - Tc);
|
||||
Ty = ii[WS(is, 5)];
|
||||
Tz = ii[WS(is, 8)];
|
||||
TA = KP866025403 * (Ty - Tz);
|
||||
TE = Ty + Tz;
|
||||
}
|
||||
Tf = Tb + Te;
|
||||
T1d = TD + TE;
|
||||
Tx = FNMS(KP500000000, Te, Tb);
|
||||
TB = Tx + TA;
|
||||
T10 = Tx - TA;
|
||||
TF = FNMS(KP500000000, TE, TD);
|
||||
TG = TC + TF;
|
||||
TZ = TF - TC;
|
||||
}
|
||||
/* Outputs 0, 3 and 6: built from the three-element sums T5, Ta, Tf, T1g, T1c, T1d. */
{
|
||||
E T1e, Tg, T1b, T1f, T1h, T1i;
|
||||
T1e = KP866025403 * (T1c - T1d);
|
||||
Tg = Ta + Tf;
|
||||
T1b = FNMS(KP500000000, Tg, T5);
|
||||
ro[0] = T5 + Tg;
|
||||
ro[WS(os, 3)] = T1b + T1e;
|
||||
ro[WS(os, 6)] = T1b - T1e;
|
||||
T1f = KP866025403 * (Tf - Ta);
|
||||
T1h = T1c + T1d;
|
||||
T1i = FNMS(KP500000000, T1h, T1g);
|
||||
io[WS(os, 3)] = T1f + T1i;
|
||||
io[0] = T1g + T1h;
|
||||
io[WS(os, 6)] = T1i - T1f;
|
||||
}
|
||||
/* Outputs 1, 4 and 7. */
{
|
||||
E Tl, TS, TI, TN, TM, TT, TJ, TU;
|
||||
Tl = Th + Tk;
|
||||
TS = TO + TR;
|
||||
{
|
||||
E Tw, TH, TK, TL;
|
||||
Tw = FMA(KP766044443, Tq, KP642787609 * Tv);
|
||||
TH = FMA(KP173648177, TB, KP984807753 * TG);
|
||||
TI = Tw + TH;
|
||||
TN = KP866025403 * (TH - Tw);
|
||||
TK = FNMS(KP642787609, Tq, KP766044443 * Tv);
|
||||
TL = FNMS(KP984807753, TB, KP173648177 * TG);
|
||||
TM = KP866025403 * (TK - TL);
|
||||
TT = TK + TL;
|
||||
}
|
||||
ro[WS(os, 1)] = Tl + TI;
|
||||
io[WS(os, 1)] = TS + TT;
|
||||
TJ = FNMS(KP500000000, TI, Tl);
|
||||
ro[WS(os, 7)] = TJ - TM;
|
||||
ro[WS(os, 4)] = TJ + TM;
|
||||
TU = FNMS(KP500000000, TT, TS);
|
||||
io[WS(os, 4)] = TN + TU;
|
||||
io[WS(os, 7)] = TU - TN;
|
||||
}
|
||||
/* Outputs 2, 5 and 8. */
{
|
||||
E TV, T14, T12, T13, T17, T1a, T18, T19;
|
||||
TV = Th - Tk;
|
||||
T14 = TR - TO;
|
||||
{
|
||||
E TY, T11, T15, T16;
|
||||
TY = FMA(KP173648177, TW, KP984807753 * TX);
|
||||
T11 = FNMS(KP939692620, T10, KP342020143 * TZ);
|
||||
T12 = TY + T11;
|
||||
T13 = KP866025403 * (T11 - TY);
|
||||
T15 = FNMS(KP984807753, TW, KP173648177 * TX);
|
||||
T16 = FMA(KP342020143, T10, KP939692620 * TZ);
|
||||
T17 = T15 - T16;
|
||||
T1a = KP866025403 * (T15 + T16);
|
||||
}
|
||||
ro[WS(os, 2)] = TV + T12;
|
||||
io[WS(os, 2)] = T14 + T17;
|
||||
T18 = FNMS(KP500000000, T17, T14);
|
||||
io[WS(os, 5)] = T13 + T18;
|
||||
io[WS(os, 8)] = T18 - T13;
|
||||
T19 = FNMS(KP500000000, T12, TV);
|
||||
ro[WS(os, 8)] = T19 - T1a;
|
||||
ro[WS(os, 5)] = T19 + T1a;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 9, "n1_9", { 60, 20, 20, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_9) (planner *p) { X(kdft_register) (p, n1_9, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,149 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 8 FP multiplications,
|
||||
* (or, 8 additions, 4 multiplications, 4 fused multiply/add),
|
||||
* 17 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_2, FMA flavour.  Works in place on rio / iio.
 *
 * For each m in [mb, me) the loop touches a 2 x 2 group of elements of rio
 * and of iio, addressed as WS(vs, j) + WS(rs, k) with j, k in {0, 1}:
 *   - the sum over k of the elements with vs-index j is stored at rs-index j
 *     (vs-index 0);
 *   - the difference over k of the elements with vs-index j is combined with
 *     W[0] and W[1] and stored at vs-index 1, rs-index j.
 * After each iteration rio and iio advance by ms and W advances by 2; W is
 * first offset by mb * 2.
 *
 * Local names: rJK / iJK are the values loaded from rio / iio at vs-index J
 * and rs-index K; drJ / diJ are the corresponding differences.
 *
 * NOTE(review): FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) is
 * presumably c - a*b -- confirm against the macro definitions.
 */
static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 2;
     for (m = mb; m < me; ++m, rio += ms, iio += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
          E r00, r01, i00, i01, r10, r11, i10, i11;
          E dr0, di0, dr1, di1;

          /* All eight loads happen before any store, as in the generated order. */
          r00 = rio[0];
          r01 = rio[WS(rs, 1)];
          i00 = iio[0];
          i01 = iio[WS(rs, 1)];
          r10 = rio[WS(vs, 1)];
          r11 = rio[WS(vs, 1) + WS(rs, 1)];
          i10 = iio[WS(vs, 1)];
          i11 = iio[WS(vs, 1) + WS(rs, 1)];

          dr0 = r00 - r01;
          di0 = i00 - i01;
          dr1 = r10 - r11;
          di1 = i10 - i11;

          /* Sums go to vs-index 0. */
          rio[0] = r00 + r01;
          iio[0] = i00 + i01;
          rio[WS(rs, 1)] = r10 + r11;
          iio[WS(rs, 1)] = i10 + i11;

          /* Differences of the vs-index 1 elements, combined with W[0], W[1]. */
          {
               E w0, w1, w0_dr, w0_di;

               w0 = W[0];
               w0_dr = w0 * dr1;
               w0_di = w0 * di1;
               w1 = W[1];
               rio[WS(vs, 1) + WS(rs, 1)] = FMA(w1, di1, w0_dr);
               iio[WS(vs, 1) + WS(rs, 1)] = FNMS(w1, dr1, w0_di);
          }

          /* Differences of the vs-index 0 elements; W[0], W[1] are read again. */
          {
               E w0, w1, w0_dr, w0_di;

               w0 = W[0];
               w0_dr = w0 * dr0;
               w0_di = w0 * di0;
               w1 = W[1];
               rio[WS(vs, 1)] = FMA(w1, di0, w0_dr);
               iio[WS(vs, 1)] = FNMS(w1, dr0, w0_di);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 2, "q1_2", twinstr, &GENUS, { 8, 4, 4, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_q1_2) (planner *p) {
|
||||
X(kdft_difsq_register) (p, q1_2, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 8 FP multiplications,
|
||||
* (or, 8 additions, 4 multiplications, 4 fused multiply/add),
|
||||
* 17 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_2, non-FMA flavour.  Works in place on rio / iio.
 *
 * For each m in [mb, me) the loop touches a 2 x 2 group of elements of rio
 * and of iio, addressed as WS(vs, j) + WS(rs, k) with j, k in {0, 1}:
 *   - the sum over k of the elements with vs-index j is stored at rs-index j
 *     (vs-index 0);
 *   - the difference over k of the elements with vs-index j is combined with
 *     W[0] and W[1] and stored at vs-index 1, rs-index j.
 * After each iteration rio and iio advance by ms and W advances by 2; W is
 * first offset by mb * 2.
 *
 * Local names: rJK / iJK are the values loaded from rio / iio at vs-index J
 * and rs-index K; drJ / diJ are the corresponding differences.
 *
 * NOTE(review): FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) is
 * presumably c - a*b -- confirm against the macro definitions.
 */
static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 2;
     for (m = mb; m < me; ++m, rio += ms, iio += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
          E r00, r01, i00, i01, r10, r11, i10, i11;
          E dr0, di0, dr1, di1;

          /* All eight loads happen before any store, as in the generated order. */
          r00 = rio[0];
          r01 = rio[WS(rs, 1)];
          i00 = iio[0];
          i01 = iio[WS(rs, 1)];
          r10 = rio[WS(vs, 1)];
          r11 = rio[WS(vs, 1) + WS(rs, 1)];
          i10 = iio[WS(vs, 1)];
          i11 = iio[WS(vs, 1) + WS(rs, 1)];

          dr0 = r00 - r01;
          di0 = i00 - i01;
          dr1 = r10 - r11;
          di1 = i10 - i11;

          /* Sums go to vs-index 0. */
          rio[0] = r00 + r01;
          iio[0] = i00 + i01;
          rio[WS(rs, 1)] = r10 + r11;
          iio[WS(rs, 1)] = i10 + i11;

          /* Differences of the vs-index 1 elements, combined with W[0], W[1]. */
          {
               E w0, w1;

               w0 = W[0];
               w1 = W[1];
               rio[WS(vs, 1) + WS(rs, 1)] = FMA(w0, dr1, w1 * di1);
               iio[WS(vs, 1) + WS(rs, 1)] = FNMS(w1, dr1, w0 * di1);
          }

          /* Differences of the vs-index 0 elements; W[0], W[1] are read again. */
          {
               E w0, w1;

               w0 = W[0];
               w1 = W[1];
               rio[WS(vs, 1)] = FMA(w0, dr0, w1 * di0);
               iio[WS(vs, 1)] = FNMS(w1, dr0, w0 * di0);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 2, "q1_2", twinstr, &GENUS, { 8, 4, 4, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_q1_2) (planner *p) {
|
||||
X(kdft_difsq_register) (p, q1_2, &desc);
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,316 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 48 FP additions, 42 FP multiplications,
|
||||
* (or, 18 additions, 12 multiplications, 30 fused multiply/add),
|
||||
* 35 stack variables, 2 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_3, FMA flavour (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined; see the "Generated by" comment above).
 * Works in place on rio / iio.
 *
 * For each m in [mb, me) the loop touches a 3 x 3 group of elements of rio
 * and of iio, addressed as WS(vs, j) + WS(rs, k) with j, k in {0, 1, 2}.
 * All eighteen values are loaded first.  Results computed from the elements
 * with vs-index j are then stored at rs-index j:
 *   - the plain three-element sums go to vs-index 0;
 *   - the values stored at vs-index 1 are multiplied by W[0], W[1];
 *   - the values stored at vs-index 2 are multiplied by W[2], W[3].
 * After each iteration rio and iio advance by ms and W advances by 4; W is
 * first offset by mb * 4.
 *
 * NOTE(review): FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) is
 * presumably c - a*b -- confirm against the macro definitions.
 * NOTE(review): presumably rio / iio hold real / imaginary parts and W holds
 * twiddle factors -- confirm against dft/codelet-dft.h.
 */
static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* The first literal is numerically sqrt(3)/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* MAKE_VOLATILE_STRIDE is a project macro whose effect is not visible here. */
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
|
||||
E T1, T4, T6, Tg, Td, Te, T9, Tf, Tp, Ts, Tu, TE, TB, TC, Tx;
|
||||
E TD, TZ, T10, TV, T11, TN, TQ, TS, T12;
|
||||
/* Load phase: every input is read here, before any store below. */
{
|
||||
E T2, T3, Tv, Tw;
|
||||
/* rio, vs-index 0. */
T1 = rio[0];
|
||||
T2 = rio[WS(rs, 1)];
|
||||
T3 = rio[WS(rs, 2)];
|
||||
T4 = T2 + T3;
|
||||
T6 = FNMS(KP500000000, T4, T1);
|
||||
Tg = T3 - T2;
|
||||
{
|
||||
E T7, T8, Tq, Tr;
|
||||
/* iio, vs-index 0. */
Td = iio[0];
|
||||
T7 = iio[WS(rs, 1)];
|
||||
T8 = iio[WS(rs, 2)];
|
||||
Te = T7 + T8;
|
||||
T9 = T7 - T8;
|
||||
Tf = FNMS(KP500000000, Te, Td);
|
||||
/* rio, vs-index 1. */
Tp = rio[WS(vs, 1)];
|
||||
Tq = rio[WS(vs, 1) + WS(rs, 1)];
|
||||
Tr = rio[WS(vs, 1) + WS(rs, 2)];
|
||||
Ts = Tq + Tr;
|
||||
Tu = FNMS(KP500000000, Ts, Tp);
|
||||
TE = Tr - Tq;
|
||||
}
|
||||
/* iio, vs-index 1. */
TB = iio[WS(vs, 1)];
|
||||
Tv = iio[WS(vs, 1) + WS(rs, 1)];
|
||||
Tw = iio[WS(vs, 1) + WS(rs, 2)];
|
||||
TC = Tv + Tw;
|
||||
Tx = Tv - Tw;
|
||||
TD = FNMS(KP500000000, TC, TB);
|
||||
{
|
||||
E TT, TU, TO, TP;
|
||||
/* iio, vs-index 2. */
TZ = iio[WS(vs, 2)];
|
||||
TT = iio[WS(vs, 2) + WS(rs, 1)];
|
||||
TU = iio[WS(vs, 2) + WS(rs, 2)];
|
||||
T10 = TT + TU;
|
||||
TV = TT - TU;
|
||||
T11 = FNMS(KP500000000, T10, TZ);
|
||||
/* rio, vs-index 2. */
TN = rio[WS(vs, 2)];
|
||||
TO = rio[WS(vs, 2) + WS(rs, 1)];
|
||||
TP = rio[WS(vs, 2) + WS(rs, 2)];
|
||||
TQ = TO + TP;
|
||||
TS = FNMS(KP500000000, TQ, TN);
|
||||
T12 = TP - TO;
|
||||
}
|
||||
}
|
||||
/* Three-element sums: the sum for vs-index j is stored at rs-index j, vs-index 0. */
rio[0] = T1 + T4;
|
||||
iio[0] = Td + Te;
|
||||
rio[WS(rs, 1)] = Tp + Ts;
|
||||
iio[WS(rs, 1)] = TB + TC;
|
||||
iio[WS(rs, 2)] = TZ + T10;
|
||||
rio[WS(rs, 2)] = TN + TQ;
|
||||
/* From the vs-index 0 inputs, scaled by W[0], W[1] -> vs-index 1, rs-index 0. */
{
|
||||
E Ta, Th, Tb, Ti, T5, Tc;
|
||||
Ta = FMA(KP866025403, T9, T6);
|
||||
Th = FMA(KP866025403, Tg, Tf);
|
||||
T5 = W[0];
|
||||
Tb = T5 * Ta;
|
||||
Ti = T5 * Th;
|
||||
Tc = W[1];
|
||||
rio[WS(vs, 1)] = FMA(Tc, Th, Tb);
|
||||
iio[WS(vs, 1)] = FNMS(Tc, Ta, Ti);
|
||||
}
|
||||
/* From the vs-index 2 inputs, scaled by W[2], W[3] -> vs-index 2, rs-index 2. */
{
|
||||
E T16, T19, T17, T1a, T15, T18;
|
||||
T16 = FNMS(KP866025403, TV, TS);
|
||||
T19 = FNMS(KP866025403, T12, T11);
|
||||
T15 = W[2];
|
||||
T17 = T15 * T16;
|
||||
T1a = T15 * T19;
|
||||
T18 = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T18, T19, T17);
|
||||
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T18, T16, T1a);
|
||||
}
|
||||
/* From the vs-index 1 inputs, scaled by W[2], W[3] -> vs-index 2, rs-index 1. */
{
|
||||
E TI, TL, TJ, TM, TH, TK;
|
||||
TI = FNMS(KP866025403, Tx, Tu);
|
||||
TL = FNMS(KP866025403, TE, TD);
|
||||
TH = W[2];
|
||||
TJ = TH * TI;
|
||||
TM = TH * TL;
|
||||
TK = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 1)] = FMA(TK, TL, TJ);
|
||||
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TK, TI, TM);
|
||||
}
|
||||
/* From the vs-index 1 inputs, scaled by W[0], W[1] -> vs-index 1, rs-index 1. */
{
|
||||
E Ty, TF, Tz, TG, Tt, TA;
|
||||
Ty = FMA(KP866025403, Tx, Tu);
|
||||
TF = FMA(KP866025403, TE, TD);
|
||||
Tt = W[0];
|
||||
Tz = Tt * Ty;
|
||||
TG = Tt * TF;
|
||||
TA = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TA, TF, Tz);
|
||||
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TA, Ty, TG);
|
||||
}
|
||||
/* From the vs-index 2 inputs, scaled by W[0], W[1] -> vs-index 1, rs-index 2. */
{
|
||||
E TW, T13, TX, T14, TR, TY;
|
||||
TW = FMA(KP866025403, TV, TS);
|
||||
T13 = FMA(KP866025403, T12, T11);
|
||||
TR = W[0];
|
||||
TX = TR * TW;
|
||||
T14 = TR * T13;
|
||||
TY = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 2)] = FMA(TY, T13, TX);
|
||||
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TY, TW, T14);
|
||||
}
|
||||
/* From the vs-index 0 inputs, scaled by W[2], W[3] -> vs-index 2, rs-index 0. */
{
|
||||
E Tk, Tn, Tl, To, Tj, Tm;
|
||||
Tk = FNMS(KP866025403, T9, T6);
|
||||
Tn = FNMS(KP866025403, Tg, Tf);
|
||||
Tj = W[2];
|
||||
Tl = Tj * Tk;
|
||||
To = Tj * Tn;
|
||||
Tm = W[3];
|
||||
rio[WS(vs, 2)] = FMA(Tm, Tn, Tl);
|
||||
iio[WS(vs, 2)] = FNMS(Tm, Tk, To);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, { 18, 12, 30, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_q1_3) (planner *p) {
|
||||
X(kdft_difsq_register) (p, q1_3, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 48 FP additions, 36 FP multiplications,
|
||||
* (or, 30 additions, 18 multiplications, 18 fused multiply/add),
|
||||
* 35 stack variables, 2 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_3 -- in-place 3x3 twiddle-square kernel (variant without fused
 * multiply/add expansion).
 *
 * For every m in [mb, me) the loop reads nine complex values: real parts from
 * rio[WS(vs, i) + WS(rs, j)] and imaginary parts from iio at the same offsets
 * (i, j = 0..2).  The three rs-entries of each vs-slot i go through a 3-point
 * butterfly (constants 0.5 and 0.866...).  The plain sum of slot i is stored
 * at offset WS(rs, i); the two other outputs are combined with the twiddle
 * pairs (W[0], W[1]) and (W[2], W[3]) and stored at WS(vs, 1) + WS(rs, i) and
 * WS(vs, 2) + WS(rs, i), i.e. at transposed offsets.  Every load is issued
 * before the first store.  Each iteration advances rio and iio by ms and W
 * by 4.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are assumed to mean a*b + c
 * and c - a*b as defined in the project headers -- confirm.
 */
static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    {
        INT m;
        for (m = mb, W += mb * 4; m < me;
             ++m, rio += ms, iio += ms, W += 4,
             MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
            E T1, T4, T6, Tc, Td, Te, T9, Tf, Tl, To, Tq, Tw, Tx, Ty, Tt;
            E Tz, TR, TS, TN, TT, TF, TI, TK, TQ;

            /* Load phase: fetch all nine complex inputs and form the butterfly partials. */
            {
                E T2, T3, Tr, Ts;
                /* vs-slot 0, real parts */
                T1 = rio[0];
                T2 = rio[WS(rs, 1)];
                T3 = rio[WS(rs, 2)];
                T4 = T2 + T3;
                T6 = FNMS(KP500000000, T4, T1);
                Tc = KP866025403 * (T3 - T2);
                {
                    E T7, T8, Tm, Tn;
                    /* vs-slot 0, imaginary parts */
                    Td = iio[0];
                    T7 = iio[WS(rs, 1)];
                    T8 = iio[WS(rs, 2)];
                    Te = T7 + T8;
                    T9 = KP866025403 * (T7 - T8);
                    Tf = FNMS(KP500000000, Te, Td);
                    /* vs-slot 1, real parts */
                    Tl = rio[WS(vs, 1)];
                    Tm = rio[WS(vs, 1) + WS(rs, 1)];
                    Tn = rio[WS(vs, 1) + WS(rs, 2)];
                    To = Tm + Tn;
                    Tq = FNMS(KP500000000, To, Tl);
                    Tw = KP866025403 * (Tn - Tm);
                }
                /* vs-slot 1, imaginary parts */
                Tx = iio[WS(vs, 1)];
                Tr = iio[WS(vs, 1) + WS(rs, 1)];
                Ts = iio[WS(vs, 1) + WS(rs, 2)];
                Ty = Tr + Ts;
                Tt = KP866025403 * (Tr - Ts);
                Tz = FNMS(KP500000000, Ty, Tx);
                {
                    E TL, TM, TG, TH;
                    /* vs-slot 2, imaginary then real parts */
                    TR = iio[WS(vs, 2)];
                    TL = iio[WS(vs, 2) + WS(rs, 1)];
                    TM = iio[WS(vs, 2) + WS(rs, 2)];
                    TS = TL + TM;
                    TN = KP866025403 * (TL - TM);
                    TT = FNMS(KP500000000, TS, TR);
                    TF = rio[WS(vs, 2)];
                    TG = rio[WS(vs, 2) + WS(rs, 1)];
                    TH = rio[WS(vs, 2) + WS(rs, 2)];
                    TI = TG + TH;
                    TK = FNMS(KP500000000, TI, TF);
                    TQ = KP866025403 * (TH - TG);
                }
            }

            /* Untwiddled sums: slot i is written to offset WS(rs, i). */
            rio[0] = T1 + T4;
            iio[0] = Td + Te;
            rio[WS(rs, 1)] = Tl + To;
            iio[WS(rs, 1)] = Tx + Ty;
            iio[WS(rs, 2)] = TR + TS;
            rio[WS(rs, 2)] = TF + TI;

            /* Twiddled outputs; each scope reloads its twiddle pair from W. */
            {
                const E Ta = T6 + T9, Tg = Tc + Tf;
                const E T5 = W[0], Tb = W[1];
                rio[WS(vs, 1)] = FMA(T5, Ta, Tb * Tg);
                iio[WS(vs, 1)] = FNMS(Tb, Ta, T5 * Tg);
            }
            {
                const E TW = TK - TN, TY = TT - TQ;
                const E TV = W[2], TX = W[3];
                rio[WS(vs, 2) + WS(rs, 2)] = FMA(TV, TW, TX * TY);
                iio[WS(vs, 2) + WS(rs, 2)] = FNMS(TX, TW, TV * TY);
            }
            {
                const E TC = Tq - Tt, TE = Tz - Tw;
                const E TB = W[2], TD = W[3];
                rio[WS(vs, 2) + WS(rs, 1)] = FMA(TB, TC, TD * TE);
                iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TD, TC, TB * TE);
            }
            {
                const E Tu = Tq + Tt, TA = Tw + Tz;
                const E Tp = W[0], Tv = W[1];
                rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tp, Tu, Tv * TA);
                iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tv, Tu, Tp * TA);
            }
            {
                const E TO = TK + TN, TU = TQ + TT;
                const E TJ = W[0], TP = W[1];
                rio[WS(vs, 1) + WS(rs, 2)] = FMA(TJ, TO, TP * TU);
                iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TP, TO, TJ * TU);
            }
            {
                const E Ti = T6 - T9, Tk = Tf - Tc;
                const E Th = W[2], Tj = W[3];
                rio[WS(vs, 2)] = FMA(Th, Ti, Tj * Tk);
                iio[WS(vs, 2)] = FNMS(Tj, Ti, Th * Tk);
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, { 30, 18, 18, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Hands the q1_3 kernel and its descriptor to X(kdft_difsq_register) for planner p. */
void X(codelet_q1_3)(planner *p)
{
    X(kdft_difsq_register)(p, q1_3, &desc);
}
|
||||
#endif
|
||||
@@ -0,0 +1,524 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 88 FP additions, 48 FP multiplications,
|
||||
* (or, 64 additions, 24 multiplications, 24 fused multiply/add),
|
||||
* 51 stack variables, 0 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_4 -- in-place 4x4 twiddle-square kernel (FMA-preferring variant).
 *
 * For every m in [mb, me) the loop reads sixteen complex values: real parts
 * from rio[WS(vs, i) + WS(rs, j)] and imaginary parts from iio at the same
 * offsets (i, j = 0..3).  The four rs-entries of each vs-slot i go through a
 * 4-point add/subtract butterfly.  The plain sum of slot i is stored at
 * offset WS(rs, i); the other three outputs are combined with the twiddle
 * pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]) and stored at
 * WS(vs, 1..3) + WS(rs, i), i.e. at transposed offsets.  Every load is issued
 * before the first store.  Each iteration advances rio and iio by ms and W
 * by 6.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are assumed to mean a*b + c
 * and c - a*b as defined in the project headers -- confirm.
 */
static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
    {
        INT m;
        for (m = mb, W += mb * 6; m < me;
             ++m, rio += ms, iio += ms, W += 6,
             MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
            E T3, Tv, Tw, T6, Tc, Tf, Tx, Ts, Tm, Ti, T1H, T29, T2a, T1K, T1Q;
            E T1T, T2b, T26, T20, T1W, TB, T13, T14, TE, TK, TN, T15, T10, TU, TQ;
            E T19, T1B, T1C, T1c, T1i, T1l, T1D, T1y, T1s, T1o;

            /* vs-slot 0: loads and butterfly partials */
            {
                E T1, T2, Tb, Tg, Th, T8;
                {
                    E T9, Ta, T4, T5;
                    T1 = rio[0];
                    T2 = rio[WS(rs, 2)];
                    T3 = T1 + T2;
                    T9 = iio[0];
                    Ta = iio[WS(rs, 2)];
                    Tb = T9 - Ta;
                    Tv = T9 + Ta;
                    Tg = iio[WS(rs, 1)];
                    Th = iio[WS(rs, 3)];
                    Tw = Tg + Th;
                    T4 = rio[WS(rs, 1)];
                    T5 = rio[WS(rs, 3)];
                    T6 = T4 + T5;
                    T8 = T4 - T5;
                }
                Tc = T8 + Tb;
                Tf = T1 - T2;
                Tx = Tv - Tw;
                Ts = T3 - T6;
                Tm = Tb - T8;
                Ti = Tg - Th;
            }
            /* vs-slot 3: loads and butterfly partials */
            {
                E T1F, T1G, T1P, T1U, T1V, T1M;
                {
                    E T1N, T1O, T1I, T1J;
                    T1F = rio[WS(vs, 3)];
                    T1G = rio[WS(vs, 3) + WS(rs, 2)];
                    T1H = T1F + T1G;
                    T1N = iio[WS(vs, 3)];
                    T1O = iio[WS(vs, 3) + WS(rs, 2)];
                    T1P = T1N - T1O;
                    T29 = T1N + T1O;
                    T1U = iio[WS(vs, 3) + WS(rs, 1)];
                    T1V = iio[WS(vs, 3) + WS(rs, 3)];
                    T2a = T1U + T1V;
                    T1I = rio[WS(vs, 3) + WS(rs, 1)];
                    T1J = rio[WS(vs, 3) + WS(rs, 3)];
                    T1K = T1I + T1J;
                    T1M = T1I - T1J;
                }
                T1Q = T1M + T1P;
                T1T = T1F - T1G;
                T2b = T29 - T2a;
                T26 = T1H - T1K;
                T20 = T1P - T1M;
                T1W = T1U - T1V;
            }
            /* vs-slot 1: loads and butterfly partials */
            {
                E Tz, TA, TJ, TO, TP, TG;
                {
                    E TH, TI, TC, TD;
                    Tz = rio[WS(vs, 1)];
                    TA = rio[WS(vs, 1) + WS(rs, 2)];
                    TB = Tz + TA;
                    TH = iio[WS(vs, 1)];
                    TI = iio[WS(vs, 1) + WS(rs, 2)];
                    TJ = TH - TI;
                    T13 = TH + TI;
                    TO = iio[WS(vs, 1) + WS(rs, 1)];
                    TP = iio[WS(vs, 1) + WS(rs, 3)];
                    T14 = TO + TP;
                    TC = rio[WS(vs, 1) + WS(rs, 1)];
                    TD = rio[WS(vs, 1) + WS(rs, 3)];
                    TE = TC + TD;
                    TG = TC - TD;
                }
                TK = TG + TJ;
                TN = Tz - TA;
                T15 = T13 - T14;
                T10 = TB - TE;
                TU = TJ - TG;
                TQ = TO - TP;
            }
            /* vs-slot 2: loads and butterfly partials */
            {
                E T17, T18, T1h, T1m, T1n, T1e;
                {
                    E T1f, T1g, T1a, T1b;
                    T17 = rio[WS(vs, 2)];
                    T18 = rio[WS(vs, 2) + WS(rs, 2)];
                    T19 = T17 + T18;
                    T1f = iio[WS(vs, 2)];
                    T1g = iio[WS(vs, 2) + WS(rs, 2)];
                    T1h = T1f - T1g;
                    T1B = T1f + T1g;
                    T1m = iio[WS(vs, 2) + WS(rs, 1)];
                    T1n = iio[WS(vs, 2) + WS(rs, 3)];
                    T1C = T1m + T1n;
                    T1a = rio[WS(vs, 2) + WS(rs, 1)];
                    T1b = rio[WS(vs, 2) + WS(rs, 3)];
                    T1c = T1a + T1b;
                    T1e = T1a - T1b;
                }
                T1i = T1e + T1h;
                T1l = T17 - T18;
                T1D = T1B - T1C;
                T1y = T19 - T1c;
                T1s = T1h - T1e;
                T1o = T1m - T1n;
            }

            /* Untwiddled sums: slot i is written to offset WS(rs, i). */
            rio[0] = T3 + T6;
            iio[0] = Tv + Tw;
            rio[WS(rs, 1)] = TB + TE;
            iio[WS(rs, 1)] = T13 + T14;
            rio[WS(rs, 2)] = T19 + T1c;
            iio[WS(rs, 2)] = T1B + T1C;
            iio[WS(rs, 3)] = T29 + T2a;
            rio[WS(rs, 3)] = T1H + T1K;

            /* Outputs using the (W[2], W[3]) pair, stored under WS(vs, 2). */
            {
                const E Tr = W[2];
                const E Tt = Tr * Ts, Ty = Tr * Tx;
                const E Tu = W[3];
                rio[WS(vs, 2)] = FMA(Tu, Tx, Tt);
                iio[WS(vs, 2)] = FNMS(Tu, Ts, Ty);
            }
            {
                const E T25 = W[2];
                const E T27 = T25 * T26, T2c = T25 * T2b;
                const E T28 = W[3];
                rio[WS(vs, 2) + WS(rs, 3)] = FMA(T28, T2b, T27);
                iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T28, T26, T2c);
            }
            {
                const E TZ = W[2];
                const E T11 = TZ * T10, T16 = TZ * T15;
                const E T12 = W[3];
                rio[WS(vs, 2) + WS(rs, 1)] = FMA(T12, T15, T11);
                iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T12, T10, T16);
            }
            {
                const E T1x = W[2];
                const E T1z = T1x * T1y, T1E = T1x * T1D;
                const E T1A = W[3];
                rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1A, T1D, T1z);
                iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1A, T1y, T1E);
            }

            /* Outputs using (W[0], W[1]) -> WS(vs, 1) and (W[4], W[5]) -> WS(vs, 3). */
            {
                const E Tj = Tf - Ti;
                const E Te = W[5], Tk = Te * Tc;
                const E T7 = W[4], Td = T7 * Tc;
                iio[WS(vs, 3)] = FNMS(Te, Tj, Td);
                rio[WS(vs, 3)] = FMA(T7, Tj, Tk);
            }
            {
                const E T1p = T1l - T1o;
                const E T1k = W[5], T1q = T1k * T1i;
                const E T1d = W[4], T1j = T1d * T1i;
                iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T1k, T1p, T1j);
                rio[WS(vs, 3) + WS(rs, 2)] = FMA(T1d, T1p, T1q);
            }
            {
                const E T23 = T1T + T1W;
                const E T22 = W[1], T24 = T22 * T20;
                const E T1Z = W[0], T21 = T1Z * T20;
                iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T22, T23, T21);
                rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1Z, T23, T24);
            }
            {
                const E TX = TN + TQ;
                const E TW = W[1], TY = TW * TU;
                const E TT = W[0], TV = TT * TU;
                iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TW, TX, TV);
                rio[WS(vs, 1) + WS(rs, 1)] = FMA(TT, TX, TY);
            }
            {
                const E TR = TN - TQ;
                const E TM = W[5], TS = TM * TK;
                const E TF = W[4], TL = TF * TK;
                iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TM, TR, TL);
                rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TR, TS);
            }
            {
                const E Tp = Tf + Ti;
                const E To = W[1], Tq = To * Tm;
                const E Tl = W[0], Tn = Tl * Tm;
                iio[WS(vs, 1)] = FNMS(To, Tp, Tn);
                rio[WS(vs, 1)] = FMA(Tl, Tp, Tq);
            }
            {
                const E T1v = T1l + T1o;
                const E T1u = W[1], T1w = T1u * T1s;
                const E T1r = W[0], T1t = T1r * T1s;
                iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1u, T1v, T1t);
                rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1r, T1v, T1w);
            }
            {
                const E T1X = T1T - T1W;
                const E T1S = W[5], T1Y = T1S * T1Q;
                const E T1L = W[4], T1R = T1L * T1Q;
                iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1S, T1X, T1R);
                rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1L, T1X, T1Y);
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, { 64, 24, 24, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Hands the q1_4 kernel and its descriptor to X(kdft_difsq_register) for planner p. */
void X(codelet_q1_4)(planner *p)
{
    X(kdft_difsq_register)(p, q1_4, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 88 FP additions, 48 FP multiplications,
|
||||
* (or, 64 additions, 24 multiplications, 24 fused multiply/add),
|
||||
* 37 stack variables, 0 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_4 -- in-place 4x4 twiddle-square kernel (variant without fused
 * multiply/add expansion).
 *
 * For every m in [mb, me) the loop reads sixteen complex values: real parts
 * from rio[WS(vs, i) + WS(rs, j)] and imaginary parts from iio at the same
 * offsets (i, j = 0..3).  The four rs-entries of each vs-slot i go through a
 * 4-point add/subtract butterfly.  The plain sum of slot i is stored at
 * offset WS(rs, i); the other three outputs are combined with the twiddle
 * pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]) and stored at
 * WS(vs, 1..3) + WS(rs, i), i.e. at transposed offsets.  Every load is issued
 * before the first store.  Each iteration advances rio and iio by ms and W
 * by 6.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are assumed to mean a*b + c
 * and c - a*b as defined in the project headers -- confirm.
 */
static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
    {
        INT m;
        for (m = mb, W += mb * 6; m < me;
             ++m, rio += ms, iio += ms, W += 6,
             MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
            E T3, Te, Tb, Tq, T6, T8, Th, Tr, Tv, TG, TD, TS, Ty, TA, TJ;
            E TT, TX, T18, T15, T1k, T10, T12, T1b, T1l, T1p, T1A, T1x, T1M, T1s, T1u;
            E T1D, T1N;

            /* vs-slot 0: entries 0/2, then entries 1/3 */
            {
                E T1, T2, T9, Ta;
                T1 = rio[0];
                T2 = rio[WS(rs, 2)];
                T3 = T1 + T2;
                Te = T1 - T2;
                T9 = iio[0];
                Ta = iio[WS(rs, 2)];
                Tb = T9 - Ta;
                Tq = T9 + Ta;
            }
            {
                E T4, T5, Tf, Tg;
                T4 = rio[WS(rs, 1)];
                T5 = rio[WS(rs, 3)];
                T6 = T4 + T5;
                T8 = T4 - T5;
                Tf = iio[WS(rs, 1)];
                Tg = iio[WS(rs, 3)];
                Th = Tf - Tg;
                Tr = Tf + Tg;
            }
            /* vs-slot 1 */
            {
                E Tt, Tu, TB, TC;
                Tt = rio[WS(vs, 1)];
                Tu = rio[WS(vs, 1) + WS(rs, 2)];
                Tv = Tt + Tu;
                TG = Tt - Tu;
                TB = iio[WS(vs, 1)];
                TC = iio[WS(vs, 1) + WS(rs, 2)];
                TD = TB - TC;
                TS = TB + TC;
            }
            {
                E Tw, Tx, TH, TI;
                Tw = rio[WS(vs, 1) + WS(rs, 1)];
                Tx = rio[WS(vs, 1) + WS(rs, 3)];
                Ty = Tw + Tx;
                TA = Tw - Tx;
                TH = iio[WS(vs, 1) + WS(rs, 1)];
                TI = iio[WS(vs, 1) + WS(rs, 3)];
                TJ = TH - TI;
                TT = TH + TI;
            }
            /* vs-slot 2 */
            {
                E TV, TW, T13, T14;
                TV = rio[WS(vs, 2)];
                TW = rio[WS(vs, 2) + WS(rs, 2)];
                TX = TV + TW;
                T18 = TV - TW;
                T13 = iio[WS(vs, 2)];
                T14 = iio[WS(vs, 2) + WS(rs, 2)];
                T15 = T13 - T14;
                T1k = T13 + T14;
            }
            {
                E TY, TZ, T19, T1a;
                TY = rio[WS(vs, 2) + WS(rs, 1)];
                TZ = rio[WS(vs, 2) + WS(rs, 3)];
                T10 = TY + TZ;
                T12 = TY - TZ;
                T19 = iio[WS(vs, 2) + WS(rs, 1)];
                T1a = iio[WS(vs, 2) + WS(rs, 3)];
                T1b = T19 - T1a;
                T1l = T19 + T1a;
            }
            /* vs-slot 3 */
            {
                E T1n, T1o, T1v, T1w;
                T1n = rio[WS(vs, 3)];
                T1o = rio[WS(vs, 3) + WS(rs, 2)];
                T1p = T1n + T1o;
                T1A = T1n - T1o;
                T1v = iio[WS(vs, 3)];
                T1w = iio[WS(vs, 3) + WS(rs, 2)];
                T1x = T1v - T1w;
                T1M = T1v + T1w;
            }
            {
                E T1q, T1r, T1B, T1C;
                T1q = rio[WS(vs, 3) + WS(rs, 1)];
                T1r = rio[WS(vs, 3) + WS(rs, 3)];
                T1s = T1q + T1r;
                T1u = T1q - T1r;
                T1B = iio[WS(vs, 3) + WS(rs, 1)];
                T1C = iio[WS(vs, 3) + WS(rs, 3)];
                T1D = T1B - T1C;
                T1N = T1B + T1C;
            }

            /* Untwiddled sums: slot i is written to offset WS(rs, i). */
            rio[0] = T3 + T6;
            iio[0] = Tq + Tr;
            rio[WS(rs, 1)] = Tv + Ty;
            iio[WS(rs, 1)] = TS + TT;
            rio[WS(rs, 2)] = TX + T10;
            iio[WS(rs, 2)] = T1k + T1l;
            iio[WS(rs, 3)] = T1M + T1N;
            rio[WS(rs, 3)] = T1p + T1s;

            /* Twiddled outputs; each scope reloads its twiddle pair from W. */
            {
                const E Tc = T8 + Tb, Ti = Te - Th;
                const E T7 = W[4], Td = W[5];
                iio[WS(vs, 3)] = FNMS(Td, Ti, T7 * Tc);
                rio[WS(vs, 3)] = FMA(Td, Tc, T7 * Ti);
            }
            {
                const E T1K = T1p - T1s, T1O = T1M - T1N;
                const E T1J = W[2], T1L = W[3];
                rio[WS(vs, 2) + WS(rs, 3)] = FMA(T1J, T1K, T1L * T1O);
                iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T1L, T1K, T1J * T1O);
            }
            {
                const E Tk = Tb - T8, Tm = Te + Th;
                const E Tj = W[0], Tl = W[1];
                iio[WS(vs, 1)] = FNMS(Tl, Tm, Tj * Tk);
                rio[WS(vs, 1)] = FMA(Tl, Tk, Tj * Tm);
            }
            {
                const E To = T3 - T6, Ts = Tq - Tr;
                const E Tn = W[2], Tp = W[3];
                rio[WS(vs, 2)] = FMA(Tn, To, Tp * Ts);
                iio[WS(vs, 2)] = FNMS(Tp, To, Tn * Ts);
            }
            {
                const E T16 = T12 + T15, T1c = T18 - T1b;
                const E T11 = W[4], T17 = W[5];
                iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T17, T1c, T11 * T16);
                rio[WS(vs, 3) + WS(rs, 2)] = FMA(T17, T16, T11 * T1c);
            }
            {
                const E T1G = T1x - T1u, T1I = T1A + T1D;
                const E T1F = W[0], T1H = W[1];
                iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T1H, T1I, T1F * T1G);
                rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1H, T1G, T1F * T1I);
            }
            {
                const E TQ = Tv - Ty, TU = TS - TT;
                const E TP = W[2], TR = W[3];
                rio[WS(vs, 2) + WS(rs, 1)] = FMA(TP, TQ, TR * TU);
                iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TR, TQ, TP * TU);
            }
            {
                const E T1e = T15 - T12, T1g = T18 + T1b;
                const E T1d = W[0], T1f = W[1];
                iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
                rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
            }
            {
                const E T1i = TX - T10, T1m = T1k - T1l;
                const E T1h = W[2], T1j = W[3];
                rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1h, T1i, T1j * T1m);
                iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1j, T1i, T1h * T1m);
            }
            {
                const E T1y = T1u + T1x, T1E = T1A - T1D;
                const E T1t = W[4], T1z = W[5];
                iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1z, T1E, T1t * T1y);
                rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1z, T1y, T1t * T1E);
            }
            {
                const E TM = TD - TA, TO = TG + TJ;
                const E TL = W[0], TN = W[1];
                iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TN, TO, TL * TM);
                rio[WS(vs, 1) + WS(rs, 1)] = FMA(TN, TM, TL * TO);
            }
            {
                const E TE = TA + TD, TK = TG - TJ;
                const E Tz = W[4], TF = W[5];
                iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TF, TK, Tz * TE);
                rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TE, Tz * TK);
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, { 64, 24, 24, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Hands the q1_4 kernel and its descriptor to X(kdft_difsq_register) for planner p. */
void X(codelet_q1_4)(planner *p)
{
    X(kdft_difsq_register)(p, q1_4, &desc);
}
|
||||
#endif
|
||||
@@ -0,0 +1,992 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 200 FP additions, 170 FP multiplications,
|
||||
* (or, 70 additions, 40 multiplications, 130 fused multiply/add),
|
||||
* 75 stack variables, 4 constants, and 100 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_5 -- in-place 5x5 twiddle-square kernel (FMA-preferring variant).
 *
 * For every m in [mb, me) the loop reads twenty-five complex values: real
 * parts from rio[WS(vs, i) + WS(rs, j)] and imaginary parts from iio at the
 * same offsets (i, j = 0..4).  The five rs-entries of each vs-slot i go
 * through a 5-point butterfly built from the four KP constants below.  The
 * plain sum of slot i is stored at offset WS(rs, i); the other four outputs
 * are combined with the twiddle pairs (W[0], W[1]) .. (W[6], W[7]) and stored
 * at WS(vs, 1..4) + WS(rs, i), i.e. at transposed offsets.  Every load is
 * issued before the first store.  Each iteration advances rio and iio by ms
 * and W by 8.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are assumed to mean a*b + c
 * and c - a*b as defined in the project headers -- confirm.
 */
static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    DK(KP618033988, +0.618033988749894848204586834365638117720309180);
    {
        INT m;
        for (m = mb, W += mb * 8; m < me;
             ++m, rio += ms, iio += ms, W += 8,
             MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
            E T1, Tb, TM, Tw, T8, Ta, Tn, Tj, TH, Ts, Tq, Tr, TV, T15, T1G;
            E T1q, T12, T14, T1h, T1d, T1B, T1m, T1k, T1l, T1P, T1Z, T2A, T2k, T1W, T1Y;
            E T2b, T27, T2v, T2g, T2e, T2f, T3Z, T3V, T4j, T44, T42, T43, T3D, T3N, T4o;
            E T48, T3K, T3M, T2J, T2T, T3u, T3e, T2Q, T2S, T35, T31, T3p, T3a, T38, T39;

            /* vs-slot 0, real parts */
            {
                E T7, Tv, T4, Tu;
                T1 = rio[0];
                {
                    E T5, T6, T2, T3;
                    T5 = rio[WS(rs, 2)];
                    T6 = rio[WS(rs, 3)];
                    T7 = T5 + T6;
                    Tv = T5 - T6;
                    T2 = rio[WS(rs, 1)];
                    T3 = rio[WS(rs, 4)];
                    T4 = T2 + T3;
                    Tu = T2 - T3;
                }
                Tb = T4 - T7;
                TM = FNMS(KP618033988, Tu, Tv);
                Tw = FMA(KP618033988, Tv, Tu);
                T8 = T4 + T7;
                Ta = FNMS(KP250000000, T8, T1);
            }
            /* vs-slot 0, imaginary parts */
            {
                E Ti, Tp, Tf, To;
                Tn = iio[0];
                {
                    E Tg, Th, Td, Te;
                    Tg = iio[WS(rs, 2)];
                    Th = iio[WS(rs, 3)];
                    Ti = Tg - Th;
                    Tp = Tg + Th;
                    Td = iio[WS(rs, 1)];
                    Te = iio[WS(rs, 4)];
                    Tf = Td - Te;
                    To = Td + Te;
                }
                Tj = FMA(KP618033988, Ti, Tf);
                TH = FNMS(KP618033988, Tf, Ti);
                Ts = To - Tp;
                Tq = To + Tp;
                Tr = FNMS(KP250000000, Tq, Tn);
            }
            /* vs-slot 1, real parts */
            {
                E T11, T1p, TY, T1o;
                TV = rio[WS(vs, 1)];
                {
                    E TZ, T10, TW, TX;
                    TZ = rio[WS(vs, 1) + WS(rs, 2)];
                    T10 = rio[WS(vs, 1) + WS(rs, 3)];
                    T11 = TZ + T10;
                    T1p = TZ - T10;
                    TW = rio[WS(vs, 1) + WS(rs, 1)];
                    TX = rio[WS(vs, 1) + WS(rs, 4)];
                    TY = TW + TX;
                    T1o = TW - TX;
                }
                T15 = TY - T11;
                T1G = FNMS(KP618033988, T1o, T1p);
                T1q = FMA(KP618033988, T1p, T1o);
                T12 = TY + T11;
                T14 = FNMS(KP250000000, T12, TV);
            }
            /* vs-slot 1, imaginary parts */
            {
                E T1c, T1j, T19, T1i;
                T1h = iio[WS(vs, 1)];
                {
                    E T1a, T1b, T17, T18;
                    T1a = iio[WS(vs, 1) + WS(rs, 2)];
                    T1b = iio[WS(vs, 1) + WS(rs, 3)];
                    T1c = T1a - T1b;
                    T1j = T1a + T1b;
                    T17 = iio[WS(vs, 1) + WS(rs, 1)];
                    T18 = iio[WS(vs, 1) + WS(rs, 4)];
                    T19 = T17 - T18;
                    T1i = T17 + T18;
                }
                T1d = FMA(KP618033988, T1c, T19);
                T1B = FNMS(KP618033988, T19, T1c);
                T1m = T1i - T1j;
                T1k = T1i + T1j;
                T1l = FNMS(KP250000000, T1k, T1h);
            }
            /* vs-slot 2, real parts */
            {
                E T1V, T2j, T1S, T2i;
                T1P = rio[WS(vs, 2)];
                {
                    E T1T, T1U, T1Q, T1R;
                    T1T = rio[WS(vs, 2) + WS(rs, 2)];
                    T1U = rio[WS(vs, 2) + WS(rs, 3)];
                    T1V = T1T + T1U;
                    T2j = T1T - T1U;
                    T1Q = rio[WS(vs, 2) + WS(rs, 1)];
                    T1R = rio[WS(vs, 2) + WS(rs, 4)];
                    T1S = T1Q + T1R;
                    T2i = T1Q - T1R;
                }
                T1Z = T1S - T1V;
                T2A = FNMS(KP618033988, T2i, T2j);
                T2k = FMA(KP618033988, T2j, T2i);
                T1W = T1S + T1V;
                T1Y = FNMS(KP250000000, T1W, T1P);
            }
            /* vs-slot 2, imaginary parts */
            {
                E T26, T2d, T23, T2c;
                T2b = iio[WS(vs, 2)];
                {
                    E T24, T25, T21, T22;
                    T24 = iio[WS(vs, 2) + WS(rs, 2)];
                    T25 = iio[WS(vs, 2) + WS(rs, 3)];
                    T26 = T24 - T25;
                    T2d = T24 + T25;
                    T21 = iio[WS(vs, 2) + WS(rs, 1)];
                    T22 = iio[WS(vs, 2) + WS(rs, 4)];
                    T23 = T21 - T22;
                    T2c = T21 + T22;
                }
                T27 = FMA(KP618033988, T26, T23);
                T2v = FNMS(KP618033988, T23, T26);
                T2g = T2c - T2d;
                T2e = T2c + T2d;
                T2f = FNMS(KP250000000, T2e, T2b);
            }
            /* vs-slot 4, imaginary parts */
            {
                E T3U, T41, T3R, T40;
                T3Z = iio[WS(vs, 4)];
                {
                    E T3S, T3T, T3P, T3Q;
                    T3S = iio[WS(vs, 4) + WS(rs, 2)];
                    T3T = iio[WS(vs, 4) + WS(rs, 3)];
                    T3U = T3S - T3T;
                    T41 = T3S + T3T;
                    T3P = iio[WS(vs, 4) + WS(rs, 1)];
                    T3Q = iio[WS(vs, 4) + WS(rs, 4)];
                    T3R = T3P - T3Q;
                    T40 = T3P + T3Q;
                }
                T3V = FMA(KP618033988, T3U, T3R);
                T4j = FNMS(KP618033988, T3R, T3U);
                T44 = T40 - T41;
                T42 = T40 + T41;
                T43 = FNMS(KP250000000, T42, T3Z);
            }
            /* vs-slot 4, real parts */
            {
                E T3J, T47, T3G, T46;
                T3D = rio[WS(vs, 4)];
                {
                    E T3H, T3I, T3E, T3F;
                    T3H = rio[WS(vs, 4) + WS(rs, 2)];
                    T3I = rio[WS(vs, 4) + WS(rs, 3)];
                    T3J = T3H + T3I;
                    T47 = T3H - T3I;
                    T3E = rio[WS(vs, 4) + WS(rs, 1)];
                    T3F = rio[WS(vs, 4) + WS(rs, 4)];
                    T3G = T3E + T3F;
                    T46 = T3E - T3F;
                }
                T3N = T3G - T3J;
                T4o = FNMS(KP618033988, T46, T47);
                T48 = FMA(KP618033988, T47, T46);
                T3K = T3G + T3J;
                T3M = FNMS(KP250000000, T3K, T3D);
            }
            /* vs-slot 3, real parts */
            {
                E T2P, T3d, T2M, T3c;
                T2J = rio[WS(vs, 3)];
                {
                    E T2N, T2O, T2K, T2L;
                    T2N = rio[WS(vs, 3) + WS(rs, 2)];
                    T2O = rio[WS(vs, 3) + WS(rs, 3)];
                    T2P = T2N + T2O;
                    T3d = T2N - T2O;
                    T2K = rio[WS(vs, 3) + WS(rs, 1)];
                    T2L = rio[WS(vs, 3) + WS(rs, 4)];
                    T2M = T2K + T2L;
                    T3c = T2K - T2L;
                }
                T2T = T2M - T2P;
                T3u = FNMS(KP618033988, T3c, T3d);
                T3e = FMA(KP618033988, T3d, T3c);
                T2Q = T2M + T2P;
                T2S = FNMS(KP250000000, T2Q, T2J);
            }
            /* vs-slot 3, imaginary parts */
            {
                E T30, T37, T2X, T36;
                T35 = iio[WS(vs, 3)];
                {
                    E T2Y, T2Z, T2V, T2W;
                    T2Y = iio[WS(vs, 3) + WS(rs, 2)];
                    T2Z = iio[WS(vs, 3) + WS(rs, 3)];
                    T30 = T2Y - T2Z;
                    T37 = T2Y + T2Z;
                    T2V = iio[WS(vs, 3) + WS(rs, 1)];
                    T2W = iio[WS(vs, 3) + WS(rs, 4)];
                    T2X = T2V - T2W;
                    T36 = T2V + T2W;
                }
                T31 = FMA(KP618033988, T30, T2X);
                T3p = FNMS(KP618033988, T2X, T30);
                T3a = T36 - T37;
                T38 = T36 + T37;
                T39 = FNMS(KP250000000, T38, T35);
            }

            /* Untwiddled sums: slot i is written to offset WS(rs, i). */
            rio[0] = T1 + T8;
            iio[0] = Tn + Tq;
            rio[WS(rs, 1)] = TV + T12;
            iio[WS(rs, 1)] = T1h + T1k;
            rio[WS(rs, 2)] = T1P + T1W;
            iio[WS(rs, 2)] = T2b + T2e;
            iio[WS(rs, 4)] = T3Z + T42;
            rio[WS(rs, 4)] = T3D + T3K;
            rio[WS(rs, 3)] = T2J + T2Q;
            iio[WS(rs, 3)] = T35 + T38;

            /* Slot 0 -> WS(vs, 1) and WS(vs, 4) */
            {
                const E Tc = FMA(KP559016994, Tb, Ta);
                const E Tk = FMA(KP951056516, Tj, Tc);
                const E TA = FNMS(KP951056516, Tj, Tc);
                const E Tt = FMA(KP559016994, Ts, Tr);
                const E Tx = FNMS(KP951056516, Tw, Tt);
                const E TD = FMA(KP951056516, Tw, Tt);
                {
                    const E T9 = W[0];
                    const E Tl = T9 * Tk, Ty = T9 * Tx;
                    const E Tm = W[1];
                    rio[WS(vs, 1)] = FMA(Tm, Tx, Tl);
                    iio[WS(vs, 1)] = FNMS(Tm, Tk, Ty);
                }
                {
                    const E Tz = W[6];
                    const E TB = Tz * TA, TE = Tz * TD;
                    const E TC = W[7];
                    rio[WS(vs, 4)] = FMA(TC, TD, TB);
                    iio[WS(vs, 4)] = FNMS(TC, TA, TE);
                }
            }
            /* Slot 0 -> WS(vs, 2) and WS(vs, 3) */
            {
                const E TG = FNMS(KP559016994, Tb, Ta);
                const E TI = FNMS(KP951056516, TH, TG);
                const E TQ = FMA(KP951056516, TH, TG);
                const E TL = FNMS(KP559016994, Ts, Tr);
                const E TN = FMA(KP951056516, TM, TL);
                const E TT = FNMS(KP951056516, TM, TL);
                {
                    const E TF = W[2];
                    const E TJ = TF * TI, TO = TF * TN;
                    const E TK = W[3];
                    rio[WS(vs, 2)] = FMA(TK, TN, TJ);
                    iio[WS(vs, 2)] = FNMS(TK, TI, TO);
                }
                {
                    const E TP = W[4];
                    const E TR = TP * TQ, TU = TP * TT;
                    const E TS = W[5];
                    rio[WS(vs, 3)] = FMA(TS, TT, TR);
                    iio[WS(vs, 3)] = FNMS(TS, TQ, TU);
                }
            }
            /* Slot 2 -> WS(vs, 2) + WS(rs, 2) and WS(vs, 3) + WS(rs, 2) */
            {
                const E T2u = FNMS(KP559016994, T1Z, T1Y);
                const E T2w = FNMS(KP951056516, T2v, T2u);
                const E T2E = FMA(KP951056516, T2v, T2u);
                const E T2z = FNMS(KP559016994, T2g, T2f);
                const E T2B = FMA(KP951056516, T2A, T2z);
                const E T2H = FNMS(KP951056516, T2A, T2z);
                {
                    const E T2t = W[2];
                    const E T2x = T2t * T2w, T2C = T2t * T2B;
                    const E T2y = W[3];
                    rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2y, T2B, T2x);
                    iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2y, T2w, T2C);
                }
                {
                    const E T2D = W[4];
                    const E T2F = T2D * T2E, T2I = T2D * T2H;
                    const E T2G = W[5];
                    rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2G, T2H, T2F);
                    iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2G, T2E, T2I);
                }
            }
            /* Slot 4 -> WS(vs, 2) + WS(rs, 4) and WS(vs, 3) + WS(rs, 4) */
            {
                const E T4i = FNMS(KP559016994, T3N, T3M);
                const E T4k = FNMS(KP951056516, T4j, T4i);
                const E T4s = FMA(KP951056516, T4j, T4i);
                const E T4n = FNMS(KP559016994, T44, T43);
                const E T4p = FMA(KP951056516, T4o, T4n);
                const E T4v = FNMS(KP951056516, T4o, T4n);
                {
                    const E T4h = W[2];
                    const E T4l = T4h * T4k, T4q = T4h * T4p;
                    const E T4m = W[3];
                    rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4m, T4p, T4l);
                    iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4m, T4k, T4q);
                }
                {
                    const E T4r = W[4];
                    const E T4t = T4r * T4s, T4w = T4r * T4v;
                    const E T4u = W[5];
                    rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4u, T4v, T4t);
                    iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4u, T4s, T4w);
                }
            }
            /* Slot 2 -> WS(vs, 1) + WS(rs, 2) and WS(vs, 4) + WS(rs, 2) */
            {
                const E T20 = FMA(KP559016994, T1Z, T1Y);
                const E T28 = FMA(KP951056516, T27, T20);
                const E T2o = FNMS(KP951056516, T27, T20);
                const E T2h = FMA(KP559016994, T2g, T2f);
                const E T2l = FNMS(KP951056516, T2k, T2h);
                const E T2r = FMA(KP951056516, T2k, T2h);
                {
                    const E T1X = W[0];
                    const E T29 = T1X * T28, T2m = T1X * T2l;
                    const E T2a = W[1];
                    rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2a, T2l, T29);
                    iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2a, T28, T2m);
                }
                {
                    const E T2n = W[6];
                    const E T2p = T2n * T2o, T2s = T2n * T2r;
                    const E T2q = W[7];
                    rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2q, T2r, T2p);
                    iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2q, T2o, T2s);
                }
            }
            /* Slot 3 -> WS(vs, 1) + WS(rs, 3) and WS(vs, 4) + WS(rs, 3) */
            {
                const E T2U = FMA(KP559016994, T2T, T2S);
                const E T32 = FMA(KP951056516, T31, T2U);
                const E T3i = FNMS(KP951056516, T31, T2U);
                const E T3b = FMA(KP559016994, T3a, T39);
                const E T3f = FNMS(KP951056516, T3e, T3b);
                const E T3l = FMA(KP951056516, T3e, T3b);
                {
                    const E T2R = W[0];
                    const E T33 = T2R * T32, T3g = T2R * T3f;
                    const E T34 = W[1];
                    rio[WS(vs, 1) + WS(rs, 3)] = FMA(T34, T3f, T33);
                    iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T34, T32, T3g);
                }
                {
                    const E T3h = W[6];
                    const E T3j = T3h * T3i, T3m = T3h * T3l;
                    const E T3k = W[7];
                    rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3k, T3l, T3j);
                    iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3k, T3i, T3m);
                }
            }
            /* Slot 3 -> WS(vs, 2) + WS(rs, 3) and WS(vs, 3) + WS(rs, 3) */
            {
                const E T3o = FNMS(KP559016994, T2T, T2S);
                const E T3q = FNMS(KP951056516, T3p, T3o);
                const E T3y = FMA(KP951056516, T3p, T3o);
                const E T3t = FNMS(KP559016994, T3a, T39);
                const E T3v = FMA(KP951056516, T3u, T3t);
                const E T3B = FNMS(KP951056516, T3u, T3t);
                {
                    const E T3n = W[2];
                    const E T3r = T3n * T3q, T3w = T3n * T3v;
                    const E T3s = W[3];
                    rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3s, T3v, T3r);
                    iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3s, T3q, T3w);
                }
                {
                    const E T3x = W[4];
                    const E T3z = T3x * T3y, T3C = T3x * T3B;
                    const E T3A = W[5];
                    rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3A, T3B, T3z);
                    iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3A, T3y, T3C);
                }
            }
            /* Slot 4 -> WS(vs, 1) + WS(rs, 4) and WS(vs, 4) + WS(rs, 4) */
            {
                const E T3O = FMA(KP559016994, T3N, T3M);
                const E T3W = FMA(KP951056516, T3V, T3O);
                const E T4c = FNMS(KP951056516, T3V, T3O);
                const E T45 = FMA(KP559016994, T44, T43);
                const E T49 = FNMS(KP951056516, T48, T45);
                const E T4f = FMA(KP951056516, T48, T45);
                {
                    const E T3L = W[0];
                    const E T3X = T3L * T3W, T4a = T3L * T49;
                    const E T3Y = W[1];
                    rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3Y, T49, T3X);
                    iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3Y, T3W, T4a);
                }
                {
                    const E T4b = W[6];
                    const E T4d = T4b * T4c, T4g = T4b * T4f;
                    const E T4e = W[7];
                    rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4e, T4f, T4d);
                    iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4e, T4c, T4g);
                }
            }
            /* Slot 1 -> WS(vs, 2) + WS(rs, 1) and WS(vs, 3) + WS(rs, 1) */
            {
                const E T1A = FNMS(KP559016994, T15, T14);
                const E T1C = FNMS(KP951056516, T1B, T1A);
                const E T1K = FMA(KP951056516, T1B, T1A);
                const E T1F = FNMS(KP559016994, T1m, T1l);
                const E T1H = FMA(KP951056516, T1G, T1F);
                const E T1N = FNMS(KP951056516, T1G, T1F);
                {
                    const E T1z = W[2];
                    const E T1D = T1z * T1C, T1I = T1z * T1H;
                    const E T1E = W[3];
                    rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1E, T1H, T1D);
                    iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1E, T1C, T1I);
                }
                {
                    const E T1J = W[4];
                    const E T1L = T1J * T1K, T1O = T1J * T1N;
                    const E T1M = W[5];
                    rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
                    iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
                }
            }
            /* Slot 1 -> WS(vs, 1) + WS(rs, 1) and WS(vs, 4) + WS(rs, 1) */
            {
                const E T16 = FMA(KP559016994, T15, T14);
                const E T1e = FMA(KP951056516, T1d, T16);
                const E T1u = FNMS(KP951056516, T1d, T16);
                const E T1n = FMA(KP559016994, T1m, T1l);
                const E T1r = FNMS(KP951056516, T1q, T1n);
                const E T1x = FMA(KP951056516, T1q, T1n);
                {
                    const E T13 = W[0];
                    const E T1f = T13 * T1e, T1s = T13 * T1r;
                    const E T1g = W[1];
                    rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1g, T1r, T1f);
                    iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1g, T1e, T1s);
                }
                {
                    const E T1t = W[6];
                    const E T1v = T1t * T1u, T1y = T1t * T1x;
                    const E T1w = W[7];
                    rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1w, T1x, T1v);
                    iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1w, T1u, T1y);
                }
            }
        }
    }
}
|
||||
|
||||
/* Twiddle-table program for the q1_5 codelet (FMA branch); it is referenced
 * by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this chunk; presumably
 * the first entry requests the full twiddle set for radix 5 and the second
 * terminates the program -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of q1_5: the value 5, the name "q1_5", the
 * twiddle program above, &GENUS, and the quadruple { 70, 40, 130, 0 }.
 * NOTE(review): ct_desc is declared outside this chunk; presumably 5 is the
 * radix and the quadruple is (additions, multiplications, fused
 * multiply/adds, other) -- confirm against the ct_desc definition. */
static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, { 70, 40, 130, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: passes the q1_5 kernel and its descriptor to
 * X(kdft_difsq_register) for planner p.
 * NOTE(review): X() is a macro defined outside this chunk, presumably a
 * symbol-name prefixer -- confirm. */
void X(codelet_q1_5) (planner *p) {
|
||||
X(kdft_difsq_register) (p, q1_5, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 200 FP additions, 140 FP multiplications,
|
||||
* (or, 130 additions, 70 multiplications, 70 fused multiply/add),
|
||||
* 75 stack variables, 4 constants, and 100 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_5 (non-FMA build): machine-generated radix-5 kernel that is handed to
 * X(kdft_difsq_register) further down in this file.
 *
 * Each loop iteration reads the 5x5 tile rio/iio[WS(vs, v) + WS(rs, j)]
 * (v, j = 0..4), combines the five values of every v-group, and stores the
 * results in place at rio/iio[WS(vs, k) + WS(rs, v)], i.e. with the roles of
 * the rs and vs indices swapped relative to the loads.  Output k = 0 of a
 * group is the plain sum of its five inputs; outputs k = 1..4 are combined
 * with the pair (W[2k-2], W[2k-1]) before being stored.
 *
 * rio, iio: arrays updated in place (presumably real / imaginary parts --
 *           the stores combine them like the two halves of a complex number).
 * W:        twiddle values; offset by mb * 8 on entry, 8 consumed per pass.
 * rs, vs:   strides, applied through the WS() macro.
 * mb, me:   iteration range, m = mb .. me - 1.
 * ms:       amount added to rio and iio after every iteration.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers outside this chunk; the comments
 * below assume FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b --
 * confirm.
 */
static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: 1/4, sin(pi/5), sin(2*pi/5) and sqrt(5)/4. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
/* One 5x5 tile per pass; rio/iio step by ms and W by 8. */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
|
||||
E T1, Ta, TG, Tv, T8, Tb, Tp, Tj, TD, To, Tq, Tr, TN, TW, T1s;
|
||||
E T1h, TU, TX, T1b, T15, T1p, T1a, T1c, T1d, T1z, T1I, T2e, T23, T1G, T1J;
|
||||
E T1X, T1R, T2b, T1W, T1Y, T1Z, T3v, T3p, T3J, T3u, T3w, T3x, T37, T3g, T3M;
|
||||
E T3B, T3e, T3h, T2l, T2u, T30, T2P, T2s, T2v, T2J, T2D, T2X, T2I, T2K, T2L;
|
||||
/* Load phase: ten scoped blocks, one per (v-group, rio-or-iio) pair.
 * Each reads the five values of its group and forms the sums and
 * differences reused by the store phase.  This first block handles
 * rio for v = 0. */
{
|
||||
E T7, Tu, T4, Tt;
|
||||
T1 = rio[0];
|
||||
{
|
||||
E T5, T6, T2, T3;
|
||||
T5 = rio[WS(rs, 2)];
|
||||
T6 = rio[WS(rs, 3)];
|
||||
T7 = T5 + T6;
|
||||
Tu = T5 - T6;
|
||||
T2 = rio[WS(rs, 1)];
|
||||
T3 = rio[WS(rs, 4)];
|
||||
T4 = T2 + T3;
|
||||
Tt = T2 - T3;
|
||||
}
|
||||
Ta = KP559016994 * (T4 - T7);
|
||||
TG = FNMS(KP587785252, Tt, KP951056516 * Tu);
|
||||
Tv = FMA(KP951056516, Tt, KP587785252 * Tu);
|
||||
T8 = T4 + T7;
|
||||
Tb = FNMS(KP250000000, T8, T1);
|
||||
}
|
||||
/* iio, v = 0. */
{
|
||||
E Ti, Tn, Tf, Tm;
|
||||
Tp = iio[0];
|
||||
{
|
||||
E Tg, Th, Td, Te;
|
||||
Tg = iio[WS(rs, 2)];
|
||||
Th = iio[WS(rs, 3)];
|
||||
Ti = Tg - Th;
|
||||
Tn = Tg + Th;
|
||||
Td = iio[WS(rs, 1)];
|
||||
Te = iio[WS(rs, 4)];
|
||||
Tf = Td - Te;
|
||||
Tm = Td + Te;
|
||||
}
|
||||
Tj = FMA(KP951056516, Tf, KP587785252 * Ti);
|
||||
TD = FNMS(KP587785252, Tf, KP951056516 * Ti);
|
||||
To = KP559016994 * (Tm - Tn);
|
||||
Tq = Tm + Tn;
|
||||
Tr = FNMS(KP250000000, Tq, Tp);
|
||||
}
|
||||
/* rio, v = 1. */
{
|
||||
E TT, T1g, TQ, T1f;
|
||||
TN = rio[WS(vs, 1)];
|
||||
{
|
||||
E TR, TS, TO, TP;
|
||||
TR = rio[WS(vs, 1) + WS(rs, 2)];
|
||||
TS = rio[WS(vs, 1) + WS(rs, 3)];
|
||||
TT = TR + TS;
|
||||
T1g = TR - TS;
|
||||
TO = rio[WS(vs, 1) + WS(rs, 1)];
|
||||
TP = rio[WS(vs, 1) + WS(rs, 4)];
|
||||
TQ = TO + TP;
|
||||
T1f = TO - TP;
|
||||
}
|
||||
TW = KP559016994 * (TQ - TT);
|
||||
T1s = FNMS(KP587785252, T1f, KP951056516 * T1g);
|
||||
T1h = FMA(KP951056516, T1f, KP587785252 * T1g);
|
||||
TU = TQ + TT;
|
||||
TX = FNMS(KP250000000, TU, TN);
|
||||
}
|
||||
/* iio, v = 1. */
{
|
||||
E T14, T19, T11, T18;
|
||||
T1b = iio[WS(vs, 1)];
|
||||
{
|
||||
E T12, T13, TZ, T10;
|
||||
T12 = iio[WS(vs, 1) + WS(rs, 2)];
|
||||
T13 = iio[WS(vs, 1) + WS(rs, 3)];
|
||||
T14 = T12 - T13;
|
||||
T19 = T12 + T13;
|
||||
TZ = iio[WS(vs, 1) + WS(rs, 1)];
|
||||
T10 = iio[WS(vs, 1) + WS(rs, 4)];
|
||||
T11 = TZ - T10;
|
||||
T18 = TZ + T10;
|
||||
}
|
||||
T15 = FMA(KP951056516, T11, KP587785252 * T14);
|
||||
T1p = FNMS(KP587785252, T11, KP951056516 * T14);
|
||||
T1a = KP559016994 * (T18 - T19);
|
||||
T1c = T18 + T19;
|
||||
T1d = FNMS(KP250000000, T1c, T1b);
|
||||
}
|
||||
/* rio, v = 2. */
{
|
||||
E T1F, T22, T1C, T21;
|
||||
T1z = rio[WS(vs, 2)];
|
||||
{
|
||||
E T1D, T1E, T1A, T1B;
|
||||
T1D = rio[WS(vs, 2) + WS(rs, 2)];
|
||||
T1E = rio[WS(vs, 2) + WS(rs, 3)];
|
||||
T1F = T1D + T1E;
|
||||
T22 = T1D - T1E;
|
||||
T1A = rio[WS(vs, 2) + WS(rs, 1)];
|
||||
T1B = rio[WS(vs, 2) + WS(rs, 4)];
|
||||
T1C = T1A + T1B;
|
||||
T21 = T1A - T1B;
|
||||
}
|
||||
T1I = KP559016994 * (T1C - T1F);
|
||||
T2e = FNMS(KP587785252, T21, KP951056516 * T22);
|
||||
T23 = FMA(KP951056516, T21, KP587785252 * T22);
|
||||
T1G = T1C + T1F;
|
||||
T1J = FNMS(KP250000000, T1G, T1z);
|
||||
}
|
||||
/* iio, v = 2. */
{
|
||||
E T1Q, T1V, T1N, T1U;
|
||||
T1X = iio[WS(vs, 2)];
|
||||
{
|
||||
E T1O, T1P, T1L, T1M;
|
||||
T1O = iio[WS(vs, 2) + WS(rs, 2)];
|
||||
T1P = iio[WS(vs, 2) + WS(rs, 3)];
|
||||
T1Q = T1O - T1P;
|
||||
T1V = T1O + T1P;
|
||||
T1L = iio[WS(vs, 2) + WS(rs, 1)];
|
||||
T1M = iio[WS(vs, 2) + WS(rs, 4)];
|
||||
T1N = T1L - T1M;
|
||||
T1U = T1L + T1M;
|
||||
}
|
||||
T1R = FMA(KP951056516, T1N, KP587785252 * T1Q);
|
||||
T2b = FNMS(KP587785252, T1N, KP951056516 * T1Q);
|
||||
T1W = KP559016994 * (T1U - T1V);
|
||||
T1Y = T1U + T1V;
|
||||
T1Z = FNMS(KP250000000, T1Y, T1X);
|
||||
}
|
||||
/* iio, v = 4. */
{
|
||||
E T3o, T3t, T3l, T3s;
|
||||
T3v = iio[WS(vs, 4)];
|
||||
{
|
||||
E T3m, T3n, T3j, T3k;
|
||||
T3m = iio[WS(vs, 4) + WS(rs, 2)];
|
||||
T3n = iio[WS(vs, 4) + WS(rs, 3)];
|
||||
T3o = T3m - T3n;
|
||||
T3t = T3m + T3n;
|
||||
T3j = iio[WS(vs, 4) + WS(rs, 1)];
|
||||
T3k = iio[WS(vs, 4) + WS(rs, 4)];
|
||||
T3l = T3j - T3k;
|
||||
T3s = T3j + T3k;
|
||||
}
|
||||
T3p = FMA(KP951056516, T3l, KP587785252 * T3o);
|
||||
T3J = FNMS(KP587785252, T3l, KP951056516 * T3o);
|
||||
T3u = KP559016994 * (T3s - T3t);
|
||||
T3w = T3s + T3t;
|
||||
T3x = FNMS(KP250000000, T3w, T3v);
|
||||
}
|
||||
/* rio, v = 4. */
{
|
||||
E T3d, T3A, T3a, T3z;
|
||||
T37 = rio[WS(vs, 4)];
|
||||
{
|
||||
E T3b, T3c, T38, T39;
|
||||
T3b = rio[WS(vs, 4) + WS(rs, 2)];
|
||||
T3c = rio[WS(vs, 4) + WS(rs, 3)];
|
||||
T3d = T3b + T3c;
|
||||
T3A = T3b - T3c;
|
||||
T38 = rio[WS(vs, 4) + WS(rs, 1)];
|
||||
T39 = rio[WS(vs, 4) + WS(rs, 4)];
|
||||
T3a = T38 + T39;
|
||||
T3z = T38 - T39;
|
||||
}
|
||||
T3g = KP559016994 * (T3a - T3d);
|
||||
T3M = FNMS(KP587785252, T3z, KP951056516 * T3A);
|
||||
T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
|
||||
T3e = T3a + T3d;
|
||||
T3h = FNMS(KP250000000, T3e, T37);
|
||||
}
|
||||
/* rio, v = 3. */
{
|
||||
E T2r, T2O, T2o, T2N;
|
||||
T2l = rio[WS(vs, 3)];
|
||||
{
|
||||
E T2p, T2q, T2m, T2n;
|
||||
T2p = rio[WS(vs, 3) + WS(rs, 2)];
|
||||
T2q = rio[WS(vs, 3) + WS(rs, 3)];
|
||||
T2r = T2p + T2q;
|
||||
T2O = T2p - T2q;
|
||||
T2m = rio[WS(vs, 3) + WS(rs, 1)];
|
||||
T2n = rio[WS(vs, 3) + WS(rs, 4)];
|
||||
T2o = T2m + T2n;
|
||||
T2N = T2m - T2n;
|
||||
}
|
||||
T2u = KP559016994 * (T2o - T2r);
|
||||
T30 = FNMS(KP587785252, T2N, KP951056516 * T2O);
|
||||
T2P = FMA(KP951056516, T2N, KP587785252 * T2O);
|
||||
T2s = T2o + T2r;
|
||||
T2v = FNMS(KP250000000, T2s, T2l);
|
||||
}
|
||||
/* iio, v = 3. */
{
|
||||
E T2C, T2H, T2z, T2G;
|
||||
T2J = iio[WS(vs, 3)];
|
||||
{
|
||||
E T2A, T2B, T2x, T2y;
|
||||
T2A = iio[WS(vs, 3) + WS(rs, 2)];
|
||||
T2B = iio[WS(vs, 3) + WS(rs, 3)];
|
||||
T2C = T2A - T2B;
|
||||
T2H = T2A + T2B;
|
||||
T2x = iio[WS(vs, 3) + WS(rs, 1)];
|
||||
T2y = iio[WS(vs, 3) + WS(rs, 4)];
|
||||
T2z = T2x - T2y;
|
||||
T2G = T2x + T2y;
|
||||
}
|
||||
T2D = FMA(KP951056516, T2z, KP587785252 * T2C);
|
||||
T2X = FNMS(KP587785252, T2z, KP951056516 * T2C);
|
||||
T2I = KP559016994 * (T2G - T2H);
|
||||
T2K = T2G + T2H;
|
||||
T2L = FNMS(KP250000000, T2K, T2J);
|
||||
}
|
||||
/* Store phase, part 1: the plain sum of group v is written to rs
 * index v (vs index 0); no twiddle is applied to these outputs. */
rio[0] = T1 + T8;
|
||||
iio[0] = Tp + Tq;
|
||||
rio[WS(rs, 1)] = TN + TU;
|
||||
iio[WS(rs, 1)] = T1b + T1c;
|
||||
rio[WS(rs, 2)] = T1z + T1G;
|
||||
iio[WS(rs, 2)] = T1X + T1Y;
|
||||
iio[WS(rs, 4)] = T3v + T3w;
|
||||
rio[WS(rs, 4)] = T37 + T3e;
|
||||
rio[WS(rs, 3)] = T2l + T2s;
|
||||
iio[WS(rs, 3)] = T2J + T2K;
|
||||
/* Store phase, part 2: ten scoped blocks, each finishing two outputs
 * (k = 1 and 4, or k = 2 and 3) of one group v and storing them at
 * WS(vs, k) + WS(rs, v) after combining with (W[2k-2], W[2k-1]). */
{
|
||||
E Tk, Ty, Tw, TA, Tc, Ts;
|
||||
Tc = Ta + Tb;
|
||||
Tk = Tc + Tj;
|
||||
Ty = Tc - Tj;
|
||||
Ts = To + Tr;
|
||||
Tw = Ts - Tv;
|
||||
TA = Tv + Ts;
|
||||
{
|
||||
E T9, Tl, Tx, Tz;
|
||||
T9 = W[0];
|
||||
Tl = W[1];
|
||||
rio[WS(vs, 1)] = FMA(T9, Tk, Tl * Tw);
|
||||
iio[WS(vs, 1)] = FNMS(Tl, Tk, T9 * Tw);
|
||||
Tx = W[6];
|
||||
Tz = W[7];
|
||||
rio[WS(vs, 4)] = FMA(Tx, Ty, Tz * TA);
|
||||
iio[WS(vs, 4)] = FNMS(Tz, Ty, Tx * TA);
|
||||
}
|
||||
}
|
||||
{
|
||||
E TE, TK, TI, TM, TC, TH;
|
||||
TC = Tb - Ta;
|
||||
TE = TC - TD;
|
||||
TK = TC + TD;
|
||||
TH = Tr - To;
|
||||
TI = TG + TH;
|
||||
TM = TH - TG;
|
||||
{
|
||||
E TB, TF, TJ, TL;
|
||||
TB = W[2];
|
||||
TF = W[3];
|
||||
rio[WS(vs, 2)] = FMA(TB, TE, TF * TI);
|
||||
iio[WS(vs, 2)] = FNMS(TF, TE, TB * TI);
|
||||
TJ = W[4];
|
||||
TL = W[5];
|
||||
rio[WS(vs, 3)] = FMA(TJ, TK, TL * TM);
|
||||
iio[WS(vs, 3)] = FNMS(TL, TK, TJ * TM);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2c, T2i, T2g, T2k, T2a, T2f;
|
||||
T2a = T1J - T1I;
|
||||
T2c = T2a - T2b;
|
||||
T2i = T2a + T2b;
|
||||
T2f = T1Z - T1W;
|
||||
T2g = T2e + T2f;
|
||||
T2k = T2f - T2e;
|
||||
{
|
||||
E T29, T2d, T2h, T2j;
|
||||
T29 = W[2];
|
||||
T2d = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T29, T2c, T2d * T2g);
|
||||
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2d, T2c, T29 * T2g);
|
||||
T2h = W[4];
|
||||
T2j = W[5];
|
||||
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2h, T2i, T2j * T2k);
|
||||
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2j, T2i, T2h * T2k);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T3K, T3Q, T3O, T3S, T3I, T3N;
|
||||
T3I = T3h - T3g;
|
||||
T3K = T3I - T3J;
|
||||
T3Q = T3I + T3J;
|
||||
T3N = T3x - T3u;
|
||||
T3O = T3M + T3N;
|
||||
T3S = T3N - T3M;
|
||||
{
|
||||
E T3H, T3L, T3P, T3R;
|
||||
T3H = W[2];
|
||||
T3L = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 4)] = FMA(T3H, T3K, T3L * T3O);
|
||||
iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T3L, T3K, T3H * T3O);
|
||||
T3P = W[4];
|
||||
T3R = W[5];
|
||||
rio[WS(vs, 3) + WS(rs, 4)] = FMA(T3P, T3Q, T3R * T3S);
|
||||
iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T3R, T3Q, T3P * T3S);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1S, T26, T24, T28, T1K, T20;
|
||||
T1K = T1I + T1J;
|
||||
T1S = T1K + T1R;
|
||||
T26 = T1K - T1R;
|
||||
T20 = T1W + T1Z;
|
||||
T24 = T20 - T23;
|
||||
T28 = T23 + T20;
|
||||
{
|
||||
E T1H, T1T, T25, T27;
|
||||
T1H = W[0];
|
||||
T1T = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1H, T1S, T1T * T24);
|
||||
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1T, T1S, T1H * T24);
|
||||
T25 = W[6];
|
||||
T27 = W[7];
|
||||
rio[WS(vs, 4) + WS(rs, 2)] = FMA(T25, T26, T27 * T28);
|
||||
iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T27, T26, T25 * T28);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2E, T2S, T2Q, T2U, T2w, T2M;
|
||||
T2w = T2u + T2v;
|
||||
T2E = T2w + T2D;
|
||||
T2S = T2w - T2D;
|
||||
T2M = T2I + T2L;
|
||||
T2Q = T2M - T2P;
|
||||
T2U = T2P + T2M;
|
||||
{
|
||||
E T2t, T2F, T2R, T2T;
|
||||
T2t = W[0];
|
||||
T2F = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2t, T2E, T2F * T2Q);
|
||||
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T2F, T2E, T2t * T2Q);
|
||||
T2R = W[6];
|
||||
T2T = W[7];
|
||||
rio[WS(vs, 4) + WS(rs, 3)] = FMA(T2R, T2S, T2T * T2U);
|
||||
iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T2T, T2S, T2R * T2U);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2Y, T34, T32, T36, T2W, T31;
|
||||
T2W = T2v - T2u;
|
||||
T2Y = T2W - T2X;
|
||||
T34 = T2W + T2X;
|
||||
T31 = T2L - T2I;
|
||||
T32 = T30 + T31;
|
||||
T36 = T31 - T30;
|
||||
{
|
||||
E T2V, T2Z, T33, T35;
|
||||
T2V = W[2];
|
||||
T2Z = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T2V, T2Y, T2Z * T32);
|
||||
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T2Z, T2Y, T2V * T32);
|
||||
T33 = W[4];
|
||||
T35 = W[5];
|
||||
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T33, T34, T35 * T36);
|
||||
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T35, T34, T33 * T36);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T3q, T3E, T3C, T3G, T3i, T3y;
|
||||
T3i = T3g + T3h;
|
||||
T3q = T3i + T3p;
|
||||
T3E = T3i - T3p;
|
||||
T3y = T3u + T3x;
|
||||
T3C = T3y - T3B;
|
||||
T3G = T3B + T3y;
|
||||
{
|
||||
E T3f, T3r, T3D, T3F;
|
||||
T3f = W[0];
|
||||
T3r = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3f, T3q, T3r * T3C);
|
||||
iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3r, T3q, T3f * T3C);
|
||||
T3D = W[6];
|
||||
T3F = W[7];
|
||||
rio[WS(vs, 4) + WS(rs, 4)] = FMA(T3D, T3E, T3F * T3G);
|
||||
iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T3F, T3E, T3D * T3G);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1q, T1w, T1u, T1y, T1o, T1t;
|
||||
T1o = TX - TW;
|
||||
T1q = T1o - T1p;
|
||||
T1w = T1o + T1p;
|
||||
T1t = T1d - T1a;
|
||||
T1u = T1s + T1t;
|
||||
T1y = T1t - T1s;
|
||||
{
|
||||
E T1n, T1r, T1v, T1x;
|
||||
T1n = W[2];
|
||||
T1r = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1n, T1q, T1r * T1u);
|
||||
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1r, T1q, T1n * T1u);
|
||||
T1v = W[4];
|
||||
T1x = W[5];
|
||||
rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
|
||||
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T16, T1k, T1i, T1m, TY, T1e;
|
||||
TY = TW + TX;
|
||||
T16 = TY + T15;
|
||||
T1k = TY - T15;
|
||||
T1e = T1a + T1d;
|
||||
T1i = T1e - T1h;
|
||||
T1m = T1h + T1e;
|
||||
{
|
||||
E TV, T17, T1j, T1l;
|
||||
TV = W[0];
|
||||
T17 = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TV, T16, T17 * T1i);
|
||||
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T17, T16, TV * T1i);
|
||||
T1j = W[6];
|
||||
T1l = W[7];
|
||||
rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1j, T1k, T1l * T1m);
|
||||
iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1l, T1k, T1j * T1m);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-table program for the q1_5 codelet (non-FMA branch); it is
 * referenced by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this chunk; presumably
 * the first entry requests the full twiddle set for radix 5 and the second
 * terminates the program -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA build of q1_5: the value 5, the name "q1_5",
 * the twiddle program above, &GENUS, and the quadruple { 130, 70, 70, 0 }.
 * The first three numbers equal the "130 additions, 70 multiplications,
 * 70 fused multiply/add" figures in this variant's generator comment.
 * NOTE(review): ct_desc is declared outside this chunk; the meaning of the
 * 5 (presumably the radix) and of the trailing zeros is not visible here. */
static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, { 130, 70, 70, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: passes the q1_5 kernel and its descriptor to
 * X(kdft_difsq_register) for planner p.
 * NOTE(review): X() is a macro defined outside this chunk, presumably a
 * symbol-name prefixer -- confirm. */
void X(codelet_q1_5) (planner *p) {
|
||||
X(kdft_difsq_register) (p, q1_5, &desc);
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,489 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 72 FP multiplications,
|
||||
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
|
||||
* 47 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_10 (FMA build): machine-generated radix-10 kernel that is handed to
 * X(kdft_dit_register) further down in this file.
 *
 * Each loop iteration loads ri/ii[WS(rs, j)] for j = 0..9.  Element 0 is used
 * as loaded; every element j = 1..9 is first combined with the pair
 * (W[2j-2], W[2j-1]).  The ten results are then stored back, in place, to
 * ri/ii[WS(rs, 0..9)].
 *
 * ri, ii:  arrays updated in place (presumably real / imaginary parts).
 * W:       twiddle values; offset by mb * 18 on entry, 18 consumed per pass.
 * rs:      stride, applied through the WS() macro.
 * mb, me:  iteration range, m = mb .. me - 1.
 * ms:      amount added to ri and ii after every iteration.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers outside this chunk; the comments
 * below assume FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b --
 * confirm.
 */
static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: sin(2*pi/5), sqrt(5)/4, (sqrt(5)-1)/2 and 1/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One 10-point group per pass; ri/ii step by ms and W by 18. */
for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T8, T23, T12, T1U, TM, TZ, T10, T1F, T1G, T1P, T16, T17, T18, T1s, T1x;
|
||||
E T25, Tl, Ty, Tz, T1I, T1J, T1O, T13, T14, T15, T1h, T1m, T24;
|
||||
/* Elements 0 and 5: element 5 is twiddled by (W[8], W[9]), then the
 * sum and difference with element 0 are formed. */
{
|
||||
E T1, T1T, T3, T6, T4, T1R, T2, T7, T1S, T5;
|
||||
T1 = ri[0];
|
||||
T1T = ii[0];
|
||||
T3 = ri[WS(rs, 5)];
|
||||
T6 = ii[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = T2 * T3;
|
||||
T1R = T2 * T6;
|
||||
T5 = W[9];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1S = FNMS(T5, T3, T1R);
|
||||
T8 = T1 - T7;
|
||||
T23 = T1T - T1S;
|
||||
T12 = T1 + T7;
|
||||
T1U = T1S + T1T;
|
||||
}
|
||||
/* Elements 4, 1, 9 and 6: twiddle each, then form pairwise sums and
 * differences. */
{
|
||||
E TF, T1p, TY, T1w, TL, T1r, TS, T1u;
|
||||
{
|
||||
E TB, TE, TC, T1o, TA, TD;
|
||||
TB = ri[WS(rs, 4)];
|
||||
TE = ii[WS(rs, 4)];
|
||||
TA = W[6];
|
||||
TC = TA * TB;
|
||||
T1o = TA * TE;
|
||||
TD = W[7];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T1p = FNMS(TD, TB, T1o);
|
||||
}
|
||||
{
|
||||
E TU, TX, TV, T1v, TT, TW;
|
||||
TU = ri[WS(rs, 1)];
|
||||
TX = ii[WS(rs, 1)];
|
||||
TT = W[0];
|
||||
TV = TT * TU;
|
||||
T1v = TT * TX;
|
||||
TW = W[1];
|
||||
TY = FMA(TW, TX, TV);
|
||||
T1w = FNMS(TW, TU, T1v);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T1q, TG, TJ;
|
||||
TH = ri[WS(rs, 9)];
|
||||
TK = ii[WS(rs, 9)];
|
||||
TG = W[16];
|
||||
TI = TG * TH;
|
||||
T1q = TG * TK;
|
||||
TJ = W[17];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T1r = FNMS(TJ, TH, T1q);
|
||||
}
|
||||
{
|
||||
E TO, TR, TP, T1t, TN, TQ;
|
||||
TO = ri[WS(rs, 6)];
|
||||
TR = ii[WS(rs, 6)];
|
||||
TN = W[10];
|
||||
TP = TN * TO;
|
||||
T1t = TN * TR;
|
||||
TQ = W[11];
|
||||
TS = FMA(TQ, TR, TP);
|
||||
T1u = FNMS(TQ, TO, T1t);
|
||||
}
|
||||
TM = TF - TL;
|
||||
TZ = TS - TY;
|
||||
T10 = TM + TZ;
|
||||
T1F = T1p + T1r;
|
||||
T1G = T1u + T1w;
|
||||
T1P = T1F + T1G;
|
||||
T16 = TF + TL;
|
||||
T17 = TS + TY;
|
||||
T18 = T16 + T17;
|
||||
T1s = T1p - T1r;
|
||||
T1x = T1u - T1w;
|
||||
T25 = T1s + T1x;
|
||||
}
|
||||
/* Elements 2, 3, 7 and 8: same treatment. */
{
|
||||
E Te, T1e, Tx, T1l, Tk, T1g, Tr, T1j;
|
||||
{
|
||||
E Ta, Td, Tb, T1d, T9, Tc;
|
||||
Ta = ri[WS(rs, 2)];
|
||||
Td = ii[WS(rs, 2)];
|
||||
T9 = W[2];
|
||||
Tb = T9 * Ta;
|
||||
T1d = T9 * Td;
|
||||
Tc = W[3];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
T1e = FNMS(Tc, Ta, T1d);
|
||||
}
|
||||
{
|
||||
E Tt, Tw, Tu, T1k, Ts, Tv;
|
||||
Tt = ri[WS(rs, 3)];
|
||||
Tw = ii[WS(rs, 3)];
|
||||
Ts = W[4];
|
||||
Tu = Ts * Tt;
|
||||
T1k = Ts * Tw;
|
||||
Tv = W[5];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1l = FNMS(Tv, Tt, T1k);
|
||||
}
|
||||
{
|
||||
E Tg, Tj, Th, T1f, Tf, Ti;
|
||||
Tg = ri[WS(rs, 7)];
|
||||
Tj = ii[WS(rs, 7)];
|
||||
Tf = W[12];
|
||||
Th = Tf * Tg;
|
||||
T1f = Tf * Tj;
|
||||
Ti = W[13];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
T1g = FNMS(Ti, Tg, T1f);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1i, Tm, Tp;
|
||||
Tn = ri[WS(rs, 8)];
|
||||
Tq = ii[WS(rs, 8)];
|
||||
Tm = W[14];
|
||||
To = Tm * Tn;
|
||||
T1i = Tm * Tq;
|
||||
Tp = W[15];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1j = FNMS(Tp, Tn, T1i);
|
||||
}
|
||||
Tl = Te - Tk;
|
||||
Ty = Tr - Tx;
|
||||
Tz = Tl + Ty;
|
||||
T1I = T1e + T1g;
|
||||
T1J = T1j + T1l;
|
||||
T1O = T1I + T1J;
|
||||
T13 = Te + Tk;
|
||||
T14 = Tr + Tx;
|
||||
T15 = T13 + T14;
|
||||
T1h = T1e - T1g;
|
||||
T1m = T1j - T1l;
|
||||
T24 = T1h + T1m;
|
||||
}
|
||||
/* Stores to ri at the odd indices 5, 7, 3, 9, 1. */
{
|
||||
E T1b, T11, T1a, T1z, T1B, T1n, T1y, T1A, T1c;
|
||||
T1b = Tz - T10;
|
||||
T11 = Tz + T10;
|
||||
T1a = FNMS(KP250000000, T11, T8);
|
||||
T1n = T1h - T1m;
|
||||
T1y = T1s - T1x;
|
||||
T1z = FMA(KP618033988, T1y, T1n);
|
||||
T1B = FNMS(KP618033988, T1n, T1y);
|
||||
ri[WS(rs, 5)] = T8 + T11;
|
||||
T1A = FNMS(KP559016994, T1b, T1a);
|
||||
ri[WS(rs, 7)] = FNMS(KP951056516, T1B, T1A);
|
||||
ri[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
|
||||
T1c = FMA(KP559016994, T1b, T1a);
|
||||
ri[WS(rs, 9)] = FNMS(KP951056516, T1z, T1c);
|
||||
ri[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
|
||||
}
|
||||
/* Stores to ii at the odd indices 5, 3, 7, 1, 9. */
{
|
||||
E T28, T26, T27, T2c, T2e, T2a, T2b, T2d, T29;
|
||||
T28 = T24 - T25;
|
||||
T26 = T24 + T25;
|
||||
T27 = FNMS(KP250000000, T26, T23);
|
||||
T2a = Tl - Ty;
|
||||
T2b = TM - TZ;
|
||||
T2c = FMA(KP618033988, T2b, T2a);
|
||||
T2e = FNMS(KP618033988, T2a, T2b);
|
||||
ii[WS(rs, 5)] = T26 + T23;
|
||||
T2d = FNMS(KP559016994, T28, T27);
|
||||
ii[WS(rs, 3)] = FNMS(KP951056516, T2e, T2d);
|
||||
ii[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
|
||||
T29 = FMA(KP559016994, T28, T27);
|
||||
ii[WS(rs, 1)] = FNMS(KP951056516, T2c, T29);
|
||||
ii[WS(rs, 9)] = FMA(KP951056516, T2c, T29);
|
||||
}
|
||||
/* Stores to ri at the even indices 0, 4, 6, 2, 8. */
{
|
||||
E T1D, T19, T1C, T1L, T1N, T1H, T1K, T1M, T1E;
|
||||
T1D = T15 - T18;
|
||||
T19 = T15 + T18;
|
||||
T1C = FNMS(KP250000000, T19, T12);
|
||||
T1H = T1F - T1G;
|
||||
T1K = T1I - T1J;
|
||||
T1L = FNMS(KP618033988, T1K, T1H);
|
||||
T1N = FMA(KP618033988, T1H, T1K);
|
||||
ri[0] = T12 + T19;
|
||||
T1M = FMA(KP559016994, T1D, T1C);
|
||||
ri[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
|
||||
ri[WS(rs, 6)] = FMA(KP951056516, T1N, T1M);
|
||||
T1E = FNMS(KP559016994, T1D, T1C);
|
||||
ri[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
|
||||
ri[WS(rs, 8)] = FMA(KP951056516, T1L, T1E);
|
||||
}
|
||||
/* Stores to ii at the even indices 0, 4, 6, 2, 8. */
{
|
||||
E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X;
|
||||
T1W = T1O - T1P;
|
||||
T1Q = T1O + T1P;
|
||||
T1V = FNMS(KP250000000, T1Q, T1U);
|
||||
T1Y = T16 - T17;
|
||||
T1Z = T13 - T14;
|
||||
T20 = FNMS(KP618033988, T1Z, T1Y);
|
||||
T22 = FMA(KP618033988, T1Y, T1Z);
|
||||
ii[0] = T1Q + T1U;
|
||||
T21 = FMA(KP559016994, T1W, T1V);
|
||||
ii[WS(rs, 4)] = FMA(KP951056516, T22, T21);
|
||||
ii[WS(rs, 6)] = FNMS(KP951056516, T22, T21);
|
||||
T1X = FNMS(KP559016994, T1W, T1V);
|
||||
ii[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
|
||||
ii[WS(rs, 8)] = FNMS(KP951056516, T20, T1X);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-table program for the t1_10 codelet (FMA branch); it is referenced
 * by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this chunk; presumably
 * the first entry requests the full twiddle set for radix 10 and the second
 * terminates the program -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of t1_10: the value 10, the name "t1_10", the
 * twiddle program above, &GENUS, and the quadruple { 48, 18, 54, 0 }.  The
 * first three numbers equal the "48 additions, 18 multiplications, 54 fused
 * multiply/add" figures in this variant's generator comment.
 * NOTE(review): ct_desc is declared outside this chunk; the meaning of the
 * 10 (presumably the radix) and of the trailing zeros is not visible here. */
static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 48, 18, 54, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: passes the t1_10 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.
 * NOTE(review): X() is a macro defined outside this chunk, presumably a
 * symbol-name prefixer -- confirm. */
void X(codelet_t1_10) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_10, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 60 FP multiplications,
|
||||
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 45 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_10 (non-FMA build): machine-generated radix-10 kernel that is handed to
 * X(kdft_dit_register) further down in this file.
 *
 * Each loop iteration loads ri/ii[WS(rs, j)] for j = 0..9.  Element 0 is used
 * as loaded; every element j = 1..9 is first combined with the pair
 * (W[2j-2], W[2j-1]).  The ten results are then stored back, in place, to
 * ri/ii[WS(rs, 0..9)].
 *
 * ri, ii:  arrays updated in place (presumably real / imaginary parts).
 * W:       twiddle values; offset by mb * 18 on entry, 18 consumed per pass.
 * rs:      stride, applied through the WS() macro.
 * mb, me:  iteration range, m = mb .. me - 1.
 * ms:      amount added to ri and ii after every iteration.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers outside this chunk; the comments
 * below assume FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b --
 * confirm.
 */
static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
/* One 10-point group per pass; ri/ii step by ms and W by 18. */
for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g;
|
||||
E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L;
|
||||
/* Elements 0 and 5: element 5 is twiddled by (W[8], W[9]), then the
 * sum and difference with element 0 are formed. */
{
|
||||
E T1, T1B, T6, T1A;
|
||||
T1 = ri[0];
|
||||
T1B = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 5)];
|
||||
T5 = ii[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = W[9];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T1A = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 - T6;
|
||||
T1O = T1B - T1A;
|
||||
TT = T1 + T6;
|
||||
T1C = T1A + T1B;
|
||||
}
|
||||
/* Elements 4, 1, 9 and 6: twiddle each, then form pairwise sums and
 * differences. */
{
|
||||
E Tz, T1b, TP, T1f, TE, T1c, TK, T1e;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = ri[WS(rs, 4)];
|
||||
Ty = ii[WS(rs, 4)];
|
||||
Tv = W[6];
|
||||
Tx = W[7];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T1b = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TM, TO, TL, TN;
|
||||
TM = ri[WS(rs, 1)];
|
||||
TO = ii[WS(rs, 1)];
|
||||
TL = W[0];
|
||||
TN = W[1];
|
||||
TP = FMA(TL, TM, TN * TO);
|
||||
T1f = FNMS(TN, TM, TL * TO);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = ri[WS(rs, 9)];
|
||||
TD = ii[WS(rs, 9)];
|
||||
TA = W[16];
|
||||
TC = W[17];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T1c = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
{
|
||||
E TH, TJ, TG, TI;
|
||||
TH = ri[WS(rs, 6)];
|
||||
TJ = ii[WS(rs, 6)];
|
||||
TG = W[10];
|
||||
TI = W[11];
|
||||
TK = FMA(TG, TH, TI * TJ);
|
||||
T1e = FNMS(TI, TH, TG * TJ);
|
||||
}
|
||||
TF = Tz - TE;
|
||||
TQ = TK - TP;
|
||||
TR = TF + TQ;
|
||||
T1o = T1b + T1c;
|
||||
T1p = T1e + T1f;
|
||||
T1y = T1o + T1p;
|
||||
TX = Tz + TE;
|
||||
TY = TK + TP;
|
||||
TZ = TX + TY;
|
||||
T1d = T1b - T1c;
|
||||
T1g = T1e - T1f;
|
||||
T1M = T1d + T1g;
|
||||
}
|
||||
/* Elements 2, 3, 7 and 8: same treatment. */
{
|
||||
E Tc, T14, Ts, T18, Th, T15, Tn, T17;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = ri[WS(rs, 2)];
|
||||
Tb = ii[WS(rs, 2)];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
T14 = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 3)];
|
||||
Tr = ii[WS(rs, 3)];
|
||||
To = W[4];
|
||||
Tq = W[5];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T18 = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = ri[WS(rs, 7)];
|
||||
Tg = ii[WS(rs, 7)];
|
||||
Td = W[12];
|
||||
Tf = W[13];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
T15 = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = ri[WS(rs, 8)];
|
||||
Tm = ii[WS(rs, 8)];
|
||||
Tj = W[14];
|
||||
Tl = W[15];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T17 = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
Ti = Tc - Th;
|
||||
Tt = Tn - Ts;
|
||||
Tu = Ti + Tt;
|
||||
T1r = T14 + T15;
|
||||
T1s = T17 + T18;
|
||||
T1x = T1r + T1s;
|
||||
TU = Tc + Th;
|
||||
TV = Tn + Ts;
|
||||
TW = TU + TV;
|
||||
T16 = T14 - T15;
|
||||
T19 = T17 - T18;
|
||||
T1L = T16 + T19;
|
||||
}
|
||||
/* Stores to ri at the odd indices 5, 7, 3, 9, 1. */
{
|
||||
E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
|
||||
T11 = KP559016994 * (Tu - TR);
|
||||
TS = Tu + TR;
|
||||
T12 = FNMS(KP250000000, TS, T7);
|
||||
T1a = T16 - T19;
|
||||
T1h = T1d - T1g;
|
||||
T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
|
||||
T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
|
||||
ri[WS(rs, 5)] = T7 + TS;
|
||||
T1j = T12 - T11;
|
||||
ri[WS(rs, 7)] = T1j - T1k;
|
||||
ri[WS(rs, 3)] = T1j + T1k;
|
||||
T13 = T11 + T12;
|
||||
ri[WS(rs, 9)] = T13 - T1i;
|
||||
ri[WS(rs, 1)] = T13 + T1i;
|
||||
}
|
||||
/* Stores to ii at the odd indices 5, 3, 7, 1, 9. */
{
|
||||
E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R;
|
||||
T1N = KP559016994 * (T1L - T1M);
|
||||
T1P = T1L + T1M;
|
||||
T1Q = FNMS(KP250000000, T1P, T1O);
|
||||
T1S = Ti - Tt;
|
||||
T1T = TF - TQ;
|
||||
T1U = FMA(KP951056516, T1S, KP587785252 * T1T);
|
||||
T1W = FNMS(KP587785252, T1S, KP951056516 * T1T);
|
||||
ii[WS(rs, 5)] = T1P + T1O;
|
||||
T1V = T1Q - T1N;
|
||||
ii[WS(rs, 3)] = T1V - T1W;
|
||||
ii[WS(rs, 7)] = T1W + T1V;
|
||||
T1R = T1N + T1Q;
|
||||
ii[WS(rs, 1)] = T1R - T1U;
|
||||
ii[WS(rs, 9)] = T1U + T1R;
|
||||
}
|
||||
/* Stores to ri at the even indices 0, 4, 6, 2, 8. */
{
|
||||
E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
|
||||
T1m = KP559016994 * (TW - TZ);
|
||||
T10 = TW + TZ;
|
||||
T1l = FNMS(KP250000000, T10, TT);
|
||||
T1q = T1o - T1p;
|
||||
T1t = T1r - T1s;
|
||||
T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
|
||||
T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
|
||||
ri[0] = TT + T10;
|
||||
T1v = T1m + T1l;
|
||||
ri[WS(rs, 4)] = T1v - T1w;
|
||||
ri[WS(rs, 6)] = T1v + T1w;
|
||||
T1n = T1l - T1m;
|
||||
ri[WS(rs, 2)] = T1n - T1u;
|
||||
ri[WS(rs, 8)] = T1n + T1u;
|
||||
}
|
||||
/* Stores to ii at the even indices 0, 4, 6, 2, 8. */
{
|
||||
E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
|
||||
T1H = KP559016994 * (T1x - T1y);
|
||||
T1z = T1x + T1y;
|
||||
T1G = FNMS(KP250000000, T1z, T1C);
|
||||
T1D = TX - TY;
|
||||
T1E = TU - TV;
|
||||
T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
|
||||
T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
|
||||
ii[0] = T1z + T1C;
|
||||
T1K = T1H + T1G;
|
||||
ii[WS(rs, 4)] = T1J + T1K;
|
||||
ii[WS(rs, 6)] = T1K - T1J;
|
||||
T1I = T1G - T1H;
|
||||
ii[WS(rs, 2)] = T1F + T1I;
|
||||
ii[WS(rs, 8)] = T1I - T1F;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-table program for the t1_10 codelet (non-FMA branch); it is
 * referenced by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this chunk; presumably
 * the first entry requests the full twiddle set for radix 10 and the second
 * terminates the program -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA build of t1_10: the value 10, the name "t1_10",
 * the twiddle program above, &GENUS, and the quadruple { 72, 30, 30, 0 }.
 * The first three numbers equal the "72 additions, 30 multiplications,
 * 30 fused multiply/add" figures in this variant's generator comment.
 * NOTE(review): ct_desc is declared outside this chunk; the meaning of the
 * 10 (presumably the radix) and of the trailing zeros is not visible here. */
static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 72, 30, 30, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: passes the t1_10 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.
 * NOTE(review): X() is a macro defined outside this chunk, presumably a
 * symbol-name prefixer -- confirm. */
void X(codelet_t1_10) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_10, &desc);
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,581 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 68 FP multiplications,
|
||||
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_12 (FMA-preferring variant): in-place twiddle kernel over 12 complex
 * points; presumably a size-12 decimation-in-time DFT step, since it is
 * registered through X(kdft_dit_register) and was emitted by gen_twiddle
 * with "-n 12".
 *
 * For every m in [mb, me) the kernel
 *   - reads points k = 0..11 from ri[WS(rs, k)] / ii[WS(rs, k)],
 *   - scales points 1..11 by the coefficient pair
 *     (W[2*(k-1)], W[2*(k-1)+1]); point 0 is used unscaled,
 *   - combines the results and stores 12 outputs back into the same
 *     ri/ii locations.
 *
 * Parameters:
 *   ri, ii  - arrays read and overwritten in place (presumably the real
 *             and imaginary parts); both advance by ms per iteration.
 *   W       - read-only coefficient table; starts at W + mb*22 and
 *             advances by 22 entries per iteration.
 *   rs      - stride handed to WS(rs, k) to locate point k.
 *   mb, me  - half-open iteration range [mb, me).
 *   ms      - per-iteration increment applied to ri and ii.
 *
 * NOTE(review): comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; the macros are defined outside this file
 * section -- confirm.
 */
static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: 0.866025403... is sqrt(3)/2, 0.5 is 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; MAKE_VOLATILE_STRIDE(24, rs) is evaluated in the
   loop increment (its purpose is not visible from this section). */
for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2r, T1s, T2f, T1d, T21, T1H;
|
||||
E T1Z, Te, T2o, T1l, T2h, TT, T1V, T1A, T1T;
|
||||
/* Point 0: loaded without any W factor. */
T1 = ri[0];
|
||||
T2i = ii[0];
|
||||
/* Point 6 with (W[10], W[11]):
   Tl = W[10]*ri6 + W[11]*ii6, T2e = W[10]*ii6 - W[11]*ri6. */
{
|
||||
E Th, Tk, Ti, T2d, Tg, Tj;
|
||||
Th = ri[WS(rs, 6)];
|
||||
Tk = ii[WS(rs, 6)];
|
||||
Tg = W[10];
|
||||
Ti = Tg * Th;
|
||||
T2d = Tg * Tk;
|
||||
Tj = W[11];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T2e = FNMS(Tj, Th, T2d);
|
||||
}
|
||||
/* Point 9 with (W[16], W[17]), same pattern. */
{
|
||||
E TW, TZ, TX, T1X, TV, TY;
|
||||
TW = ri[WS(rs, 9)];
|
||||
TZ = ii[WS(rs, 9)];
|
||||
TV = W[16];
|
||||
TX = TV * TW;
|
||||
T1X = TV * TZ;
|
||||
TY = W[17];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T1Y = FNMS(TY, TW, T1X);
|
||||
}
|
||||
/* Point 3 with (W[4], W[5]), same pattern. */
{
|
||||
E TC, TF, TD, T1R, TB, TE;
|
||||
TC = ri[WS(rs, 3)];
|
||||
TF = ii[WS(rs, 3)];
|
||||
TB = W[4];
|
||||
TD = TB * TC;
|
||||
T1R = TB * TF;
|
||||
TE = W[5];
|
||||
TG = FMA(TE, TF, TD);
|
||||
T1S = FNMS(TE, TC, T1R);
|
||||
}
|
||||
/* Points 10 (W[18], W[19]) and 2 (W[2], W[3]); only their sums and
   differences (Ty, T2r, T1s, T2f) are kept. */
{
|
||||
E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
|
||||
Tn = ri[WS(rs, 10)];
|
||||
Tq = ii[WS(rs, 10)];
|
||||
Tm = W[18];
|
||||
To = Tm * Tn;
|
||||
T1o = Tm * Tq;
|
||||
Tt = ri[WS(rs, 2)];
|
||||
Tw = ii[WS(rs, 2)];
|
||||
Ts = W[2];
|
||||
Tu = Ts * Tt;
|
||||
T1q = Ts * Tw;
|
||||
{
|
||||
E Tr, T1p, Tx, T1r, Tp, Tv;
|
||||
Tp = W[19];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1p = FNMS(Tp, Tn, T1o);
|
||||
Tv = W[3];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1r = FNMS(Tv, Tt, T1q);
|
||||
Ty = Tr + Tx;
|
||||
T2r = Tx - Tr;
|
||||
T1s = T1p - T1r;
|
||||
T2f = T1p + T1r;
|
||||
}
|
||||
}
|
||||
/* Points 1 (W[0], W[1]) and 5 (W[8], W[9]); sums/differences kept in
   T1d, T21, T1H, T1Z. */
{
|
||||
E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
|
||||
T12 = ri[WS(rs, 1)];
|
||||
T15 = ii[WS(rs, 1)];
|
||||
T11 = W[0];
|
||||
T13 = T11 * T12;
|
||||
T1D = T11 * T15;
|
||||
T18 = ri[WS(rs, 5)];
|
||||
T1b = ii[WS(rs, 5)];
|
||||
T17 = W[8];
|
||||
T19 = T17 * T18;
|
||||
T1F = T17 * T1b;
|
||||
{
|
||||
E T16, T1E, T1c, T1G, T14, T1a;
|
||||
T14 = W[1];
|
||||
T16 = FMA(T14, T15, T13);
|
||||
T1E = FNMS(T14, T12, T1D);
|
||||
T1a = W[9];
|
||||
T1c = FMA(T1a, T1b, T19);
|
||||
T1G = FNMS(T1a, T18, T1F);
|
||||
T1d = T16 + T1c;
|
||||
T21 = T1c - T16;
|
||||
T1H = T1E - T1G;
|
||||
T1Z = T1E + T1G;
|
||||
}
|
||||
}
|
||||
/* Points 4 (W[6], W[7]) and 8 (W[14], W[15]); sums/differences kept in
   Te, T2o, T1l, T2h. */
{
|
||||
E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
|
||||
T3 = ri[WS(rs, 4)];
|
||||
T6 = ii[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = T2 * T3;
|
||||
T1h = T2 * T6;
|
||||
T9 = ri[WS(rs, 8)];
|
||||
Tc = ii[WS(rs, 8)];
|
||||
T8 = W[14];
|
||||
Ta = T8 * T9;
|
||||
T1j = T8 * Tc;
|
||||
{
|
||||
E T7, T1i, Td, T1k, T5, Tb;
|
||||
T5 = W[7];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1i = FNMS(T5, T3, T1h);
|
||||
Tb = W[15];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
T1k = FNMS(Tb, T9, T1j);
|
||||
Te = T7 + Td;
|
||||
T2o = Td - T7;
|
||||
T1l = T1i - T1k;
|
||||
T2h = T1i + T1k;
|
||||
}
|
||||
}
|
||||
/* Points 7 (W[12], W[13]) and 11 (W[20], W[21]); sums/differences kept
   in TT, T1V, T1A, T1T. */
{
|
||||
E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
|
||||
TI = ri[WS(rs, 7)];
|
||||
TL = ii[WS(rs, 7)];
|
||||
TH = W[12];
|
||||
TJ = TH * TI;
|
||||
T1w = TH * TL;
|
||||
TO = ri[WS(rs, 11)];
|
||||
TR = ii[WS(rs, 11)];
|
||||
TN = W[20];
|
||||
TP = TN * TO;
|
||||
T1y = TN * TR;
|
||||
{
|
||||
E TM, T1x, TS, T1z, TK, TQ;
|
||||
TK = W[13];
|
||||
TM = FMA(TK, TL, TJ);
|
||||
T1x = FNMS(TK, TI, T1w);
|
||||
TQ = W[21];
|
||||
TS = FMA(TQ, TR, TP);
|
||||
T1z = FNMS(TQ, TO, T1y);
|
||||
TT = TM + TS;
|
||||
T1V = TS - TM;
|
||||
T1A = T1x - T1z;
|
||||
T1T = T1x + T1z;
|
||||
}
|
||||
}
|
||||
/* Outputs 0, 3, 6 and 9: built from plain sums/differences only. */
{
|
||||
E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
|
||||
{
|
||||
E Tf, Tz, T2g, T2j;
|
||||
Tf = T1 + Te;
|
||||
Tz = Tl + Ty;
|
||||
TA = Tf + Tz;
|
||||
T28 = Tf - Tz;
|
||||
T2g = T2e + T2f;
|
||||
T2j = T2h + T2i;
|
||||
T2k = T2g + T2j;
|
||||
T2m = T2j - T2g;
|
||||
}
|
||||
{
|
||||
E TU, T1e, T29, T2a;
|
||||
TU = TG + TT;
|
||||
T1e = T10 + T1d;
|
||||
T1f = TU + T1e;
|
||||
T2l = TU - T1e;
|
||||
T29 = T1S + T1T;
|
||||
T2a = T1Y + T1Z;
|
||||
T2b = T29 - T2a;
|
||||
T2c = T29 + T2a;
|
||||
}
|
||||
ri[WS(rs, 6)] = TA - T1f;
|
||||
ii[WS(rs, 6)] = T2k - T2c;
|
||||
ri[0] = TA + T1f;
|
||||
ii[0] = T2c + T2k;
|
||||
ri[WS(rs, 3)] = T28 - T2b;
|
||||
ii[WS(rs, 3)] = T2l + T2m;
|
||||
ri[WS(rs, 9)] = T28 + T2b;
|
||||
ii[WS(rs, 9)] = T2m - T2l;
|
||||
}
|
||||
/* Remaining outputs (1, 2, 4, 5, 7, 8, 10, 11): each intermediate is
   "base -/+ KP866025403 * diff" where base = x - KP500000000 * sum. */
{
|
||||
E T1m, T1K, T2p, T2y, T2s, T2x, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
|
||||
E T1O;
|
||||
{
|
||||
E T1g, T2n, T2q, T1n;
|
||||
T1g = FNMS(KP500000000, Te, T1);
|
||||
T1m = FNMS(KP866025403, T1l, T1g);
|
||||
T1K = FMA(KP866025403, T1l, T1g);
|
||||
T2n = FNMS(KP500000000, T2h, T2i);
|
||||
T2p = FMA(KP866025403, T2o, T2n);
|
||||
T2y = FNMS(KP866025403, T2o, T2n);
|
||||
T2q = FNMS(KP500000000, T2f, T2e);
|
||||
T2s = FMA(KP866025403, T2r, T2q);
|
||||
T2x = FNMS(KP866025403, T2r, T2q);
|
||||
T1n = FNMS(KP500000000, Ty, Tl);
|
||||
T1t = FNMS(KP866025403, T1s, T1n);
|
||||
T1L = FMA(KP866025403, T1s, T1n);
|
||||
}
|
||||
{
|
||||
E T1v, T1U, T20, T1C;
|
||||
T1v = FNMS(KP500000000, TT, TG);
|
||||
T1B = FNMS(KP866025403, T1A, T1v);
|
||||
T1N = FMA(KP866025403, T1A, T1v);
|
||||
T1U = FNMS(KP500000000, T1T, T1S);
|
||||
T1W = FMA(KP866025403, T1V, T1U);
|
||||
T25 = FNMS(KP866025403, T1V, T1U);
|
||||
T20 = FNMS(KP500000000, T1Z, T1Y);
|
||||
T22 = FMA(KP866025403, T21, T20);
|
||||
T26 = FNMS(KP866025403, T21, T20);
|
||||
T1C = FNMS(KP500000000, T1d, T10);
|
||||
T1I = FNMS(KP866025403, T1H, T1C);
|
||||
T1O = FMA(KP866025403, T1H, T1C);
|
||||
}
|
||||
/* Outputs 2 and 8. */
{
|
||||
E T1u, T1J, T2z, T2A;
|
||||
T1u = T1m + T1t;
|
||||
T1J = T1B + T1I;
|
||||
ri[WS(rs, 2)] = T1u - T1J;
|
||||
ri[WS(rs, 8)] = T1u + T1J;
|
||||
T2z = T2x + T2y;
|
||||
T2A = T25 + T26;
|
||||
ii[WS(rs, 2)] = T2z - T2A;
|
||||
ii[WS(rs, 8)] = T2A + T2z;
|
||||
}
|
||||
/* Outputs 4 and 10. */
{
|
||||
E T1M, T1P, T2v, T2w;
|
||||
T1M = T1K + T1L;
|
||||
T1P = T1N + T1O;
|
||||
ri[WS(rs, 10)] = T1M - T1P;
|
||||
ri[WS(rs, 4)] = T1M + T1P;
|
||||
T2v = T1W + T22;
|
||||
T2w = T2s + T2p;
|
||||
ii[WS(rs, 4)] = T2v + T2w;
|
||||
ii[WS(rs, 10)] = T2w - T2v;
|
||||
}
|
||||
/* Outputs 1 and 7. */
{
|
||||
E T1Q, T23, T2t, T2u;
|
||||
T1Q = T1K - T1L;
|
||||
T23 = T1W - T22;
|
||||
ri[WS(rs, 7)] = T1Q - T23;
|
||||
ri[WS(rs, 1)] = T1Q + T23;
|
||||
T2t = T2p - T2s;
|
||||
T2u = T1N - T1O;
|
||||
ii[WS(rs, 1)] = T2t - T2u;
|
||||
ii[WS(rs, 7)] = T2u + T2t;
|
||||
}
|
||||
/* Outputs 5 and 11. */
{
|
||||
E T24, T27, T2B, T2C;
|
||||
T24 = T1m - T1t;
|
||||
T27 = T25 - T26;
|
||||
ri[WS(rs, 11)] = T24 - T27;
|
||||
ri[WS(rs, 5)] = T24 + T27;
|
||||
T2B = T2y - T2x;
|
||||
T2C = T1B - T1I;
|
||||
ii[WS(rs, 5)] = T2B - T2C;
|
||||
ii[WS(rs, 11)] = T2C + T2B;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to X(kdft_dit_register) for the FMA build of "t1_12".
 * The triple 72 / 22 / 46 equals the "72 additions, 22 multiplications,
 * 46 fused multiply/add" figures in the generator comment of this variant.
 * NOTE(review): fields are positional; confirm their meaning against the
 * ct_desc declaration.
 */
static const ct_desc desc = {
     12,                    /* presumably the codelet size */
     "t1_12",               /* codelet name */
     twinstr,               /* twiddle-instruction table defined above */
     &GENUS,
     { 72, 22, 46, 0 },     /* adds, muls, fmas as reported by the generator */
     0,
     0,
     0
};
|
||||
|
||||
/*
 * Hands the t1_12 kernel (FMA build) together with its descriptor to
 * X(kdft_dit_register) for planner `p`.
 */
void X(codelet_t1_12)(planner *p)
{
     X(kdft_dit_register)(p, t1_12, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 60 FP multiplications,
|
||||
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_12 (non-FMA-preferring variant): in-place twiddle kernel over 12
 * complex points; presumably a size-12 decimation-in-time DFT step, since
 * it is registered through X(kdft_dit_register) and was emitted by
 * gen_twiddle with "-n 12".
 *
 * For every m in [mb, me) the kernel
 *   - reads points k = 0..11 from ri[WS(rs, k)] / ii[WS(rs, k)],
 *   - scales points 1..11 by the coefficient pair
 *     (W[2*(k-1)], W[2*(k-1)+1]); point 0 is used unscaled,
 *   - processes the points in four groups of three
 *     ({0,4,8}, {9,1,5}, {6,10,2}, {3,7,11}) and then combines the groups,
 *   - stores 12 outputs back into the same ri/ii locations.
 *
 * Parameters:
 *   ri, ii  - arrays read and overwritten in place (presumably the real
 *             and imaginary parts); both advance by ms per iteration.
 *   W       - read-only coefficient table; starts at W + mb*22 and
 *             advances by 22 entries per iteration.
 *   rs      - stride handed to WS(rs, k) to locate point k.
 *   mb, me  - half-open iteration range [mb, me).
 *   ms      - per-iteration increment applied to ri and ii.
 *
 * NOTE(review): comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; the macros are defined outside this file
 * section -- confirm.
 */
static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: 0.5 is 1/2, 0.866025403... is sqrt(3)/2. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; MAKE_VOLATILE_STRIDE(24, rs) is evaluated in the
   loop increment (its purpose is not visible from this section). */
for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T1, T1W, T18, T21, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
|
||||
E T1G, Ti, T1S, T1d, T24, Tt, T1a, T1T, T25, TA, T1z, T1j, T1y, TL, T1g;
|
||||
E T1A, T1B;
|
||||
/* Group {0, 4, 8}: point 0 unscaled, point 4 with (W[6], W[7]),
   point 8 with (W[14], W[15]). */
{
|
||||
E T6, T16, Tb, T17;
|
||||
T1 = ri[0];
|
||||
T1W = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 4)];
|
||||
T5 = ii[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = W[7];
|
||||
/* T6 = W[6]*ri4 + W[7]*ii4, T16 = W[6]*ii4 - W[7]*ri4; the other
   scaled points below follow the same pattern. */
T6 = FMA(T2, T3, T4 * T5);
|
||||
T16 = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = ri[WS(rs, 8)];
|
||||
Ta = ii[WS(rs, 8)];
|
||||
T7 = W[14];
|
||||
T9 = W[15];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
T17 = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
T18 = KP866025403 * (T16 - T17);
|
||||
T21 = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
T15 = FNMS(KP500000000, Tc, T1);
|
||||
T1V = T16 + T17;
|
||||
T22 = FNMS(KP500000000, T1V, T1W);
|
||||
}
|
||||
/* Group {9, 5, 1}: (W[16], W[17]), (W[8], W[9]), (W[0], W[1]). */
{
|
||||
E T11, T1n, TW, T1m;
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = ri[WS(rs, 9)];
|
||||
TQ = ii[WS(rs, 9)];
|
||||
TN = W[16];
|
||||
TP = W[17];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T1E = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E TY, T10, TX, TZ;
|
||||
TY = ri[WS(rs, 5)];
|
||||
T10 = ii[WS(rs, 5)];
|
||||
TX = W[8];
|
||||
TZ = W[9];
|
||||
T11 = FMA(TX, TY, TZ * T10);
|
||||
T1n = FNMS(TZ, TY, TX * T10);
|
||||
}
|
||||
{
|
||||
E TT, TV, TS, TU;
|
||||
TT = ri[WS(rs, 1)];
|
||||
TV = ii[WS(rs, 1)];
|
||||
TS = W[0];
|
||||
TU = W[1];
|
||||
TW = FMA(TS, TT, TU * TV);
|
||||
T1m = FNMS(TU, TT, TS * TV);
|
||||
}
|
||||
T1o = KP866025403 * (T1m - T1n);
|
||||
T1D = KP866025403 * (T11 - TW);
|
||||
T12 = TW + T11;
|
||||
T1l = FNMS(KP500000000, T12, TR);
|
||||
T1F = T1m + T1n;
|
||||
T1G = FNMS(KP500000000, T1F, T1E);
|
||||
}
|
||||
/* Group {6, 2, 10}: (W[10], W[11]), (W[2], W[3]), (W[18], W[19]). */
{
|
||||
E Ts, T1c, Tn, T1b;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = ri[WS(rs, 6)];
|
||||
Th = ii[WS(rs, 6)];
|
||||
Te = W[10];
|
||||
Tg = W[11];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
T1S = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 2)];
|
||||
Tr = ii[WS(rs, 2)];
|
||||
To = W[2];
|
||||
Tq = W[3];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T1c = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = ri[WS(rs, 10)];
|
||||
Tm = ii[WS(rs, 10)];
|
||||
Tj = W[18];
|
||||
Tl = W[19];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T1b = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
T1d = KP866025403 * (T1b - T1c);
|
||||
T24 = KP866025403 * (Ts - Tn);
|
||||
Tt = Tn + Ts;
|
||||
T1a = FNMS(KP500000000, Tt, Ti);
|
||||
T1T = T1b + T1c;
|
||||
T25 = FNMS(KP500000000, T1T, T1S);
|
||||
}
|
||||
/* Group {3, 11, 7}: (W[4], W[5]), (W[20], W[21]), (W[12], W[13]). */
{
|
||||
E TK, T1i, TF, T1h;
|
||||
{
|
||||
E Tx, Tz, Tw, Ty;
|
||||
Tx = ri[WS(rs, 3)];
|
||||
Tz = ii[WS(rs, 3)];
|
||||
Tw = W[4];
|
||||
Ty = W[5];
|
||||
TA = FMA(Tw, Tx, Ty * Tz);
|
||||
T1z = FNMS(Ty, Tx, Tw * Tz);
|
||||
}
|
||||
{
|
||||
E TH, TJ, TG, TI;
|
||||
TH = ri[WS(rs, 11)];
|
||||
TJ = ii[WS(rs, 11)];
|
||||
TG = W[20];
|
||||
TI = W[21];
|
||||
TK = FMA(TG, TH, TI * TJ);
|
||||
T1i = FNMS(TI, TH, TG * TJ);
|
||||
}
|
||||
{
|
||||
E TC, TE, TB, TD;
|
||||
TC = ri[WS(rs, 7)];
|
||||
TE = ii[WS(rs, 7)];
|
||||
TB = W[12];
|
||||
TD = W[13];
|
||||
TF = FMA(TB, TC, TD * TE);
|
||||
T1h = FNMS(TD, TC, TB * TE);
|
||||
}
|
||||
T1j = KP866025403 * (T1h - T1i);
|
||||
T1y = KP866025403 * (TK - TF);
|
||||
TL = TF + TK;
|
||||
T1g = FNMS(KP500000000, TL, TA);
|
||||
T1A = T1h + T1i;
|
||||
T1B = FNMS(KP500000000, T1A, T1z);
|
||||
}
|
||||
/* Outputs 0, 3, 6 and 9: built from the plain group sums. */
{
|
||||
E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
|
||||
{
|
||||
E Td, Tu, T1U, T1X;
|
||||
Td = T1 + Tc;
|
||||
Tu = Ti + Tt;
|
||||
Tv = Td + Tu;
|
||||
T1N = Td - Tu;
|
||||
T1U = T1S + T1T;
|
||||
T1X = T1V + T1W;
|
||||
T1Y = T1U + T1X;
|
||||
T20 = T1X - T1U;
|
||||
}
|
||||
{
|
||||
E TM, T13, T1O, T1P;
|
||||
TM = TA + TL;
|
||||
T13 = TR + T12;
|
||||
T14 = TM + T13;
|
||||
T1Z = TM - T13;
|
||||
T1O = T1z + T1A;
|
||||
T1P = T1E + T1F;
|
||||
T1Q = T1O - T1P;
|
||||
T1R = T1O + T1P;
|
||||
}
|
||||
ri[WS(rs, 6)] = Tv - T14;
|
||||
ii[WS(rs, 6)] = T1Y - T1R;
|
||||
ri[0] = Tv + T14;
|
||||
ii[0] = T1R + T1Y;
|
||||
ri[WS(rs, 3)] = T1N - T1Q;
|
||||
ii[WS(rs, 3)] = T1Z + T20;
|
||||
ri[WS(rs, 9)] = T1N + T1Q;
|
||||
ii[WS(rs, 9)] = T20 - T1Z;
|
||||
}
|
||||
/* Outputs 1, 4, 7 and 10: use the "+ KP866025403 * diff" combinations. */
{
|
||||
E T1t, T1x, T27, T2a, T1w, T28, T1I, T29;
|
||||
{
|
||||
E T1r, T1s, T23, T26;
|
||||
T1r = T15 + T18;
|
||||
T1s = T1a + T1d;
|
||||
T1t = T1r + T1s;
|
||||
T1x = T1r - T1s;
|
||||
T23 = T21 + T22;
|
||||
T26 = T24 + T25;
|
||||
T27 = T23 - T26;
|
||||
T2a = T26 + T23;
|
||||
}
|
||||
{
|
||||
E T1u, T1v, T1C, T1H;
|
||||
T1u = T1g + T1j;
|
||||
T1v = T1l + T1o;
|
||||
T1w = T1u + T1v;
|
||||
T28 = T1u - T1v;
|
||||
T1C = T1y + T1B;
|
||||
T1H = T1D + T1G;
|
||||
T1I = T1C - T1H;
|
||||
T29 = T1C + T1H;
|
||||
}
|
||||
ri[WS(rs, 10)] = T1t - T1w;
|
||||
ii[WS(rs, 10)] = T2a - T29;
|
||||
ri[WS(rs, 4)] = T1t + T1w;
|
||||
ii[WS(rs, 4)] = T29 + T2a;
|
||||
ri[WS(rs, 7)] = T1x - T1I;
|
||||
ii[WS(rs, 7)] = T28 + T27;
|
||||
ri[WS(rs, 1)] = T1x + T1I;
|
||||
ii[WS(rs, 1)] = T27 - T28;
|
||||
}
|
||||
/* Outputs 2, 5, 8 and 11: use the "- KP866025403 * diff" combinations. */
{
|
||||
E T1f, T1J, T2d, T2f, T1q, T2g, T1M, T2e;
|
||||
{
|
||||
E T19, T1e, T2b, T2c;
|
||||
T19 = T15 - T18;
|
||||
T1e = T1a - T1d;
|
||||
T1f = T19 + T1e;
|
||||
T1J = T19 - T1e;
|
||||
T2b = T25 - T24;
|
||||
T2c = T22 - T21;
|
||||
T2d = T2b + T2c;
|
||||
T2f = T2c - T2b;
|
||||
}
|
||||
{
|
||||
E T1k, T1p, T1K, T1L;
|
||||
T1k = T1g - T1j;
|
||||
T1p = T1l - T1o;
|
||||
T1q = T1k + T1p;
|
||||
T2g = T1k - T1p;
|
||||
T1K = T1B - T1y;
|
||||
T1L = T1G - T1D;
|
||||
T1M = T1K - T1L;
|
||||
T2e = T1K + T1L;
|
||||
}
|
||||
ri[WS(rs, 2)] = T1f - T1q;
|
||||
ii[WS(rs, 2)] = T2d - T2e;
|
||||
ri[WS(rs, 8)] = T1f + T1q;
|
||||
ii[WS(rs, 8)] = T2e + T2d;
|
||||
ri[WS(rs, 11)] = T1J - T1M;
|
||||
ii[WS(rs, 11)] = T2g + T2f;
|
||||
ri[WS(rs, 5)] = T1J + T1M;
|
||||
ii[WS(rs, 5)] = T2f - T2g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to X(kdft_dit_register) for the non-FMA build of
 * "t1_12".  The triple 88 / 30 / 30 equals the "88 additions,
 * 30 multiplications, 30 fused multiply/add" figures in the generator
 * comment of this variant.
 * NOTE(review): fields are positional; confirm their meaning against the
 * ct_desc declaration.
 */
static const ct_desc desc = {
     12,                    /* presumably the codelet size */
     "t1_12",               /* codelet name */
     twinstr,               /* twiddle-instruction table defined above */
     &GENUS,
     { 88, 30, 30, 0 },     /* adds, muls, fmas as reported by the generator */
     0,
     0,
     0
};
|
||||
|
||||
/*
 * Hands the t1_12 kernel (non-FMA build) together with its descriptor to
 * X(kdft_dit_register) for planner `p`.
 */
void X(codelet_t1_12)(planner *p)
{
     X(kdft_dit_register)(p, t1_12, &desc);
}
|
||||
#endif
|
||||
@@ -0,0 +1,816 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 184 FP additions, 140 FP multiplications,
|
||||
* (or, 72 additions, 28 multiplications, 112 fused multiply/add),
|
||||
* 51 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_15 (FMA-preferring variant): in-place twiddle kernel over 15 complex
 * points; presumably a size-15 decimation-in-time DFT step, since it is
 * registered through X(kdft_dit_register) and was emitted by gen_twiddle
 * with "-n 15".
 *
 * For every m in [mb, me) the kernel
 *   - reads points k = 0..14 from ri[WS(rs, k)] / ii[WS(rs, k)],
 *   - scales points 1..14 by the coefficient pair
 *     (W[2*(k-1)], W[2*(k-1)+1]); point 0 is used unscaled,
 *   - processes the points in five groups of three
 *     ({0,5,10}, {9,14,4}, {3,8,13}, {12,2,7}, {6,11,1}) and then combines
 *     the five groups,
 *   - stores 15 outputs back into the same ri/ii locations.
 *
 * Parameters:
 *   ri, ii  - arrays read and overwritten in place (presumably the real
 *             and imaginary parts); both advance by ms per iteration.
 *   W       - read-only coefficient table; starts at W + mb*28 and
 *             advances by 28 entries per iteration.
 *   rs      - stride handed to WS(rs, k) to locate point k.
 *   mb, me  - half-open iteration range [mb, me).
 *   ms      - per-iteration increment applied to ri and ii.
 *
 * NOTE(review): comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; the macros are defined outside this file
 * section -- confirm.
 */
static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: 0.951056516... is sin(2*pi/5), 0.559016994... is
   sqrt(5)/4, 0.618033988... is (sqrt(5)-1)/2, 0.866025403... is
   sqrt(3)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; MAKE_VOLATILE_STRIDE(30, rs) is evaluated in the
   loop increment (its purpose is not visible from this section). */
for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
|
||||
E T1, T3j, T1G, T3u, Te, T1B, T3i, T3t, T1y, T2i, T2a, T2M, T37, T2V, Tz;
|
||||
E T2e, T1O, T2t, T39, T2X, TT, T2f, T1V, T2z, T3a, T2Y, T1e, T2h, T23, T2G;
|
||||
E T36, T2U;
|
||||
/* Group {0, 5, 10}: point 0 unscaled, point 5 with (W[8], W[9]),
   point 10 with (W[18], W[19]). */
{
|
||||
E T7, T1D, Td, T1F;
|
||||
T1 = ri[0];
|
||||
T3j = ii[0];
|
||||
{
|
||||
E T3, T6, T4, T1C, T2, T5;
|
||||
T3 = ri[WS(rs, 5)];
|
||||
T6 = ii[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = T2 * T3;
|
||||
T1C = T2 * T6;
|
||||
T5 = W[9];
|
||||
/* T7 = W[8]*ri5 + W[9]*ii5, T1D = W[8]*ii5 - W[9]*ri5; the other
   scaled points below follow the same pattern. */
T7 = FMA(T5, T6, T4);
|
||||
T1D = FNMS(T5, T3, T1C);
|
||||
}
|
||||
{
|
||||
E T9, Tc, Ta, T1E, T8, Tb;
|
||||
T9 = ri[WS(rs, 10)];
|
||||
Tc = ii[WS(rs, 10)];
|
||||
T8 = W[18];
|
||||
Ta = T8 * T9;
|
||||
T1E = T8 * Tc;
|
||||
Tb = W[19];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
T1F = FNMS(Tb, T9, T1E);
|
||||
}
|
||||
T1G = T1D - T1F;
|
||||
T3u = Td - T7;
|
||||
Te = T7 + Td;
|
||||
T1B = FNMS(KP500000000, Te, T1);
|
||||
T3i = T1D + T1F;
|
||||
T3t = FNMS(KP500000000, T3i, T3j);
|
||||
}
|
||||
/* Group {9, 4, 14}: (W[16], W[17]), (W[6], W[7]), (W[26], W[27]). */
{
|
||||
E T1k, T2I, T1w, T28, T1q, T26;
|
||||
{
|
||||
E T1g, T1j, T1h, T2H, T1f, T1i;
|
||||
T1g = ri[WS(rs, 9)];
|
||||
T1j = ii[WS(rs, 9)];
|
||||
T1f = W[16];
|
||||
T1h = T1f * T1g;
|
||||
T2H = T1f * T1j;
|
||||
T1i = W[17];
|
||||
T1k = FMA(T1i, T1j, T1h);
|
||||
T2I = FNMS(T1i, T1g, T2H);
|
||||
}
|
||||
{
|
||||
E T1s, T1v, T1t, T27, T1r, T1u;
|
||||
T1s = ri[WS(rs, 4)];
|
||||
T1v = ii[WS(rs, 4)];
|
||||
T1r = W[6];
|
||||
T1t = T1r * T1s;
|
||||
T27 = T1r * T1v;
|
||||
T1u = W[7];
|
||||
T1w = FMA(T1u, T1v, T1t);
|
||||
T28 = FNMS(T1u, T1s, T27);
|
||||
}
|
||||
{
|
||||
E T1m, T1p, T1n, T25, T1l, T1o;
|
||||
T1m = ri[WS(rs, 14)];
|
||||
T1p = ii[WS(rs, 14)];
|
||||
T1l = W[26];
|
||||
T1n = T1l * T1m;
|
||||
T25 = T1l * T1p;
|
||||
T1o = W[27];
|
||||
T1q = FMA(T1o, T1p, T1n);
|
||||
T26 = FNMS(T1o, T1m, T25);
|
||||
}
|
||||
{
|
||||
E T29, T1x, T24, T2L, T2J, T2K;
|
||||
T29 = T26 - T28;
|
||||
T1x = T1q + T1w;
|
||||
T24 = FNMS(KP500000000, T1x, T1k);
|
||||
T1y = T1k + T1x;
|
||||
T2i = FMA(KP866025403, T29, T24);
|
||||
T2a = FNMS(KP866025403, T29, T24);
|
||||
T2L = T1w - T1q;
|
||||
T2J = T26 + T28;
|
||||
T2K = FNMS(KP500000000, T2J, T2I);
|
||||
T2M = FMA(KP866025403, T2L, T2K);
|
||||
T37 = T2I + T2J;
|
||||
T2V = FNMS(KP866025403, T2L, T2K);
|
||||
}
|
||||
}
|
||||
/* Group {3, 13, 8}: (W[4], W[5]), (W[24], W[25]), (W[14], W[15]). */
{
|
||||
E Tl, T2p, Tx, T1M, Tr, T1K;
|
||||
{
|
||||
E Th, Tk, Ti, T2o, Tg, Tj;
|
||||
Th = ri[WS(rs, 3)];
|
||||
Tk = ii[WS(rs, 3)];
|
||||
Tg = W[4];
|
||||
Ti = Tg * Th;
|
||||
T2o = Tg * Tk;
|
||||
Tj = W[5];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T2p = FNMS(Tj, Th, T2o);
|
||||
}
|
||||
{
|
||||
E Tt, Tw, Tu, T1L, Ts, Tv;
|
||||
Tt = ri[WS(rs, 13)];
|
||||
Tw = ii[WS(rs, 13)];
|
||||
Ts = W[24];
|
||||
Tu = Ts * Tt;
|
||||
T1L = Ts * Tw;
|
||||
Tv = W[25];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1M = FNMS(Tv, Tt, T1L);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1J, Tm, Tp;
|
||||
Tn = ri[WS(rs, 8)];
|
||||
Tq = ii[WS(rs, 8)];
|
||||
Tm = W[14];
|
||||
To = Tm * Tn;
|
||||
T1J = Tm * Tq;
|
||||
Tp = W[15];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1K = FNMS(Tp, Tn, T1J);
|
||||
}
|
||||
{
|
||||
E T1N, Ty, T1I, T2s, T2q, T2r;
|
||||
T1N = T1K - T1M;
|
||||
Ty = Tr + Tx;
|
||||
T1I = FNMS(KP500000000, Ty, Tl);
|
||||
Tz = Tl + Ty;
|
||||
T2e = FMA(KP866025403, T1N, T1I);
|
||||
T1O = FNMS(KP866025403, T1N, T1I);
|
||||
T2s = Tx - Tr;
|
||||
T2q = T1K + T1M;
|
||||
T2r = FNMS(KP500000000, T2q, T2p);
|
||||
T2t = FMA(KP866025403, T2s, T2r);
|
||||
T39 = T2p + T2q;
|
||||
T2X = FNMS(KP866025403, T2s, T2r);
|
||||
}
|
||||
}
|
||||
/* Group {12, 7, 2}: (W[22], W[23]), (W[12], W[13]), (W[2], W[3]). */
{
|
||||
E TF, T2v, TR, T1T, TL, T1R;
|
||||
{
|
||||
E TB, TE, TC, T2u, TA, TD;
|
||||
TB = ri[WS(rs, 12)];
|
||||
TE = ii[WS(rs, 12)];
|
||||
TA = W[22];
|
||||
TC = TA * TB;
|
||||
T2u = TA * TE;
|
||||
TD = W[23];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T2v = FNMS(TD, TB, T2u);
|
||||
}
|
||||
{
|
||||
E TN, TQ, TO, T1S, TM, TP;
|
||||
TN = ri[WS(rs, 7)];
|
||||
TQ = ii[WS(rs, 7)];
|
||||
TM = W[12];
|
||||
TO = TM * TN;
|
||||
T1S = TM * TQ;
|
||||
TP = W[13];
|
||||
TR = FMA(TP, TQ, TO);
|
||||
T1T = FNMS(TP, TN, T1S);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T1Q, TG, TJ;
|
||||
TH = ri[WS(rs, 2)];
|
||||
TK = ii[WS(rs, 2)];
|
||||
TG = W[2];
|
||||
TI = TG * TH;
|
||||
T1Q = TG * TK;
|
||||
TJ = W[3];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T1R = FNMS(TJ, TH, T1Q);
|
||||
}
|
||||
{
|
||||
E T1U, TS, T1P, T2y, T2w, T2x;
|
||||
T1U = T1R - T1T;
|
||||
TS = TL + TR;
|
||||
T1P = FNMS(KP500000000, TS, TF);
|
||||
TT = TF + TS;
|
||||
T2f = FMA(KP866025403, T1U, T1P);
|
||||
T1V = FNMS(KP866025403, T1U, T1P);
|
||||
T2y = TR - TL;
|
||||
T2w = T1R + T1T;
|
||||
T2x = FNMS(KP500000000, T2w, T2v);
|
||||
T2z = FMA(KP866025403, T2y, T2x);
|
||||
T3a = T2v + T2w;
|
||||
T2Y = FNMS(KP866025403, T2y, T2x);
|
||||
}
|
||||
}
|
||||
/* Group {6, 1, 11}: (W[10], W[11]), (W[0], W[1]), (W[20], W[21]). */
{
|
||||
E T10, T2C, T1c, T21, T16, T1Z;
|
||||
{
|
||||
E TW, TZ, TX, T2B, TV, TY;
|
||||
TW = ri[WS(rs, 6)];
|
||||
TZ = ii[WS(rs, 6)];
|
||||
TV = W[10];
|
||||
TX = TV * TW;
|
||||
T2B = TV * TZ;
|
||||
TY = W[11];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T2C = FNMS(TY, TW, T2B);
|
||||
}
|
||||
{
|
||||
E T18, T1b, T19, T20, T17, T1a;
|
||||
T18 = ri[WS(rs, 1)];
|
||||
T1b = ii[WS(rs, 1)];
|
||||
T17 = W[0];
|
||||
T19 = T17 * T18;
|
||||
T20 = T17 * T1b;
|
||||
T1a = W[1];
|
||||
T1c = FMA(T1a, T1b, T19);
|
||||
T21 = FNMS(T1a, T18, T20);
|
||||
}
|
||||
{
|
||||
E T12, T15, T13, T1Y, T11, T14;
|
||||
T12 = ri[WS(rs, 11)];
|
||||
T15 = ii[WS(rs, 11)];
|
||||
T11 = W[20];
|
||||
T13 = T11 * T12;
|
||||
T1Y = T11 * T15;
|
||||
T14 = W[21];
|
||||
T16 = FMA(T14, T15, T13);
|
||||
T1Z = FNMS(T14, T12, T1Y);
|
||||
}
|
||||
{
|
||||
E T22, T1d, T1X, T2F, T2D, T2E;
|
||||
T22 = T1Z - T21;
|
||||
T1d = T16 + T1c;
|
||||
T1X = FNMS(KP500000000, T1d, T10);
|
||||
T1e = T10 + T1d;
|
||||
T2h = FMA(KP866025403, T22, T1X);
|
||||
T23 = FNMS(KP866025403, T22, T1X);
|
||||
T2F = T1c - T16;
|
||||
T2D = T1Z + T21;
|
||||
T2E = FNMS(KP500000000, T2D, T2C);
|
||||
T2G = FMA(KP866025403, T2F, T2E);
|
||||
T36 = T2C + T2D;
|
||||
T2U = FNMS(KP866025403, T2F, T2E);
|
||||
}
|
||||
}
|
||||
/* ri outputs 0, 3, 6, 9, 12: combine the five plain group sums. */
{
|
||||
E T3c, T3e, Tf, T1A, T33, T34, T3d, T35;
|
||||
{
|
||||
E T38, T3b, TU, T1z;
|
||||
T38 = T36 - T37;
|
||||
T3b = T39 - T3a;
|
||||
T3c = FNMS(KP618033988, T3b, T38);
|
||||
T3e = FMA(KP618033988, T38, T3b);
|
||||
Tf = T1 + Te;
|
||||
TU = Tz + TT;
|
||||
T1z = T1e + T1y;
|
||||
T1A = TU + T1z;
|
||||
T33 = FNMS(KP250000000, T1A, Tf);
|
||||
T34 = TU - T1z;
|
||||
}
|
||||
ri[0] = Tf + T1A;
|
||||
T3d = FMA(KP559016994, T34, T33);
|
||||
ri[WS(rs, 9)] = FNMS(KP951056516, T3e, T3d);
|
||||
ri[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
|
||||
T35 = FNMS(KP559016994, T34, T33);
|
||||
ri[WS(rs, 12)] = FNMS(KP951056516, T3c, T35);
|
||||
ri[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
|
||||
}
|
||||
/* ii outputs 0, 3, 6, 9, 12. */
{
|
||||
E T3q, T3s, T3k, T3h, T3l, T3m, T3r, T3n;
|
||||
{
|
||||
E T3o, T3p, T3f, T3g;
|
||||
T3o = T1e - T1y;
|
||||
T3p = Tz - TT;
|
||||
T3q = FNMS(KP618033988, T3p, T3o);
|
||||
T3s = FMA(KP618033988, T3o, T3p);
|
||||
T3k = T3i + T3j;
|
||||
T3f = T39 + T3a;
|
||||
T3g = T36 + T37;
|
||||
T3h = T3f + T3g;
|
||||
T3l = FNMS(KP250000000, T3h, T3k);
|
||||
T3m = T3f - T3g;
|
||||
}
|
||||
ii[0] = T3h + T3k;
|
||||
T3r = FMA(KP559016994, T3m, T3l);
|
||||
ii[WS(rs, 6)] = FNMS(KP951056516, T3s, T3r);
|
||||
ii[WS(rs, 9)] = FMA(KP951056516, T3s, T3r);
|
||||
T3n = FNMS(KP559016994, T3m, T3l);
|
||||
ii[WS(rs, 3)] = FNMS(KP951056516, T3q, T3n);
|
||||
ii[WS(rs, 12)] = FMA(KP951056516, T3q, T3n);
|
||||
}
|
||||
/* ri outputs 2, 5, 8, 11, 14: use the "- KP866025403" group values. */
{
|
||||
E T30, T32, T1H, T2c, T2R, T2S, T31, T2T;
|
||||
{
|
||||
E T2W, T2Z, T1W, T2b;
|
||||
T2W = T2U - T2V;
|
||||
T2Z = T2X - T2Y;
|
||||
T30 = FNMS(KP618033988, T2Z, T2W);
|
||||
T32 = FMA(KP618033988, T2W, T2Z);
|
||||
T1H = FNMS(KP866025403, T1G, T1B);
|
||||
T1W = T1O + T1V;
|
||||
T2b = T23 + T2a;
|
||||
T2c = T1W + T2b;
|
||||
T2R = FNMS(KP250000000, T2c, T1H);
|
||||
T2S = T1W - T2b;
|
||||
}
|
||||
ri[WS(rs, 5)] = T1H + T2c;
|
||||
T31 = FMA(KP559016994, T2S, T2R);
|
||||
ri[WS(rs, 14)] = FNMS(KP951056516, T32, T31);
|
||||
ri[WS(rs, 11)] = FMA(KP951056516, T32, T31);
|
||||
T2T = FNMS(KP559016994, T2S, T2R);
|
||||
ri[WS(rs, 2)] = FNMS(KP951056516, T30, T2T);
|
||||
ri[WS(rs, 8)] = FMA(KP951056516, T30, T2T);
|
||||
}
|
||||
/* ii outputs 2, 5, 8, 11, 14. */
{
|
||||
E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N;
|
||||
{
|
||||
E T3O, T3P, T3I, T3J;
|
||||
T3O = T23 - T2a;
|
||||
T3P = T1O - T1V;
|
||||
T3Q = FNMS(KP618033988, T3P, T3O);
|
||||
T3S = FMA(KP618033988, T3O, T3P);
|
||||
T3H = FNMS(KP866025403, T3u, T3t);
|
||||
T3I = T2X + T2Y;
|
||||
T3J = T2U + T2V;
|
||||
T3K = T3I + T3J;
|
||||
T3L = FNMS(KP250000000, T3K, T3H);
|
||||
T3M = T3I - T3J;
|
||||
}
|
||||
ii[WS(rs, 5)] = T3K + T3H;
|
||||
T3R = FMA(KP559016994, T3M, T3L);
|
||||
ii[WS(rs, 11)] = FNMS(KP951056516, T3S, T3R);
|
||||
ii[WS(rs, 14)] = FMA(KP951056516, T3S, T3R);
|
||||
T3N = FNMS(KP559016994, T3M, T3L);
|
||||
ii[WS(rs, 2)] = FMA(KP951056516, T3Q, T3N);
|
||||
ii[WS(rs, 8)] = FNMS(KP951056516, T3Q, T3N);
|
||||
}
|
||||
/* ii outputs 1, 4, 7, 10, 13: use the "+ KP866025403" group values. */
{
|
||||
E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B;
|
||||
{
|
||||
E T3C, T3D, T3w, T3x;
|
||||
T3C = T2e - T2f;
|
||||
T3D = T2h - T2i;
|
||||
T3E = FMA(KP618033988, T3D, T3C);
|
||||
T3G = FNMS(KP618033988, T3C, T3D);
|
||||
T3v = FMA(KP866025403, T3u, T3t);
|
||||
T3w = T2t + T2z;
|
||||
T3x = T2G + T2M;
|
||||
T3y = T3w + T3x;
|
||||
T3z = FNMS(KP250000000, T3y, T3v);
|
||||
T3A = T3w - T3x;
|
||||
}
|
||||
ii[WS(rs, 10)] = T3y + T3v;
|
||||
T3F = FNMS(KP559016994, T3A, T3z);
|
||||
ii[WS(rs, 7)] = FMA(KP951056516, T3G, T3F);
|
||||
ii[WS(rs, 13)] = FNMS(KP951056516, T3G, T3F);
|
||||
T3B = FMA(KP559016994, T3A, T3z);
|
||||
ii[WS(rs, 1)] = FNMS(KP951056516, T3E, T3B);
|
||||
ii[WS(rs, 4)] = FMA(KP951056516, T3E, T3B);
|
||||
}
|
||||
/* ri outputs 1, 4, 7, 10, 13. */
{
|
||||
E T2O, T2Q, T2d, T2k, T2l, T2m, T2P, T2n;
|
||||
{
|
||||
E T2A, T2N, T2g, T2j;
|
||||
T2A = T2t - T2z;
|
||||
T2N = T2G - T2M;
|
||||
T2O = FMA(KP618033988, T2N, T2A);
|
||||
T2Q = FNMS(KP618033988, T2A, T2N);
|
||||
T2d = FMA(KP866025403, T1G, T1B);
|
||||
T2g = T2e + T2f;
|
||||
T2j = T2h + T2i;
|
||||
T2k = T2g + T2j;
|
||||
T2l = FNMS(KP250000000, T2k, T2d);
|
||||
T2m = T2g - T2j;
|
||||
}
|
||||
ri[WS(rs, 10)] = T2d + T2k;
|
||||
T2P = FNMS(KP559016994, T2m, T2l);
|
||||
ri[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
|
||||
ri[WS(rs, 13)] = FMA(KP951056516, T2Q, T2P);
|
||||
T2n = FMA(KP559016994, T2m, T2l);
|
||||
ri[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
|
||||
ri[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to X(kdft_dit_register) for the FMA build of "t1_15".
 * The triple 72 / 28 / 112 equals the "72 additions, 28 multiplications,
 * 112 fused multiply/add" figures in the generator comment of this variant.
 * NOTE(review): fields are positional; confirm their meaning against the
 * ct_desc declaration.
 */
static const ct_desc desc = {
     15,                    /* presumably the codelet size */
     "t1_15",               /* codelet name */
     twinstr,               /* twiddle-instruction table defined above */
     &GENUS,
     { 72, 28, 112, 0 },    /* adds, muls, fmas as reported by the generator */
     0,
     0,
     0
};
|
||||
|
||||
/*
 * Hands the t1_15 kernel (FMA build) together with its descriptor to
 * X(kdft_dit_register) for planner `p`.
 */
void X(codelet_t1_15)(planner *p)
{
     X(kdft_dit_register)(p, t1_15, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 184 FP additions, 112 FP multiplications,
|
||||
* (or, 128 additions, 56 multiplications, 56 fused multiply/add),
|
||||
* 65 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_15 -- generated twiddle kernel (gen_twiddle -n 15), non-FMA variant.
 *
 * For every m in [mb, me) the kernel reads the 15 value pairs
 * ri[WS(rs, k)] / ii[WS(rs, k)] (k = 0..14), combines pairs 1..14 with the
 * coefficient pairs W[2k-2], W[2k-1], and writes 15 result pairs back to the
 * same ri/ii locations (in place).
 *
 *   ri, ii  data arrays, read and overwritten (presumably the real and
 *           imaginary parts -- confirm against the caller)
 *   W       coefficient table; 28 entries are consumed per iteration
 *   rs      stride handed to WS() to address element k
 *   mb, me  half-open iteration range
 *   ms      amount added to ri and ii after each iteration
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() are macros supplied by
 * the included headers; their definitions are not visible in this file.
 * Statement order and expression shape are kept exactly as generated so that
 * the floating-point results are unchanged.
 */
static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP587785252, +0.587785252292473129168705954639072768597652438);
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    {
        INT m;

        /* Skip the coefficients belonging to iterations before mb. */
        W += mb * 28;
        for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 28, MAKE_VOLATILE_STRIDE(30, rs)) {
            E T1q, T34, Td, T1n, T2S, T35, T13, T1k, T1l, T2E, T2F, T2O, T1H, T1T, T2k;
            E T2t, T2f, T2s, T1M, T1U, Tu, TL, TM, T2H, T2I, T2N, T1w, T1Q, T29, T2w;
            E T24, T2v, T1B, T1R;

            /* Inputs 0, 5 and 10. */
            {
                E T1, T2R, T6, T1o, Tb, T1p, Tc, T2Q;
                T1 = ri[0];
                T2R = ii[0];
                {
                    E T3, T5, T2, T4;
                    T3 = ri[WS(rs, 5)];
                    T5 = ii[WS(rs, 5)];
                    T2 = W[8];
                    T4 = W[9];
                    T6 = FMA(T2, T3, T4 * T5);
                    T1o = FNMS(T4, T3, T2 * T5);
                }
                {
                    E T8, Ta, T7, T9;
                    T8 = ri[WS(rs, 10)];
                    Ta = ii[WS(rs, 10)];
                    T7 = W[18];
                    T9 = W[19];
                    Tb = FMA(T7, T8, T9 * Ta);
                    T1p = FNMS(T9, T8, T7 * Ta);
                }
                T1q = KP866025403 * (T1o - T1p);
                T34 = KP866025403 * (Tb - T6);
                Tc = T6 + Tb;
                Td = T1 + Tc;
                T1n = FNMS(KP500000000, Tc, T1);
                T2Q = T1o + T1p;
                T2S = T2Q + T2R;
                T35 = FNMS(KP500000000, T2Q, T2R);
            }

            /* Inputs 6, 9, 11, 1, 14 and 4. */
            {
                E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
                E T2i;
                {
                    E TO, TQ, TN, TP;
                    TO = ri[WS(rs, 6)];
                    TQ = ii[WS(rs, 6)];
                    TN = W[10];
                    TP = W[11];
                    TR = FMA(TN, TO, TP * TQ);
                    T2c = FNMS(TP, TO, TN * TQ);
                }
                {
                    E T15, T17, T14, T16;
                    T15 = ri[WS(rs, 9)];
                    T17 = ii[WS(rs, 9)];
                    T14 = W[16];
                    T16 = W[17];
                    T18 = FMA(T14, T15, T16 * T17);
                    T2h = FNMS(T16, T15, T14 * T17);
                }
                {
                    E TT, TV, TS, TU;
                    TT = ri[WS(rs, 11)];
                    TV = ii[WS(rs, 11)];
                    TS = W[20];
                    TU = W[21];
                    TW = FMA(TS, TT, TU * TV);
                    T1E = FNMS(TU, TT, TS * TV);
                }
                {
                    E TY, T10, TX, TZ;
                    TY = ri[WS(rs, 1)];
                    T10 = ii[WS(rs, 1)];
                    TX = W[0];
                    TZ = W[1];
                    T11 = FMA(TX, TY, TZ * T10);
                    T1F = FNMS(TZ, TY, TX * T10);
                }
                T12 = TW + T11;
                T2d = T1E + T1F;
                {
                    E T1a, T1c, T19, T1b;
                    T1a = ri[WS(rs, 14)];
                    T1c = ii[WS(rs, 14)];
                    T19 = W[26];
                    T1b = W[27];
                    T1d = FMA(T19, T1a, T1b * T1c);
                    T1J = FNMS(T1b, T1a, T19 * T1c);
                }
                {
                    E T1f, T1h, T1e, T1g;
                    T1f = ri[WS(rs, 4)];
                    T1h = ii[WS(rs, 4)];
                    T1e = W[6];
                    T1g = W[7];
                    T1i = FMA(T1e, T1f, T1g * T1h);
                    T1K = FNMS(T1g, T1f, T1e * T1h);
                }
                T1j = T1d + T1i;
                T2i = T1J + T1K;
                {
                    E T1D, T1G, T2g, T2j;
                    T13 = TR + T12;
                    T1k = T18 + T1j;
                    T1l = T13 + T1k;
                    T2E = T2c + T2d;
                    T2F = T2h + T2i;
                    T2O = T2E + T2F;
                    T1D = FNMS(KP500000000, T12, TR);
                    T1G = KP866025403 * (T1E - T1F);
                    T1H = T1D - T1G;
                    T1T = T1D + T1G;
                    T2g = KP866025403 * (T1i - T1d);
                    T2j = FNMS(KP500000000, T2i, T2h);
                    T2k = T2g + T2j;
                    T2t = T2j - T2g;
                    {
                        E T2b, T2e, T1I, T1L;
                        T2b = KP866025403 * (T11 - TW);
                        T2e = FNMS(KP500000000, T2d, T2c);
                        T2f = T2b + T2e;
                        T2s = T2e - T2b;
                        T1I = FNMS(KP500000000, T1j, T18);
                        T1L = KP866025403 * (T1J - T1K);
                        T1M = T1I - T1L;
                        T1U = T1I + T1L;
                    }
                }
            }

            /* Inputs 3, 12, 8, 13, 2 and 7. */
            {
                E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
                E T27;
                {
                    E Tf, Th, Te, Tg;
                    Tf = ri[WS(rs, 3)];
                    Th = ii[WS(rs, 3)];
                    Te = W[4];
                    Tg = W[5];
                    Ti = FMA(Te, Tf, Tg * Th);
                    T21 = FNMS(Tg, Tf, Te * Th);
                }
                {
                    E Tw, Ty, Tv, Tx;
                    Tw = ri[WS(rs, 12)];
                    Ty = ii[WS(rs, 12)];
                    Tv = W[22];
                    Tx = W[23];
                    Tz = FMA(Tv, Tw, Tx * Ty);
                    T26 = FNMS(Tx, Tw, Tv * Ty);
                }
                {
                    E Tk, Tm, Tj, Tl;
                    Tk = ri[WS(rs, 8)];
                    Tm = ii[WS(rs, 8)];
                    Tj = W[14];
                    Tl = W[15];
                    Tn = FMA(Tj, Tk, Tl * Tm);
                    T1t = FNMS(Tl, Tk, Tj * Tm);
                }
                {
                    E Tp, Tr, To, Tq;
                    Tp = ri[WS(rs, 13)];
                    Tr = ii[WS(rs, 13)];
                    To = W[24];
                    Tq = W[25];
                    Ts = FMA(To, Tp, Tq * Tr);
                    T1u = FNMS(Tq, Tp, To * Tr);
                }
                Tt = Tn + Ts;
                T22 = T1t + T1u;
                {
                    E TB, TD, TA, TC;
                    TB = ri[WS(rs, 2)];
                    TD = ii[WS(rs, 2)];
                    TA = W[2];
                    TC = W[3];
                    TE = FMA(TA, TB, TC * TD);
                    T1y = FNMS(TC, TB, TA * TD);
                }
                {
                    E TG, TI, TF, TH;
                    TG = ri[WS(rs, 7)];
                    TI = ii[WS(rs, 7)];
                    TF = W[12];
                    TH = W[13];
                    TJ = FMA(TF, TG, TH * TI);
                    T1z = FNMS(TH, TG, TF * TI);
                }
                TK = TE + TJ;
                T27 = T1y + T1z;
                {
                    E T1s, T1v, T25, T28;
                    Tu = Ti + Tt;
                    TL = Tz + TK;
                    TM = Tu + TL;
                    T2H = T21 + T22;
                    T2I = T26 + T27;
                    T2N = T2H + T2I;
                    T1s = FNMS(KP500000000, Tt, Ti);
                    T1v = KP866025403 * (T1t - T1u);
                    T1w = T1s - T1v;
                    T1Q = T1s + T1v;
                    T25 = KP866025403 * (TJ - TE);
                    T28 = FNMS(KP500000000, T27, T26);
                    T29 = T25 + T28;
                    T2w = T28 - T25;
                    {
                        E T20, T23, T1x, T1A;
                        T20 = KP866025403 * (Ts - Tn);
                        T23 = FNMS(KP500000000, T22, T21);
                        T24 = T20 + T23;
                        T2v = T23 - T20;
                        T1x = FNMS(KP500000000, TK, Tz);
                        T1A = KP866025403 * (T1y - T1z);
                        T1B = T1x - T1A;
                        T1R = T1x + T1A;
                    }
                }
            }

            /* Outputs ri[0], ri[3], ri[6], ri[9], ri[12]. */
            {
                E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
                T2C = KP559016994 * (TM - T1l);
                T1m = TM + T1l;
                T2B = FNMS(KP250000000, T1m, Td);
                T2G = T2E - T2F;
                T2J = T2H - T2I;
                T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
                T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
                ri[0] = Td + T1m;
                T2L = T2C + T2B;
                ri[WS(rs, 9)] = T2L - T2M;
                ri[WS(rs, 6)] = T2L + T2M;
                T2D = T2B - T2C;
                ri[WS(rs, 12)] = T2D - T2K;
                ri[WS(rs, 3)] = T2D + T2K;
            }

            /* Outputs ii[0], ii[3], ii[6], ii[9], ii[12]. */
            {
                E T2U, T2P, T2T, T2Y, T30, T2W, T2X, T2Z, T2V;
                T2U = KP559016994 * (T2N - T2O);
                T2P = T2N + T2O;
                T2T = FNMS(KP250000000, T2P, T2S);
                T2W = T13 - T1k;
                T2X = Tu - TL;
                T2Y = FNMS(KP587785252, T2X, KP951056516 * T2W);
                T30 = FMA(KP951056516, T2X, KP587785252 * T2W);
                ii[0] = T2P + T2S;
                T2Z = T2U + T2T;
                ii[WS(rs, 6)] = T2Z - T30;
                ii[WS(rs, 9)] = T30 + T2Z;
                T2V = T2T - T2U;
                ii[WS(rs, 3)] = T2V - T2Y;
                ii[WS(rs, 12)] = T2Y + T2V;
            }

            /* Outputs ri[2], ri[5], ri[8], ri[11], ri[14]. */
            {
                E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
                {
                    E T2u, T2x, T1C, T1N;
                    T2u = T2s - T2t;
                    T2x = T2v - T2w;
                    T2y = FNMS(KP587785252, T2x, KP951056516 * T2u);
                    T2A = FMA(KP951056516, T2x, KP587785252 * T2u);
                    T1r = T1n - T1q;
                    T1C = T1w + T1B;
                    T1N = T1H + T1M;
                    T1O = T1C + T1N;
                    T2p = FNMS(KP250000000, T1O, T1r);
                    T2q = KP559016994 * (T1C - T1N);
                }
                ri[WS(rs, 5)] = T1r + T1O;
                T2z = T2q + T2p;
                ri[WS(rs, 14)] = T2z - T2A;
                ri[WS(rs, 11)] = T2z + T2A;
                T2r = T2p - T2q;
                ri[WS(rs, 2)] = T2r - T2y;
                ri[WS(rs, 8)] = T2r + T2y;
            }

            /* Outputs ii[2], ii[5], ii[8], ii[11], ii[14]. */
            {
                E T3h, T3q, T3i, T3l, T3m, T3n, T3p, T3o;
                {
                    E T3f, T3g, T3j, T3k;
                    T3f = T1H - T1M;
                    T3g = T1w - T1B;
                    T3h = FNMS(KP587785252, T3g, KP951056516 * T3f);
                    T3q = FMA(KP951056516, T3g, KP587785252 * T3f);
                    T3i = T35 - T34;
                    T3j = T2v + T2w;
                    T3k = T2s + T2t;
                    T3l = T3j + T3k;
                    T3m = FNMS(KP250000000, T3l, T3i);
                    T3n = KP559016994 * (T3j - T3k);
                }
                ii[WS(rs, 5)] = T3l + T3i;
                T3p = T3n + T3m;
                ii[WS(rs, 11)] = T3p - T3q;
                ii[WS(rs, 14)] = T3q + T3p;
                T3o = T3m - T3n;
                ii[WS(rs, 2)] = T3h + T3o;
                ii[WS(rs, 8)] = T3o - T3h;
            }

            /* Outputs ii[1], ii[4], ii[7], ii[10], ii[13]. */
            {
                E T3c, T3d, T36, T37, T33, T38, T3e, T39;
                {
                    E T3a, T3b, T31, T32;
                    T3a = T1Q - T1R;
                    T3b = T1T - T1U;
                    T3c = FMA(KP951056516, T3a, KP587785252 * T3b);
                    T3d = FNMS(KP587785252, T3a, KP951056516 * T3b);
                    T36 = T34 + T35;
                    T31 = T24 + T29;
                    T32 = T2f + T2k;
                    T37 = T31 + T32;
                    T33 = KP559016994 * (T31 - T32);
                    T38 = FNMS(KP250000000, T37, T36);
                }
                ii[WS(rs, 10)] = T37 + T36;
                T3e = T38 - T33;
                ii[WS(rs, 7)] = T3d + T3e;
                ii[WS(rs, 13)] = T3e - T3d;
                T39 = T33 + T38;
                ii[WS(rs, 1)] = T39 - T3c;
                ii[WS(rs, 4)] = T3c + T39;
            }

            /* Outputs ri[1], ri[4], ri[7], ri[10], ri[13]. */
            {
                E T2m, T2o, T1P, T1W, T1X, T1Y, T2n, T1Z;
                {
                    E T2a, T2l, T1S, T1V;
                    T2a = T24 - T29;
                    T2l = T2f - T2k;
                    T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
                    T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
                    T1P = T1n + T1q;
                    T1S = T1Q + T1R;
                    T1V = T1T + T1U;
                    T1W = T1S + T1V;
                    T1X = KP559016994 * (T1S - T1V);
                    T1Y = FNMS(KP250000000, T1W, T1P);
                }
                ri[WS(rs, 10)] = T1P + T1W;
                T2n = T1Y - T1X;
                ri[WS(rs, 7)] = T2n - T2o;
                ri[WS(rs, 13)] = T2n + T2o;
                T1Z = T1X + T1Y;
                ri[WS(rs, 4)] = T1Z - T2m;
                ri[WS(rs, 1)] = T1Z + T2m;
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 128, 56, 56, 0 }, 0, 0, 0 };
|
||||
|
||||
/*
 * Registration entry point: passes the t1_15 kernel together with its
 * descriptor to X(kdft_dit_register) for the given planner.
 */
void X(codelet_t1_15)(planner *p)
{
    X(kdft_dit_register)(p, t1_15, &desc);
}
|
||||
#endif
|
||||
@@ -0,0 +1,796 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 100 FP multiplications,
|
||||
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
|
||||
* 60 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_16 -- generated twiddle kernel (gen_twiddle -fma -n 16), FMA variant.
 *
 * For every m in [mb, me) the kernel reads the 16 value pairs
 * ri[WS(rs, k)] / ii[WS(rs, k)] (k = 0..15), combines pairs 1..15 with the
 * coefficient pairs W[2k-2], W[2k-1], and writes 16 result pairs back to the
 * same ri/ii locations (in place).
 *
 *   ri, ii  data arrays, read and overwritten (presumably the real and
 *           imaginary parts -- confirm against the caller)
 *   W       coefficient table; 30 entries are consumed per iteration
 *   rs      stride handed to WS() to address element k
 *   mb, me  half-open iteration range
 *   ms      amount added to ri and ii after each iteration
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() are macros supplied by
 * the included headers; their definitions are not visible in this file.
 * Statement order and expression shape are kept exactly as generated so that
 * the floating-point results are unchanged.
 */
static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DK(KP414213562, +0.414213562373095048801688724209698078569671875);
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;

        /* Skip the coefficients belonging to iterations before mb. */
        W += mb * 30;
        for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 30, MAKE_VOLATILE_STRIDE(32, rs)) {
            E T8, T3z, T1I, T3o, T1s, T35, T2o, T2r, T1F, T36, T2p, T2w, Tl, T3A, T1N;
            E T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W;
            E T1W, T21;

            /* Inputs 0 and 8. */
            {
                E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
                T1 = ri[0];
                T3n = ii[0];
                T3 = ri[WS(rs, 8)];
                T6 = ii[WS(rs, 8)];
                T2 = W[14];
                T4 = T2 * T3;
                T3l = T2 * T6;
                T5 = W[15];
                T7 = FMA(T5, T6, T4);
                T3m = FNMS(T5, T3, T3l);
                T8 = T1 + T7;
                T3z = T3n - T3m;
                T1I = T1 - T7;
                T3o = T3m + T3n;
            }

            /* Inputs 15 and 7. */
            {
                E T1h, T1k, T1i, T2k, T1n, T1q, T1o, T2m, T1g, T1m;
                T1h = ri[WS(rs, 15)];
                T1k = ii[WS(rs, 15)];
                T1g = W[28];
                T1i = T1g * T1h;
                T2k = T1g * T1k;
                T1n = ri[WS(rs, 7)];
                T1q = ii[WS(rs, 7)];
                T1m = W[12];
                T1o = T1m * T1n;
                T2m = T1m * T1q;
                {
                    E T1l, T2l, T1r, T2n, T1j, T1p;
                    T1j = W[29];
                    T1l = FMA(T1j, T1k, T1i);
                    T2l = FNMS(T1j, T1h, T2k);
                    T1p = W[13];
                    T1r = FMA(T1p, T1q, T1o);
                    T2n = FNMS(T1p, T1n, T2m);
                    T1s = T1l + T1r;
                    T35 = T2l + T2n;
                    T2o = T2l - T2n;
                    T2r = T1l - T1r;
                }
            }

            /* Inputs 3 and 11. */
            {
                E T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z;
                T1u = ri[WS(rs, 3)];
                T1x = ii[WS(rs, 3)];
                T1t = W[4];
                T1v = T1t * T1u;
                T2s = T1t * T1x;
                T1A = ri[WS(rs, 11)];
                T1D = ii[WS(rs, 11)];
                T1z = W[20];
                T1B = T1z * T1A;
                T2u = T1z * T1D;
                {
                    E T1y, T2t, T1E, T2v, T1w, T1C;
                    T1w = W[5];
                    T1y = FMA(T1w, T1x, T1v);
                    T2t = FNMS(T1w, T1u, T2s);
                    T1C = W[21];
                    T1E = FMA(T1C, T1D, T1B);
                    T2v = FNMS(T1C, T1A, T2u);
                    T1F = T1y + T1E;
                    T36 = T2t + T2v;
                    T2p = T1y - T1E;
                    T2w = T2t - T2v;
                }
            }

            /* Inputs 4 and 12. */
            {
                E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
                Ta = ri[WS(rs, 4)];
                Td = ii[WS(rs, 4)];
                T9 = W[6];
                Tb = T9 * Ta;
                T1J = T9 * Td;
                Tg = ri[WS(rs, 12)];
                Tj = ii[WS(rs, 12)];
                Tf = W[22];
                Th = Tf * Tg;
                T1L = Tf * Tj;
                {
                    E Te, T1K, Tk, T1M, Tc, Ti;
                    Tc = W[7];
                    Te = FMA(Tc, Td, Tb);
                    T1K = FNMS(Tc, Ta, T1J);
                    Ti = W[23];
                    Tk = FMA(Ti, Tj, Th);
                    T1M = FNMS(Ti, Tg, T1L);
                    Tl = Te + Tk;
                    T3A = Te - Tk;
                    T1N = T1K - T1M;
                    T3k = T1K + T1M;
                }
            }

            /* Inputs 2 and 10. */
            {
                E To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt;
                To = ri[WS(rs, 2)];
                Tr = ii[WS(rs, 2)];
                Tn = W[2];
                Tp = Tn * To;
                T1P = Tn * Tr;
                Tu = ri[WS(rs, 10)];
                Tx = ii[WS(rs, 10)];
                Tt = W[18];
                Tv = Tt * Tu;
                T1R = Tt * Tx;
                {
                    E Ts, T1Q, Ty, T1S, Tq, Tw;
                    Tq = W[3];
                    Ts = FMA(Tq, Tr, Tp);
                    T1Q = FNMS(Tq, To, T1P);
                    Tw = W[19];
                    Ty = FMA(Tw, Tx, Tv);
                    T1S = FNMS(Tw, Tu, T1R);
                    Tz = Ts + Ty;
                    T2V = T1Q + T1S;
                    T1T = T1Q - T1S;
                    T1U = Ts - Ty;
                }
            }

            /* Inputs 1 and 9. */
            {
                E TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV;
                TQ = ri[WS(rs, 1)];
                TT = ii[WS(rs, 1)];
                TP = W[0];
                TR = TP * TQ;
                T25 = TP * TT;
                TW = ri[WS(rs, 9)];
                TZ = ii[WS(rs, 9)];
                TV = W[16];
                TX = TV * TW;
                T27 = TV * TZ;
                {
                    E TU, T26, T10, T28, TS, TY;
                    TS = W[1];
                    TU = FMA(TS, TT, TR);
                    T26 = FNMS(TS, TQ, T25);
                    TY = W[17];
                    T10 = FMA(TY, TZ, TX);
                    T28 = FNMS(TY, TW, T27);
                    T11 = TU + T10;
                    T30 = T26 + T28;
                    T29 = T26 - T28;
                    T2c = TU - T10;
                }
            }

            /* Inputs 5 and 13. */
            {
                E T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18;
                T13 = ri[WS(rs, 5)];
                T16 = ii[WS(rs, 5)];
                T12 = W[8];
                T14 = T12 * T13;
                T2d = T12 * T16;
                T19 = ri[WS(rs, 13)];
                T1c = ii[WS(rs, 13)];
                T18 = W[24];
                T1a = T18 * T19;
                T2f = T18 * T1c;
                {
                    E T17, T2e, T1d, T2g, T15, T1b;
                    T15 = W[9];
                    T17 = FMA(T15, T16, T14);
                    T2e = FNMS(T15, T13, T2d);
                    T1b = W[25];
                    T1d = FMA(T1b, T1c, T1a);
                    T2g = FNMS(T1b, T19, T2f);
                    T1e = T17 + T1d;
                    T31 = T2e + T2g;
                    T2a = T17 - T1d;
                    T2h = T2e - T2g;
                }
            }

            /* Inputs 14 and 6. */
            {
                E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
                TB = ri[WS(rs, 14)];
                TE = ii[WS(rs, 14)];
                TA = W[26];
                TC = TA * TB;
                T1X = TA * TE;
                TH = ri[WS(rs, 6)];
                TK = ii[WS(rs, 6)];
                TG = W[10];
                TI = TG * TH;
                T1Z = TG * TK;
                {
                    E TF, T1Y, TL, T20, TD, TJ;
                    TD = W[27];
                    TF = FMA(TD, TE, TC);
                    T1Y = FNMS(TD, TB, T1X);
                    TJ = W[11];
                    TL = FMA(TJ, TK, TI);
                    T20 = FNMS(TJ, TH, T1Z);
                    TM = TF + TL;
                    T2W = T1Y + T20;
                    T1W = TF - TL;
                    T21 = T1Y - T20;
                }
            }

            /* Outputs 0, 4, 8 and 12. */
            {
                E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
                {
                    E Tm, TN, T3j, T3p;
                    Tm = T8 + Tl;
                    TN = Tz + TM;
                    TO = Tm + TN;
                    T3e = Tm - TN;
                    T3j = T2V + T2W;
                    T3p = T3k + T3o;
                    T3q = T3j + T3p;
                    T3s = T3p - T3j;
                }
                {
                    E T1f, T1G, T3f, T3g;
                    T1f = T11 + T1e;
                    T1G = T1s + T1F;
                    T1H = T1f + T1G;
                    T3r = T1G - T1f;
                    T3f = T30 + T31;
                    T3g = T35 + T36;
                    T3h = T3f - T3g;
                    T3i = T3f + T3g;
                }
                ri[WS(rs, 8)] = TO - T1H;
                ii[WS(rs, 8)] = T3q - T3i;
                ri[0] = TO + T1H;
                ii[0] = T3i + T3q;
                ri[WS(rs, 12)] = T3e - T3h;
                ii[WS(rs, 12)] = T3s - T3r;
                ri[WS(rs, 4)] = T3e + T3h;
                ii[WS(rs, 4)] = T3r + T3s;
            }

            /* Outputs 2, 6, 10 and 14. */
            {
                E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
                {
                    E T2U, T2X, T3t, T3u;
                    T2U = T8 - Tl;
                    T2X = T2V - T2W;
                    T2Y = T2U + T2X;
                    T3a = T2U - T2X;
                    T3t = TM - Tz;
                    T3u = T3o - T3k;
                    T3v = T3t + T3u;
                    T3x = T3u - T3t;
                }
                {
                    E T2Z, T32, T34, T37;
                    T2Z = T11 - T1e;
                    T32 = T30 - T31;
                    T33 = T2Z + T32;
                    T3b = T32 - T2Z;
                    T34 = T1s - T1F;
                    T37 = T35 - T36;
                    T38 = T34 - T37;
                    T3c = T34 + T37;
                }
                {
                    E T39, T3w, T3d, T3y;
                    T39 = T33 + T38;
                    ri[WS(rs, 10)] = FNMS(KP707106781, T39, T2Y);
                    ri[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
                    T3w = T3b + T3c;
                    ii[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
                    ii[WS(rs, 10)] = FNMS(KP707106781, T3w, T3v);
                    T3d = T3b - T3c;
                    ri[WS(rs, 14)] = FNMS(KP707106781, T3d, T3a);
                    ri[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
                    T3y = T38 - T33;
                    ii[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
                    ii[WS(rs, 14)] = FNMS(KP707106781, T3y, T3x);
                }
            }

            /* Odd-indexed outputs 1, 3, 5, ..., 15. */
            {
                E T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y;
                E T2C;
                {
                    E T1V, T22, T2b, T2i;
                    T1O = T1I - T1N;
                    T3B = T3z - T3A;
                    T3H = T3A + T3z;
                    T2E = T1I + T1N;
                    T1V = T1T - T1U;
                    T22 = T1W + T21;
                    T23 = T1V - T22;
                    T3C = T1V + T22;
                    {
                        E T2M, T2N, T2F, T2G;
                        T2M = T2r + T2w;
                        T2N = T2o - T2p;
                        T2O = FNMS(KP414213562, T2N, T2M);
                        T2S = FMA(KP414213562, T2M, T2N);
                        T2F = T1U + T1T;
                        T2G = T1W - T21;
                        T2H = T2F + T2G;
                        T3I = T2G - T2F;
                    }
                    T2b = T29 + T2a;
                    T2i = T2c - T2h;
                    T2j = FMA(KP414213562, T2i, T2b);
                    T2B = FNMS(KP414213562, T2b, T2i);
                    {
                        E T2J, T2K, T2q, T2x;
                        T2J = T2c + T2h;
                        T2K = T29 - T2a;
                        T2L = FMA(KP414213562, T2K, T2J);
                        T2R = FNMS(KP414213562, T2J, T2K);
                        T2q = T2o + T2p;
                        T2x = T2r - T2w;
                        T2y = FNMS(KP414213562, T2x, T2q);
                        T2C = FMA(KP414213562, T2q, T2x);
                    }
                }
                {
                    E T24, T2z, T3J, T3K;
                    T24 = FMA(KP707106781, T23, T1O);
                    T2z = T2j - T2y;
                    ri[WS(rs, 11)] = FNMS(KP923879532, T2z, T24);
                    ri[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
                    T3J = FMA(KP707106781, T3I, T3H);
                    T3K = T2C - T2B;
                    ii[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
                    ii[WS(rs, 11)] = FNMS(KP923879532, T3K, T3J);
                }
                {
                    E T2A, T2D, T3L, T3M;
                    T2A = FNMS(KP707106781, T23, T1O);
                    T2D = T2B + T2C;
                    ri[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
                    ri[WS(rs, 15)] = FMA(KP923879532, T2D, T2A);
                    T3L = FNMS(KP707106781, T3I, T3H);
                    T3M = T2j + T2y;
                    ii[WS(rs, 7)] = FNMS(KP923879532, T3M, T3L);
                    ii[WS(rs, 15)] = FMA(KP923879532, T3M, T3L);
                }
                {
                    E T2I, T2P, T3D, T3E;
                    T2I = FMA(KP707106781, T2H, T2E);
                    T2P = T2L + T2O;
                    ri[WS(rs, 9)] = FNMS(KP923879532, T2P, T2I);
                    ri[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
                    T3D = FMA(KP707106781, T3C, T3B);
                    T3E = T2R + T2S;
                    ii[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
                    ii[WS(rs, 9)] = FNMS(KP923879532, T3E, T3D);
                }
                {
                    E T2Q, T2T, T3F, T3G;
                    T2Q = FNMS(KP707106781, T2H, T2E);
                    T2T = T2R - T2S;
                    ri[WS(rs, 13)] = FNMS(KP923879532, T2T, T2Q);
                    ri[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
                    T3F = FNMS(KP707106781, T3C, T3B);
                    T3G = T2O - T2L;
                    ii[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
                    ii[WS(rs, 13)] = FNMS(KP923879532, T3G, T3F);
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 104, 30, 70, 0 }, 0, 0, 0 };
|
||||
|
||||
/*
 * Registration entry point: passes the t1_16 kernel together with its
 * descriptor to X(kdft_dit_register) for the given planner.
 */
void X(codelet_t1_16)(planner *p)
{
    X(kdft_dit_register)(p, t1_16, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 84 FP multiplications,
|
||||
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
|
||||
* 52 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_16 -- generated twiddle kernel (gen_twiddle -n 16), non-FMA variant.
 *
 * For every m in [mb, me) the kernel reads the 16 value pairs
 * ri[WS(rs, k)] / ii[WS(rs, k)] (k = 0..15), combines pairs 1..15 with the
 * coefficient pairs W[2k-2], W[2k-1], and writes 16 result pairs back to the
 * same ri/ii locations (in place).
 *
 *   ri, ii  data arrays, read and overwritten (presumably the real and
 *           imaginary parts -- confirm against the caller)
 *   W       coefficient table; 30 entries are consumed per iteration
 *   rs      stride handed to WS() to address element k
 *   mb, me  half-open iteration range
 *   ms      amount added to ri and ii after each iteration
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() are macros supplied by
 * the included headers; their definitions are not visible in this file.
 * Statement order and expression shape are kept exactly as generated so that
 * the floating-point results are unchanged.
 */
static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP382683432, +0.382683432365089771728459984030398866761344562);
    DK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;

        /* Skip the coefficients belonging to iterations before mb. */
        W += mb * 30;
        for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 30, MAKE_VOLATILE_STRIDE(32, rs)) {
            E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
            E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
            E T2y, T2z, T1O, T2g, T1T, T2h;

            /* Inputs 0 and 8. */
            {
                E T1, T2T, T6, T2S;
                T1 = ri[0];
                T2T = ii[0];
                {
                    E T3, T5, T2, T4;
                    T3 = ri[WS(rs, 8)];
                    T5 = ii[WS(rs, 8)];
                    T2 = W[14];
                    T4 = W[15];
                    T6 = FMA(T2, T3, T4 * T5);
                    T2S = FNMS(T4, T3, T2 * T5);
                }
                T7 = T1 + T6;
                T37 = T2T - T2S;
                T1t = T1 - T6;
                T2U = T2S + T2T;
            }

            /* Inputs 4 and 12. */
            {
                E Tc, T1u, Th, T1v;
                {
                    E T9, Tb, T8, Ta;
                    T9 = ri[WS(rs, 4)];
                    Tb = ii[WS(rs, 4)];
                    T8 = W[6];
                    Ta = W[7];
                    Tc = FMA(T8, T9, Ta * Tb);
                    T1u = FNMS(Ta, T9, T8 * Tb);
                }
                {
                    E Te, Tg, Td, Tf;
                    Te = ri[WS(rs, 12)];
                    Tg = ii[WS(rs, 12)];
                    Td = W[22];
                    Tf = W[23];
                    Th = FMA(Td, Te, Tf * Tg);
                    T1v = FNMS(Tf, Te, Td * Tg);
                }
                Ti = Tc + Th;
                T38 = Tc - Th;
                T1w = T1u - T1v;
                T2R = T1u + T1v;
            }

            /* Inputs 2 and 10. */
            {
                E To, T1y, Tt, T1z, T1A, T1B;
                {
                    E Tl, Tn, Tk, Tm;
                    Tl = ri[WS(rs, 2)];
                    Tn = ii[WS(rs, 2)];
                    Tk = W[2];
                    Tm = W[3];
                    To = FMA(Tk, Tl, Tm * Tn);
                    T1y = FNMS(Tm, Tl, Tk * Tn);
                }
                {
                    E Tq, Ts, Tp, Tr;
                    Tq = ri[WS(rs, 10)];
                    Ts = ii[WS(rs, 10)];
                    Tp = W[18];
                    Tr = W[19];
                    Tt = FMA(Tp, Tq, Tr * Ts);
                    T1z = FNMS(Tr, Tq, Tp * Ts);
                }
                Tu = To + Tt;
                T2s = T1y + T1z;
                T1A = T1y - T1z;
                T1B = To - Tt;
                T1C = T1A - T1B;
                T2c = T1B + T1A;
            }

            /* Inputs 14 and 6. */
            {
                E Tz, T1E, TE, T1F, T1D, T1G;
                {
                    E Tw, Ty, Tv, Tx;
                    Tw = ri[WS(rs, 14)];
                    Ty = ii[WS(rs, 14)];
                    Tv = W[26];
                    Tx = W[27];
                    Tz = FMA(Tv, Tw, Tx * Ty);
                    T1E = FNMS(Tx, Tw, Tv * Ty);
                }
                {
                    E TB, TD, TA, TC;
                    TB = ri[WS(rs, 6)];
                    TD = ii[WS(rs, 6)];
                    TA = W[10];
                    TC = W[11];
                    TE = FMA(TA, TB, TC * TD);
                    T1F = FNMS(TC, TB, TA * TD);
                }
                TF = Tz + TE;
                T2t = T1E + T1F;
                T1D = Tz - TE;
                T1G = T1E - T1F;
                T1H = T1D + T1G;
                T2d = T1D - T1G;
            }

            /* Inputs 15, 11, 7 and 3. */
            {
                E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
                {
                    E T16, T18, T15, T17;
                    T16 = ri[WS(rs, 15)];
                    T18 = ii[WS(rs, 15)];
                    T15 = W[28];
                    T17 = W[29];
                    T19 = FMA(T15, T16, T17 * T18);
                    T20 = FNMS(T17, T16, T15 * T18);
                }
                {
                    E T1m, T1o, T1l, T1n;
                    T1m = ri[WS(rs, 11)];
                    T1o = ii[WS(rs, 11)];
                    T1l = W[20];
                    T1n = W[21];
                    T1p = FMA(T1l, T1m, T1n * T1o);
                    T1X = FNMS(T1n, T1m, T1l * T1o);
                }
                {
                    E T1b, T1d, T1a, T1c;
                    T1b = ri[WS(rs, 7)];
                    T1d = ii[WS(rs, 7)];
                    T1a = W[12];
                    T1c = W[13];
                    T1e = FMA(T1a, T1b, T1c * T1d);
                    T21 = FNMS(T1c, T1b, T1a * T1d);
                }
                {
                    E T1h, T1j, T1g, T1i;
                    T1h = ri[WS(rs, 3)];
                    T1j = ii[WS(rs, 3)];
                    T1g = W[4];
                    T1i = W[5];
                    T1k = FMA(T1g, T1h, T1i * T1j);
                    T1W = FNMS(T1i, T1h, T1g * T1j);
                }
                T1f = T19 + T1e;
                T1q = T1k + T1p;
                T2B = T1f - T1q;
                T2C = T20 + T21;
                T2D = T1W + T1X;
                T2E = T2C - T2D;
                {
                    E T1V, T1Y, T22, T23;
                    T1V = T19 - T1e;
                    T1Y = T1W - T1X;
                    T1Z = T1V - T1Y;
                    T2j = T1V + T1Y;
                    T22 = T20 - T21;
                    T23 = T1k - T1p;
                    T24 = T22 + T23;
                    T2k = T22 - T23;
                }
            }

            /* Inputs 1, 13, 9 and 5. */
            {
                E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
                {
                    E TJ, TL, TI, TK;
                    TJ = ri[WS(rs, 1)];
                    TL = ii[WS(rs, 1)];
                    TI = W[0];
                    TK = W[1];
                    TM = FMA(TI, TJ, TK * TL);
                    T1K = FNMS(TK, TJ, TI * TL);
                }
                {
                    E TZ, T11, TY, T10;
                    TZ = ri[WS(rs, 13)];
                    T11 = ii[WS(rs, 13)];
                    TY = W[24];
                    T10 = W[25];
                    T12 = FMA(TY, TZ, T10 * T11);
                    T1R = FNMS(T10, TZ, TY * T11);
                }
                {
                    E TO, TQ, TN, TP;
                    TO = ri[WS(rs, 9)];
                    TQ = ii[WS(rs, 9)];
                    TN = W[16];
                    TP = W[17];
                    TR = FMA(TN, TO, TP * TQ);
                    T1L = FNMS(TP, TO, TN * TQ);
                }
                {
                    E TU, TW, TT, TV;
                    TU = ri[WS(rs, 5)];
                    TW = ii[WS(rs, 5)];
                    TT = W[8];
                    TV = W[9];
                    TX = FMA(TT, TU, TV * TW);
                    T1Q = FNMS(TV, TU, TT * TW);
                }
                TS = TM + TR;
                T13 = TX + T12;
                T2w = TS - T13;
                T2x = T1K + T1L;
                T2y = T1Q + T1R;
                T2z = T2x - T2y;
                {
                    E T1M, T1N, T1P, T1S;
                    T1M = T1K - T1L;
                    T1N = TX - T12;
                    T1O = T1M + T1N;
                    T2g = T1M - T1N;
                    T1P = TM - TR;
                    T1S = T1Q - T1R;
                    T1T = T1P - T1S;
                    T2h = T1P + T1S;
                }
            }

            /* Outputs 3, 7, 11 and 15. */
            {
                E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
                {
                    E T1x, T1I, T3e, T3f;
                    T1x = T1t - T1w;
                    T1I = KP707106781 * (T1C - T1H);
                    T1J = T1x + T1I;
                    T27 = T1x - T1I;
                    T3e = KP707106781 * (T2d - T2c);
                    T3f = T38 + T37;
                    T3g = T3e + T3f;
                    T3i = T3f - T3e;
                }
                {
                    E T1U, T25, T28, T29;
                    T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
                    T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
                    T26 = T1U + T25;
                    T3h = T25 - T1U;
                    T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
                    T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
                    T2a = T28 - T29;
                    T3d = T28 + T29;
                }
                ri[WS(rs, 11)] = T1J - T26;
                ii[WS(rs, 11)] = T3g - T3d;
                ri[WS(rs, 3)] = T1J + T26;
                ii[WS(rs, 3)] = T3d + T3g;
                ri[WS(rs, 15)] = T27 - T2a;
                ii[WS(rs, 15)] = T3i - T3h;
                ri[WS(rs, 7)] = T27 + T2a;
                ii[WS(rs, 7)] = T3h + T3i;
            }

            /* Outputs 2, 6, 10 and 14. */
            {
                E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
                {
                    E T2r, T2u, T30, T31;
                    T2r = T7 - Ti;
                    T2u = T2s - T2t;
                    T2v = T2r + T2u;
                    T2H = T2r - T2u;
                    T30 = TF - Tu;
                    T31 = T2U - T2R;
                    T32 = T30 + T31;
                    T34 = T31 - T30;
                }
                {
                    E T2A, T2F, T2I, T2J;
                    T2A = T2w + T2z;
                    T2F = T2B - T2E;
                    T2G = KP707106781 * (T2A + T2F);
                    T33 = KP707106781 * (T2F - T2A);
                    T2I = T2z - T2w;
                    T2J = T2B + T2E;
                    T2K = KP707106781 * (T2I - T2J);
                    T2Z = KP707106781 * (T2I + T2J);
                }
                ri[WS(rs, 10)] = T2v - T2G;
                ii[WS(rs, 10)] = T32 - T2Z;
                ri[WS(rs, 2)] = T2v + T2G;
                ii[WS(rs, 2)] = T2Z + T32;
                ri[WS(rs, 14)] = T2H - T2K;
                ii[WS(rs, 14)] = T34 - T33;
                ri[WS(rs, 6)] = T2H + T2K;
                ii[WS(rs, 6)] = T33 + T34;
            }

            /* Outputs 1, 5, 9 and 13. */
            {
                E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
                {
                    E T2b, T2e, T36, T39;
                    T2b = T1t + T1w;
                    T2e = KP707106781 * (T2c + T2d);
                    T2f = T2b + T2e;
                    T2n = T2b - T2e;
                    T36 = KP707106781 * (T1C + T1H);
                    T39 = T37 - T38;
                    T3a = T36 + T39;
                    T3c = T39 - T36;
                }
                {
                    E T2i, T2l, T2o, T2p;
                    T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
                    T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
                    T2m = T2i + T2l;
                    T3b = T2l - T2i;
                    T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
                    T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
                    T2q = T2o - T2p;
                    T35 = T2o + T2p;
                }
                ri[WS(rs, 9)] = T2f - T2m;
                ii[WS(rs, 9)] = T3a - T35;
                ri[WS(rs, 1)] = T2f + T2m;
                ii[WS(rs, 1)] = T35 + T3a;
                ri[WS(rs, 13)] = T2n - T2q;
                ii[WS(rs, 13)] = T3c - T3b;
                ri[WS(rs, 5)] = T2n + T2q;
                ii[WS(rs, 5)] = T3b + T3c;
            }

            /* Outputs 0, 4, 8 and 12. */
            {
                E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
                {
                    E Tj, TG, T2Q, T2V;
                    Tj = T7 + Ti;
                    TG = Tu + TF;
                    TH = Tj + TG;
                    T2L = Tj - TG;
                    T2Q = T2s + T2t;
                    T2V = T2R + T2U;
                    T2W = T2Q + T2V;
                    T2Y = T2V - T2Q;
                }
                {
                    E T14, T1r, T2M, T2N;
                    T14 = TS + T13;
                    T1r = T1f + T1q;
                    T1s = T14 + T1r;
                    T2X = T1r - T14;
                    T2M = T2x + T2y;
                    T2N = T2C + T2D;
                    T2O = T2M - T2N;
                    T2P = T2M + T2N;
                }
                ri[WS(rs, 8)] = TH - T1s;
                ii[WS(rs, 8)] = T2W - T2P;
                ri[0] = TH + T1s;
                ii[0] = T2P + T2W;
                ri[WS(rs, 12)] = T2L - T2O;
                ii[WS(rs, 12)] = T2Y - T2X;
                ri[WS(rs, 4)] = T2L + T2O;
                ii[WS(rs, 4)] = T2X + T2Y;
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 136, 46, 38, 0 }, 0, 0, 0 };
|
||||
|
||||
/*
 * Registration entry point: passes the t1_16 kernel together with its
 * descriptor to X(kdft_dit_register) for the given planner.
 */
void X(codelet_t1_16)(planner *p)
{
    X(kdft_dit_register)(p, t1_16, &desc);
}
|
||||
#endif
|
||||
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 11 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_2, FMA-oriented variant.
 *
 * For every m in [mb, me) the complex sample at index 1 is multiplied by
 * (W[0] - i*W[1]) and combined in place with sample 0: slot 0 receives the
 * sum, slot 1 the difference.
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 2, two values per iteration
 *   rs       stride between the two samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += mb * 2;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
        E x0r, x0i, x1r, x1i;
        E wr, wi;
        E prod_r, prod_i;   /* wr * x1 partial products */
        E tw_r, tw_i;       /* sample 1 after the twiddle multiplication */

        /* All loads happen before any store: the transform is in place. */
        x0r = ri[0];
        x0i = ii[0];
        x1r = ri[WS(rs, 1)];
        x1i = ii[WS(rs, 1)];
        wr = W[0];
        wi = W[1];

        prod_r = wr * x1r;
        prod_i = wr * x1i;
        tw_r = FMA(wi, x1i, prod_r);
        tw_i = FNMS(wi, x1r, prod_i);

        ri[WS(rs, 1)] = x0r - tw_r;
        ii[WS(rs, 1)] = x0i - tw_i;
        ri[0] = x0r + tw_r;
        ii[0] = tw_i + x0i;
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, { 4, 2, 2, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_2) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_2, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 9 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_2, non-FMA variant.
 *
 * For every m in [mb, me) the complex sample at index 1 is multiplied by
 * (W[0] - i*W[1]) and combined in place with sample 0: slot 0 receives the
 * sum, slot 1 the difference.
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 2, two values per iteration
 *   rs       stride between the two samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += mb * 2;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
        E x0r, x0i, x1r, x1i;
        E wr, wi;
        E tw_r, tw_i;   /* sample 1 after the twiddle multiplication */

        /* All loads happen before any store: the transform is in place. */
        x0r = ri[0];
        x0i = ii[0];
        x1r = ri[WS(rs, 1)];
        x1i = ii[WS(rs, 1)];
        wr = W[0];
        wi = W[1];

        tw_r = FMA(wr, x1r, wi * x1i);
        tw_i = FNMS(wi, x1r, wr * x1i);

        ri[WS(rs, 1)] = x0r - tw_r;
        ii[WS(rs, 1)] = x0i - tw_i;
        ri[0] = x0r + tw_r;
        ii[0] = tw_i + x0i;
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, { 4, 2, 2, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_2) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_2, &desc);
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 14 FP multiplications,
|
||||
* (or, 6 additions, 4 multiplications, 10 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_3, FMA-oriented variant.
 *
 * For every m in [mb, me): samples 1 and 2 are multiplied by
 * (W[0] - i*W[1]) and (W[2] - i*W[3]) respectively, then the three samples
 * are combined in place using the constants 0.5 and 0.866025...
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 4, four values per iteration
 *   rs       stride between samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);

    W += mb * 4;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
        E x0r, x0i, x1r, x1i, x2r, x2i;
        E w1r, w1i, w2r, w2i;
        E p1r, p1i, p2r, p2i;     /* wKr * xK partial products */
        E t1r, t1i, t2r, t2i;     /* twiddled samples 1 and 2 */
        E sum_r, sum_i, dif_r, dif_i, mid_r, mid_i;

        /* All loads happen before any store: the transform is in place. */
        x0r = ri[0];
        x0i = ii[0];
        x1r = ri[WS(rs, 1)];
        x1i = ii[WS(rs, 1)];
        x2r = ri[WS(rs, 2)];
        x2i = ii[WS(rs, 2)];
        w1r = W[0];
        w1i = W[1];
        w2r = W[2];
        w2i = W[3];

        p1r = w1r * x1r;
        p1i = w1r * x1i;
        t1r = FMA(w1i, x1i, p1r);
        t1i = FNMS(w1i, x1r, p1i);

        p2r = w2r * x2r;
        p2i = w2r * x2i;
        t2r = FMA(w2i, x2i, p2r);
        t2i = FNMS(w2i, x2r, p2i);

        /* real outputs */
        dif_i = t1i - t2i;
        sum_r = t1r + t2r;
        mid_r = FNMS(KP500000000, sum_r, x0r);
        ri[0] = x0r + sum_r;
        ri[WS(rs, 1)] = FMA(KP866025403, dif_i, mid_r);
        ri[WS(rs, 2)] = FNMS(KP866025403, dif_i, mid_r);

        /* imaginary outputs */
        dif_r = t2r - t1r;
        sum_i = t1i + t2i;
        mid_i = FNMS(KP500000000, sum_i, x0i);
        ii[0] = sum_i + x0i;
        ii[WS(rs, 2)] = FNMS(KP866025403, dif_r, mid_i);
        ii[WS(rs, 1)] = FMA(KP866025403, dif_r, mid_i);
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, { 6, 4, 10, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_3) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_3, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 12 FP multiplications,
|
||||
* (or, 10 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_3, non-FMA variant.
 *
 * For every m in [mb, me): samples 1 and 2 are multiplied by
 * (W[0] - i*W[1]) and (W[2] - i*W[3]) respectively, then the three samples
 * are combined in place using the constants 0.5 and 0.866025...
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 4, four values per iteration
 *   rs       stride between samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);

    W += mb * 4;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
        E x0r, x0i, x1r, x1i, x2r, x2i;
        E w1r, w1i, w2r, w2i;
        E t1r, t1i, t2r, t2i;     /* twiddled samples 1 and 2 */
        E sum_r, sum_i;
        E mid_r, mid_i, rot_r, rot_i;

        /* All loads happen before any store: the transform is in place. */
        x0r = ri[0];
        x0i = ii[0];
        x1r = ri[WS(rs, 1)];
        x1i = ii[WS(rs, 1)];
        x2r = ri[WS(rs, 2)];
        x2i = ii[WS(rs, 2)];
        w1r = W[0];
        w1i = W[1];
        w2r = W[2];
        w2i = W[3];

        t1r = FMA(w1r, x1r, w1i * x1i);
        t1i = FNMS(w1i, x1r, w1r * x1i);
        t2r = FMA(w2r, x2r, w2i * x2i);
        t2i = FNMS(w2i, x2r, w2r * x2i);

        sum_r = t1r + t2r;
        sum_i = t1i + t2i;
        ri[0] = x0r + sum_r;
        ii[0] = sum_i + x0i;

        mid_r = FNMS(KP500000000, sum_r, x0r);
        rot_r = KP866025403 * (t1i - t2i);
        ri[WS(rs, 2)] = mid_r - rot_r;
        ri[WS(rs, 1)] = mid_r + rot_r;

        rot_i = KP866025403 * (t2r - t1r);
        mid_i = FNMS(KP500000000, sum_i, x0i);
        ii[WS(rs, 1)] = rot_i + mid_i;
        ii[WS(rs, 2)] = mid_i - rot_i;
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, { 10, 6, 6, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_3) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_3, &desc);
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_4, FMA-oriented variant.
 *
 * For every m in [mb, me): sample k (k = 1, 2, 3) is multiplied by
 * (W[2k-2] - i*W[2k-1]); the four samples are then combined in place using
 * additions and subtractions only.
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 6, six values per iteration
 *   rs       stride between samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += mb * 6;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
        E x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
        E w1r, w1i, w2r, w2i, w3r, w3i;
        E p1r, p1i, p2r, p2i, p3r, p3i;   /* wKr * xK partial products */
        E t1r, t1i, t2r, t2i, t3r, t3i;   /* twiddled samples */
        E even_r, even_i, odd_r, odd_i;
        E half_r, half_i, cross_r, cross_i;

        /* All loads happen before any store: the transform is in place. */
        x0r = ri[0];
        x0i = ii[0];
        x1r = ri[WS(rs, 1)];
        x1i = ii[WS(rs, 1)];
        x2r = ri[WS(rs, 2)];
        x2i = ii[WS(rs, 2)];
        x3r = ri[WS(rs, 3)];
        x3i = ii[WS(rs, 3)];
        w1r = W[0];
        w1i = W[1];
        w2r = W[2];
        w2i = W[3];
        w3r = W[4];
        w3i = W[5];

        p2r = w2r * x2r;
        p2i = w2r * x2i;
        t2r = FMA(w2i, x2i, p2r);
        t2i = FNMS(w2i, x2r, p2i);

        p1r = w1r * x1r;
        p1i = w1r * x1i;
        t1r = FMA(w1i, x1i, p1r);
        t1i = FNMS(w1i, x1r, p1i);

        p3r = w3r * x3r;
        p3i = w3r * x3i;
        t3r = FMA(w3i, x3i, p3r);
        t3i = FNMS(w3i, x3r, p3i);

        /* outputs 0 and 2 */
        even_r = x0r + t2r;
        odd_r = t1r + t3r;
        ri[WS(rs, 2)] = even_r - odd_r;
        ri[0] = even_r + odd_r;
        odd_i = t1i + t3i;
        even_i = t2i + x0i;
        ii[0] = odd_i + even_i;
        ii[WS(rs, 2)] = even_i - odd_i;

        /* outputs 1 and 3 */
        half_r = x0r - t2r;
        cross_i = t1i - t3i;
        ri[WS(rs, 3)] = half_r - cross_i;
        ri[WS(rs, 1)] = half_r + cross_i;
        half_i = x0i - t2i;
        cross_r = t1r - t3r;
        ii[WS(rs, 1)] = half_i - cross_r;
        ii[WS(rs, 3)] = cross_r + half_i;
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, { 16, 6, 6, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_4) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_4, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_4, non-FMA variant.
 *
 * For every m in [mb, me): sample k (k = 1, 2, 3) is multiplied by
 * (W[2k-2] - i*W[2k-1]); the four samples are then combined in place using
 * additions and subtractions only.
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 6, six values per iteration
 *   rs       stride between samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += mb * 6;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
        E x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
        E w1r, w1i, w2r, w2i, w3r, w3i;
        E t1r, t1i, t2r, t2i, t3r, t3i;   /* twiddled samples */
        E even_r, even_i, odd_r, odd_i;
        E half_r, half_i, cross_r, cross_i;

        /* All loads happen before any store: the transform is in place. */
        x0r = ri[0];
        x0i = ii[0];
        x1r = ri[WS(rs, 1)];
        x1i = ii[WS(rs, 1)];
        x2r = ri[WS(rs, 2)];
        x2i = ii[WS(rs, 2)];
        x3r = ri[WS(rs, 3)];
        x3i = ii[WS(rs, 3)];
        w1r = W[0];
        w1i = W[1];
        w2r = W[2];
        w2i = W[3];
        w3r = W[4];
        w3i = W[5];

        t2r = FMA(w2r, x2r, w2i * x2i);
        t2i = FNMS(w2i, x2r, w2r * x2i);
        t1r = FMA(w1r, x1r, w1i * x1i);
        t1i = FNMS(w1i, x1r, w1r * x1i);
        t3r = FMA(w3r, x3r, w3i * x3i);
        t3i = FNMS(w3i, x3r, w3r * x3i);

        /* outputs 0 and 2 */
        even_r = x0r + t2r;
        odd_r = t1r + t3r;
        ri[WS(rs, 2)] = even_r - odd_r;
        ri[0] = even_r + odd_r;
        odd_i = t1i + t3i;
        even_i = t2i + x0i;
        ii[0] = odd_i + even_i;
        ii[WS(rs, 2)] = even_i - odd_i;

        /* outputs 1 and 3 */
        half_r = x0r - t2r;
        cross_i = t1i - t3i;
        ri[WS(rs, 3)] = half_r - cross_i;
        ri[WS(rs, 1)] = half_r + cross_i;
        half_i = x0i - t2i;
        cross_r = t1r - t3r;
        ii[WS(rs, 1)] = half_i - cross_r;
        ii[WS(rs, 3)] = cross_r + half_i;
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, { 16, 6, 6, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_4) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_4, &desc);
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,253 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 40 FP additions, 34 FP multiplications,
|
||||
* (or, 14 additions, 8 multiplications, 26 fused multiply/add),
|
||||
* 31 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_5, FMA-oriented variant.
 *
 * For every m in [mb, me): sample k (k = 1..4) is multiplied by
 * (W[2k-2] - i*W[2k-1]); the five samples are then combined in place using
 * the four constants declared below.
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 8, eight values per iteration
 *   rs       stride between samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP618033988, +0.618033988749894848204586834365638117720309180);
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);

    W += mb * 8;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 8, MAKE_VOLATILE_STRIDE(10, rs)) {
        E x0r, x0i;
        E t1r, t1i, t2r, t2i, t3r, t3i, t4r, t4i;   /* twiddled samples */
        E s14r, s14i, s23r, s23i;                   /* pair sums 1+4 and 2+3 */

        x0r = ri[0];
        x0i = ii[0];

        /* samples 1 and 4 */
        {
            E x1r, x1i, x4r, x4i;
            E w1r, w1i, w4r, w4i;
            E p1r, p1i, p4r, p4i;

            x1r = ri[WS(rs, 1)];
            x1i = ii[WS(rs, 1)];
            x4r = ri[WS(rs, 4)];
            x4i = ii[WS(rs, 4)];
            w1r = W[0];
            w1i = W[1];
            w4r = W[6];
            w4i = W[7];

            p1r = w1r * x1r;
            p1i = w1r * x1i;
            p4r = w4r * x4r;
            p4i = w4r * x4i;

            t1r = FMA(w1i, x1i, p1r);
            t1i = FNMS(w1i, x1r, p1i);
            t4r = FMA(w4i, x4i, p4r);
            t4i = FNMS(w4i, x4r, p4i);

            s14r = t1r + t4r;
            s14i = t1i + t4i;
        }

        /* samples 2 and 3 */
        {
            E x2r, x2i, x3r, x3i;
            E w2r, w2i, w3r, w3i;
            E p2r, p2i, p3r, p3i;

            x2r = ri[WS(rs, 2)];
            x2i = ii[WS(rs, 2)];
            x3r = ri[WS(rs, 3)];
            x3i = ii[WS(rs, 3)];
            w2r = W[2];
            w2i = W[3];
            w3r = W[4];
            w3i = W[5];

            p2r = w2r * x2r;
            p2i = w2r * x2i;
            p3r = w3r * x3r;
            p3i = w3r * x3i;

            t2r = FMA(w2i, x2i, p2r);
            t2i = FNMS(w2i, x2r, p2i);
            t3r = FMA(w3i, x3i, p3r);
            t3i = FNMS(w3i, x3r, p3i);

            s23r = t2r + t3r;
            s23i = t2i + t3i;
        }

        /* real outputs */
        {
            E gap, total, centre, d14, d23, rot_a, rot_b, lo, hi;

            gap = s14r - s23r;
            total = s14r + s23r;
            centre = FNMS(KP250000000, total, x0r);
            d14 = t1i - t4i;
            d23 = t2i - t3i;
            rot_a = FMA(KP618033988, d23, d14);
            rot_b = FNMS(KP618033988, d14, d23);
            ri[0] = x0r + total;
            lo = FNMS(KP559016994, gap, centre);
            ri[WS(rs, 2)] = FNMS(KP951056516, rot_b, lo);
            ri[WS(rs, 3)] = FMA(KP951056516, rot_b, lo);
            hi = FMA(KP559016994, gap, centre);
            ri[WS(rs, 4)] = FNMS(KP951056516, rot_a, hi);
            ri[WS(rs, 1)] = FMA(KP951056516, rot_a, hi);
        }

        /* imaginary outputs */
        {
            E gap, total, centre, d14, d23, rot_a, rot_b, lo, hi;

            gap = s14i - s23i;
            total = s14i + s23i;
            centre = FNMS(KP250000000, total, x0i);
            d14 = t1r - t4r;
            d23 = t2r - t3r;
            rot_a = FMA(KP618033988, d23, d14);
            rot_b = FNMS(KP618033988, d14, d23);
            ii[0] = total + x0i;
            lo = FNMS(KP559016994, gap, centre);
            ii[WS(rs, 2)] = FMA(KP951056516, rot_b, lo);
            ii[WS(rs, 3)] = FNMS(KP951056516, rot_b, lo);
            hi = FMA(KP559016994, gap, centre);
            ii[WS(rs, 1)] = FNMS(KP951056516, rot_a, hi);
            ii[WS(rs, 4)] = FMA(KP951056516, rot_a, hi);
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, { 14, 8, 26, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_5) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_5, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 40 FP additions, 28 FP multiplications,
|
||||
* (or, 26 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 29 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_5, non-FMA variant.
 *
 * For every m in [mb, me): sample k (k = 1..4) is multiplied by
 * (W[2k-2] - i*W[2k-1]); the five samples are then combined in place using
 * the four constants declared below.
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 8, eight values per iteration
 *   rs       stride between samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP587785252, +0.587785252292473129168705954639072768597652438);
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);

    W += mb * 8;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 8, MAKE_VOLATILE_STRIDE(10, rs)) {
        E x0r, x0i;
        E d14r, d14i, d23r, d23i;       /* pair differences 1-4 and 2-3 */
        E s14r, s14i, s23r, s23i;       /* pair sums 1+4 and 2+3 */
        E total_r, total_i;

        x0r = ri[0];
        x0i = ii[0];

        {
            E x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
            E w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
            E t1r, t1i, t2r, t2i, t3r, t3i, t4r, t4i;   /* twiddled samples */

            /* All loads happen before any store: the transform is in place. */
            x1r = ri[WS(rs, 1)];
            x1i = ii[WS(rs, 1)];
            x2r = ri[WS(rs, 2)];
            x2i = ii[WS(rs, 2)];
            x3r = ri[WS(rs, 3)];
            x3i = ii[WS(rs, 3)];
            x4r = ri[WS(rs, 4)];
            x4i = ii[WS(rs, 4)];
            w1r = W[0];
            w1i = W[1];
            w2r = W[2];
            w2i = W[3];
            w3r = W[4];
            w3i = W[5];
            w4r = W[6];
            w4i = W[7];

            t1r = FMA(w1r, x1r, w1i * x1i);
            t1i = FNMS(w1i, x1r, w1r * x1i);
            t3r = FMA(w3r, x3r, w3i * x3i);
            t3i = FNMS(w3i, x3r, w3r * x3i);
            t4r = FMA(w4r, x4r, w4i * x4i);
            t4i = FNMS(w4i, x4r, w4r * x4i);
            t2r = FMA(w2r, x2r, w2i * x2i);
            t2i = FNMS(w2i, x2r, w2r * x2i);

            d14i = t1i - t4i;
            d23i = t2i - t3i;
            d23r = t2r - t3r;
            d14r = t1r - t4r;
            s14i = t1i + t4i;
            s23i = t2i + t3i;
            total_i = s14i + s23i;
            s14r = t1r + t4r;
            s23r = t2r + t3r;
            total_r = s14r + s23r;
        }

        ri[0] = x0r + total_r;
        ii[0] = total_i + x0i;

        /* remaining real outputs */
        {
            E rot_a, rot_b, gap, centre, hi, lo;

            rot_a = FMA(KP951056516, d14i, KP587785252 * d23i);
            rot_b = FNMS(KP587785252, d14i, KP951056516 * d23i);
            gap = KP559016994 * (s14r - s23r);
            centre = FNMS(KP250000000, total_r, x0r);
            hi = gap + centre;
            lo = centre - gap;
            ri[WS(rs, 4)] = hi - rot_a;
            ri[WS(rs, 3)] = lo + rot_b;
            ri[WS(rs, 1)] = hi + rot_a;
            ri[WS(rs, 2)] = lo - rot_b;
        }

        /* remaining imaginary outputs */
        {
            E rot_a, rot_b, gap, centre, hi, lo;

            rot_a = FMA(KP951056516, d14r, KP587785252 * d23r);
            rot_b = FNMS(KP587785252, d14r, KP951056516 * d23r);
            gap = KP559016994 * (s14i - s23i);
            centre = FNMS(KP250000000, total_i, x0i);
            hi = gap + centre;
            lo = centre - gap;
            ii[WS(rs, 1)] = hi - rot_a;
            ii[WS(rs, 3)] = lo - rot_b;
            ii[WS(rs, 4)] = rot_a + hi;
            ii[WS(rs, 2)] = rot_b + lo;
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, { 26, 14, 14, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_5) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_5, &desc);
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,295 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 32 FP multiplications,
|
||||
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
|
||||
* 31 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_6, FMA-oriented variant.
 *
 * For every m in [mb, me): sample k (k = 1..5) is multiplied by
 * (W[2k-2] - i*W[2k-1]); the six samples are then combined in place using
 * the constants 0.5 and 0.866025...
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 10, ten values per iteration
 *   rs       stride between samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);

    W += mb * 10;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 10, MAKE_VOLATILE_STRIDE(12, rs)) {
        E x0r, x0i, t3r, t3i;
        E d25r, d25i, s25r, s25i;   /* difference / sum of twiddled samples 2 and 5 */
        E d41r, d41i, s41r, s41i;   /* difference / sum of twiddled samples 4 and 1 */

        x0r = ri[0];
        x0i = ii[0];

        /* sample 3 */
        {
            E x3r, x3i, w3r, w3i, p3r, p3i;

            x3r = ri[WS(rs, 3)];
            x3i = ii[WS(rs, 3)];
            w3r = W[4];
            w3i = W[5];
            p3r = w3r * x3r;
            p3i = w3r * x3i;
            t3r = FMA(w3i, x3i, p3r);
            t3i = FNMS(w3i, x3r, p3i);
        }

        /* samples 2 and 5 */
        {
            E x2r, x2i, x5r, x5i;
            E w2r, w2i, w5r, w5i;
            E p2r, p2i, p5r, p5i;
            E t2r, t2i, t5r, t5i;

            x2r = ri[WS(rs, 2)];
            x2i = ii[WS(rs, 2)];
            x5r = ri[WS(rs, 5)];
            x5i = ii[WS(rs, 5)];
            w2r = W[2];
            w2i = W[3];
            w5r = W[8];
            w5i = W[9];

            p2r = w2r * x2r;
            p2i = w2r * x2i;
            p5r = w5r * x5r;
            p5i = w5r * x5i;

            t2r = FMA(w2i, x2i, p2r);
            t2i = FNMS(w2i, x2r, p2i);
            t5r = FMA(w5i, x5i, p5r);
            t5i = FNMS(w5i, x5r, p5i);

            d25r = t2r - t5r;
            s25i = t2i + t5i;
            s25r = t2r + t5r;
            d25i = t2i - t5i;
        }

        /* samples 4 and 1 */
        {
            E x4r, x4i, x1r, x1i;
            E w4r, w4i, w1r, w1i;
            E p4r, p4i, p1r, p1i;
            E t4r, t4i, t1r, t1i;

            x4r = ri[WS(rs, 4)];
            x4i = ii[WS(rs, 4)];
            x1r = ri[WS(rs, 1)];
            x1i = ii[WS(rs, 1)];
            w4r = W[6];
            w4i = W[7];
            w1r = W[0];
            w1i = W[1];

            p4r = w4r * x4r;
            p4i = w4r * x4i;
            p1r = w1r * x1r;
            p1i = w1r * x1i;

            t4r = FMA(w4i, x4i, p4r);
            t4i = FNMS(w4i, x4r, p4i);
            t1r = FMA(w1i, x1i, p1r);
            t1i = FNMS(w1i, x1r, p1i);

            d41r = t4r - t1r;
            s41i = t4i + t1i;
            s41r = t4r + t1r;
            d41i = t4i - t1i;
        }

        /* real outputs 3, 1, 5 */
        {
            E rot, base, pair, centre;

            rot = d25i - d41i;
            base = x0r - t3r;
            pair = d25r + d41r;
            centre = FNMS(KP500000000, pair, base);
            ri[WS(rs, 3)] = base + pair;
            ri[WS(rs, 1)] = FMA(KP866025403, rot, centre);
            ri[WS(rs, 5)] = FNMS(KP866025403, rot, centre);
        }

        /* imaginary outputs 1, 3, 5 */
        {
            E rot, base, pair, centre;

            rot = d41r - d25r;
            base = x0i - t3i;
            pair = d25i + d41i;
            centre = FNMS(KP500000000, pair, base);
            ii[WS(rs, 1)] = FMA(KP866025403, rot, centre);
            ii[WS(rs, 3)] = pair + base;
            ii[WS(rs, 5)] = FNMS(KP866025403, rot, centre);
        }

        /* real outputs 0, 4, 2 */
        {
            E rot, base, pair, centre;

            rot = s25i - s41i;
            base = x0r + t3r;
            pair = s25r + s41r;
            centre = FNMS(KP500000000, pair, base);
            ri[0] = base + pair;
            ri[WS(rs, 4)] = FMA(KP866025403, rot, centre);
            ri[WS(rs, 2)] = FNMS(KP866025403, rot, centre);
        }

        /* imaginary outputs 0, 4, 2 */
        {
            E rot, base, pair, centre;

            rot = s41r - s25r;
            pair = s25i + s41i;
            base = t3i + x0i;
            centre = FNMS(KP500000000, pair, base);
            ii[0] = pair + base;
            ii[WS(rs, 4)] = FMA(KP866025403, rot, centre);
            ii[WS(rs, 2)] = FNMS(KP866025403, rot, centre);
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, { 24, 10, 22, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_6) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_6, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 28 FP multiplications,
|
||||
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 23 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_6, non-FMA variant.
 *
 * For every m in [mb, me): sample k (k = 1..5) is multiplied by
 * (W[2k-2] - i*W[2k-1]); the six samples are then combined in place using
 * the constants 0.5 and 0.866025...
 *
 *   ri, ii   real / imaginary arrays, advanced by ms after each iteration
 *   W        twiddle table; starts at offset mb * 10, ten values per iteration
 *   rs       stride between samples (applied through WS)
 *   mb, me   half-open iteration range
 */
static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);

    W += mb * 10;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 10, MAKE_VOLATILE_STRIDE(12, rs)) {
        E d03r, d03i, s03r, s03i;   /* difference / sum of sample 0 and twiddled sample 3 */
        E d41r, d41i, s41r, s41i;   /* difference / sum of twiddled samples 4 and 1 */
        E d25r, d25i, s25r, s25i;   /* difference / sum of twiddled samples 2 and 5 */

        /* samples 0 and 3 */
        {
            E x0r, x0i, x3r, x3i, w3r, w3i, t3r, t3i;

            x0r = ri[0];
            x0i = ii[0];
            x3r = ri[WS(rs, 3)];
            x3i = ii[WS(rs, 3)];
            w3r = W[4];
            w3i = W[5];
            t3r = FMA(w3r, x3r, w3i * x3i);
            t3i = FNMS(w3i, x3r, w3r * x3i);

            d03r = x0r - t3r;
            d03i = x0i - t3i;
            s03r = x0r + t3r;
            s03i = t3i + x0i;
        }

        /* samples 4 and 1 */
        {
            E x4r, x4i, x1r, x1i;
            E w4r, w4i, w1r, w1i;
            E t4r, t4i, t1r, t1i;

            x4r = ri[WS(rs, 4)];
            x4i = ii[WS(rs, 4)];
            x1r = ri[WS(rs, 1)];
            x1i = ii[WS(rs, 1)];
            w4r = W[6];
            w4i = W[7];
            w1r = W[0];
            w1i = W[1];

            t4r = FMA(w4r, x4r, w4i * x4i);
            t4i = FNMS(w4i, x4r, w4r * x4i);
            t1r = FMA(w1r, x1r, w1i * x1i);
            t1i = FNMS(w1i, x1r, w1r * x1i);

            d41r = t4r - t1r;
            s41i = t4i + t1i;
            s41r = t4r + t1r;
            d41i = t4i - t1i;
        }

        /* samples 2 and 5 */
        {
            E x2r, x2i, x5r, x5i;
            E w2r, w2i, w5r, w5i;
            E t2r, t2i, t5r, t5i;

            x2r = ri[WS(rs, 2)];
            x2i = ii[WS(rs, 2)];
            x5r = ri[WS(rs, 5)];
            x5i = ii[WS(rs, 5)];
            w2r = W[2];
            w2i = W[3];
            w5r = W[8];
            w5i = W[9];

            t2r = FMA(w2r, x2r, w2i * x2i);
            t2i = FNMS(w2i, x2r, w2r * x2i);
            t5r = FMA(w5r, x5r, w5i * x5i);
            t5i = FNMS(w5i, x5r, w5r * x5i);

            d25r = t2r - t5r;
            s25i = t2i + t5i;
            s25r = t2r + t5r;
            d25i = t2i - t5i;
        }

        /* outputs 3, 1, 5 */
        {
            E rot_r, pair_r, centre_r, rot_i, pair_i, centre_i;

            rot_r = KP866025403 * (d25i - d41i);
            pair_r = d25r + d41r;
            centre_r = FNMS(KP500000000, pair_r, d03r);
            ri[WS(rs, 3)] = d03r + pair_r;
            ri[WS(rs, 1)] = centre_r + rot_r;
            ri[WS(rs, 5)] = centre_r - rot_r;

            rot_i = KP866025403 * (d41r - d25r);
            pair_i = d25i + d41i;
            centre_i = FNMS(KP500000000, pair_i, d03i);
            ii[WS(rs, 1)] = rot_i + centre_i;
            ii[WS(rs, 3)] = pair_i + d03i;
            ii[WS(rs, 5)] = centre_i - rot_i;
        }

        /* outputs 0, 4, 2 */
        {
            E rot_r, pair_r, centre_r, rot_i, pair_i, centre_i;

            rot_r = KP866025403 * (s25i - s41i);
            pair_r = s25r + s41r;
            centre_r = FNMS(KP500000000, pair_r, s03r);
            ri[0] = s03r + pair_r;
            ri[WS(rs, 4)] = centre_r + rot_r;
            ri[WS(rs, 2)] = centre_r - rot_r;

            rot_i = KP866025403 * (s41r - s25r);
            pair_i = s25i + s41i;
            centre_i = FNMS(KP500000000, pair_i, s03i);
            ii[0] = pair_i + s03i;
            ii[WS(rs, 4)] = rot_i + centre_i;
            ii[WS(rs, 2)] = centre_i - rot_i;
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to kdft_dit_register for t1_6: carries the value 6
 * (presumably the radix), the codelet name, the twiddle table and &GENUS.
 * NOTE(review): { 32, 14, 14, 0 } presumably holds operation counts
 * (adds, muls, fmas, other) -- confirm against the ct_desc definition. */
static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, { 32, 14, 14, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration hook: hands the t1_6 kernel and its descriptor to
 * kdft_dit_register for planner p. X() is a name-wrapping macro defined elsewhere. */
void X(codelet_t1_6) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_6, &desc);
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,354 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 66 FP multiplications,
|
||||
* (or, 18 additions, 12 multiplications, 54 fused multiply/add),
|
||||
* 37 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_7 (FMA variant): size-7 twiddle kernel working in place on the split
 * real/imaginary arrays ri and ii.
 *
 * For every m in [mb, me) it loads the seven points ri/ii[WS(rs, k)], k = 0..6,
 * combines points 1..6 with the twiddle pairs W[2k-2], W[2k-1] (point 0 is used
 * as loaded), and stores the seven results back into the same locations.
 * After each pass ri and ii advance by ms and W advances by 12.
 *
 * NOTE(review): FMA, FNMS, DK, WS and MAKE_VOLATILE_STRIDE are macros from the
 * included headers; FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c)
 * presumably c - a*b -- confirm there.
 */
static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants declared through DK and used by the output stages below. */
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
|
||||
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
|
||||
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
|
||||
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by mb * 12; each pass then steps ri/ii by ms and W by 12. */
for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
|
||||
E T1, T1c, Te, T1h, TR, T19, Tr, T1g, TM, T1a, TE, T1i, TW, T1b;
|
||||
/* Point 0 is loaded without any twiddle factor. */
T1 = ri[0];
|
||||
T1c = ii[0];
|
||||
/* Points 1 and 6: apply W[0..1] and W[10..11], then form sums and differences. */
{
|
||||
E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8;
|
||||
T3 = ri[WS(rs, 1)];
|
||||
T6 = ii[WS(rs, 1)];
|
||||
T2 = W[0];
|
||||
T4 = T2 * T3;
|
||||
TN = T2 * T6;
|
||||
T9 = ri[WS(rs, 6)];
|
||||
Tc = ii[WS(rs, 6)];
|
||||
T8 = W[10];
|
||||
Ta = T8 * T9;
|
||||
TP = T8 * Tc;
|
||||
{
|
||||
E T7, TO, Td, TQ, T5, Tb;
|
||||
T5 = W[1];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TO = FNMS(T5, T3, TN);
|
||||
Tb = W[11];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
TQ = FNMS(Tb, T9, TP);
|
||||
Te = T7 + Td;
|
||||
T1h = Td - T7;
|
||||
TR = TO - TQ;
|
||||
T19 = TO + TQ;
|
||||
}
|
||||
}
|
||||
/* Points 2 and 5: apply W[2..3] and W[8..9], then form sums and differences. */
{
|
||||
E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl;
|
||||
Tg = ri[WS(rs, 2)];
|
||||
Tj = ii[WS(rs, 2)];
|
||||
Tf = W[2];
|
||||
Th = Tf * Tg;
|
||||
TI = Tf * Tj;
|
||||
Tm = ri[WS(rs, 5)];
|
||||
Tp = ii[WS(rs, 5)];
|
||||
Tl = W[8];
|
||||
Tn = Tl * Tm;
|
||||
TK = Tl * Tp;
|
||||
{
|
||||
E Tk, TJ, Tq, TL, Ti, To;
|
||||
Ti = W[3];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TJ = FNMS(Ti, Tg, TI);
|
||||
To = W[9];
|
||||
Tq = FMA(To, Tp, Tn);
|
||||
TL = FNMS(To, Tm, TK);
|
||||
Tr = Tk + Tq;
|
||||
T1g = Tq - Tk;
|
||||
TM = TJ - TL;
|
||||
T1a = TJ + TL;
|
||||
}
|
||||
}
|
||||
/* Points 3 and 4: apply W[4..5] and W[6..7], then form sums and differences. */
{
|
||||
E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty;
|
||||
Tt = ri[WS(rs, 3)];
|
||||
Tw = ii[WS(rs, 3)];
|
||||
Ts = W[4];
|
||||
Tu = Ts * Tt;
|
||||
TS = Ts * Tw;
|
||||
Tz = ri[WS(rs, 4)];
|
||||
TC = ii[WS(rs, 4)];
|
||||
Ty = W[6];
|
||||
TA = Ty * Tz;
|
||||
TU = Ty * TC;
|
||||
{
|
||||
E Tx, TT, TD, TV, Tv, TB;
|
||||
Tv = W[5];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
TT = FNMS(Tv, Tt, TS);
|
||||
TB = W[7];
|
||||
TD = FMA(TB, TC, TA);
|
||||
TV = FNMS(TB, Tz, TU);
|
||||
TE = Tx + TD;
|
||||
T1i = TD - Tx;
|
||||
TW = TT - TV;
|
||||
T1b = TT + TV;
|
||||
}
|
||||
}
|
||||
/* Output 0: point 0 plus the three pair sums, for the real and imaginary parts. */
ri[0] = T1 + Te + Tr + TE;
|
||||
ii[0] = T19 + T1a + T1b + T1c;
|
||||
/* Real parts of outputs 6 and 1. */
{
|
||||
E TG, TY, TF, TX, TH;
|
||||
TF = FNMS(KP356895867, Tr, Te);
|
||||
TG = FNMS(KP692021471, TF, TE);
|
||||
TX = FMA(KP554958132, TW, TR);
|
||||
TY = FMA(KP801937735, TX, TM);
|
||||
TH = FNMS(KP900968867, TG, T1);
|
||||
ri[WS(rs, 6)] = FNMS(KP974927912, TY, TH);
|
||||
ri[WS(rs, 1)] = FMA(KP974927912, TY, TH);
|
||||
}
|
||||
/* Imaginary parts of outputs 1 and 6. */
{
|
||||
E T1e, T1k, T1d, T1j, T1f;
|
||||
T1d = FNMS(KP356895867, T1a, T19);
|
||||
T1e = FNMS(KP692021471, T1d, T1b);
|
||||
T1j = FMA(KP554958132, T1i, T1h);
|
||||
T1k = FMA(KP801937735, T1j, T1g);
|
||||
T1f = FNMS(KP900968867, T1e, T1c);
|
||||
ii[WS(rs, 1)] = FMA(KP974927912, T1k, T1f);
|
||||
ii[WS(rs, 6)] = FNMS(KP974927912, T1k, T1f);
|
||||
}
|
||||
/* Real parts of outputs 5 and 2. */
{
|
||||
E T10, T13, TZ, T12, T11;
|
||||
TZ = FNMS(KP356895867, Te, TE);
|
||||
T10 = FNMS(KP692021471, TZ, Tr);
|
||||
T12 = FMA(KP554958132, TM, TW);
|
||||
T13 = FNMS(KP801937735, T12, TR);
|
||||
T11 = FNMS(KP900968867, T10, T1);
|
||||
ri[WS(rs, 5)] = FNMS(KP974927912, T13, T11);
|
||||
ri[WS(rs, 2)] = FMA(KP974927912, T13, T11);
|
||||
}
|
||||
/* Imaginary parts of outputs 2 and 5. */
{
|
||||
E T1m, T1p, T1l, T1o, T1n;
|
||||
T1l = FNMS(KP356895867, T19, T1b);
|
||||
T1m = FNMS(KP692021471, T1l, T1a);
|
||||
T1o = FMA(KP554958132, T1g, T1i);
|
||||
T1p = FNMS(KP801937735, T1o, T1h);
|
||||
T1n = FNMS(KP900968867, T1m, T1c);
|
||||
ii[WS(rs, 2)] = FMA(KP974927912, T1p, T1n);
|
||||
ii[WS(rs, 5)] = FNMS(KP974927912, T1p, T1n);
|
||||
}
|
||||
/* Real parts of outputs 4 and 3. */
{
|
||||
E T15, T18, T14, T17, T16;
|
||||
T14 = FNMS(KP356895867, TE, Tr);
|
||||
T15 = FNMS(KP692021471, T14, Te);
|
||||
T17 = FNMS(KP554958132, TR, TM);
|
||||
T18 = FNMS(KP801937735, T17, TW);
|
||||
T16 = FNMS(KP900968867, T15, T1);
|
||||
ri[WS(rs, 4)] = FNMS(KP974927912, T18, T16);
|
||||
ri[WS(rs, 3)] = FMA(KP974927912, T18, T16);
|
||||
}
|
||||
/* Imaginary parts of outputs 3 and 4. */
{
|
||||
E T1r, T1u, T1q, T1t, T1s;
|
||||
T1q = FNMS(KP356895867, T1b, T1a);
|
||||
T1r = FNMS(KP692021471, T1q, T19);
|
||||
T1t = FNMS(KP554958132, T1h, T1g);
|
||||
T1u = FNMS(KP801937735, T1t, T1i);
|
||||
T1s = FNMS(KP900968867, T1r, T1c);
|
||||
ii[WS(rs, 3)] = FMA(KP974927912, T1u, T1s);
|
||||
ii[WS(rs, 4)] = FNMS(KP974927912, T1u, T1s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the t1_7 descriptor (desc) below.
 * The kernel above reads W[0..11] per pass, i.e. six twiddle pairs.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of the two numeric fields
 * is defined outside this file -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to kdft_dit_register for the FMA t1_7 kernel: the value 7
 * (presumably the radix), the codelet name, the twiddle table and &GENUS.
 * The first three entries of { 18, 12, 54, 0 } match the additions /
 * multiplications / fused multiply-add counts in the generator comment above;
 * NOTE(review): the remaining fields are not explained in this file -- confirm against ct_desc. */
static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, { 18, 12, 54, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration hook: hands the t1_7 kernel and its descriptor to
 * kdft_dit_register for planner p. X() is a name-wrapping macro defined elsewhere. */
void X(codelet_t1_7) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_7, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 60 FP multiplications,
|
||||
* (or, 36 additions, 24 multiplications, 36 fused multiply/add),
|
||||
* 29 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_7 (non-FMA variant, compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): size-7 twiddle kernel working in
 * place on the split real/imaginary arrays ri and ii.
 *
 * For every m in [mb, me) it loads the seven points ri/ii[WS(rs, k)], k = 0..6,
 * combines points 1..6 with the twiddle pairs W[2k-2], W[2k-1] (point 0 is used
 * as loaded), and stores the seven results back into the same locations.
 * After each pass ri and ii advance by ms and W advances by 12.
 *
 * NOTE(review): FMA, FNMS, FNMA, DK, WS and MAKE_VOLATILE_STRIDE are macros
 * from the included headers; their exact expansion is not visible here.
 */
static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically these are sines/cosines of multiples of pi/7
 * (e.g. 0.623489801 ~ cos(2*pi/7), 0.781831482 ~ sin(2*pi/7)). */
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
|
||||
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
|
||||
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by mb * 12; each pass then steps ri/ii by ms and W by 12. */
for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
|
||||
E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ;
|
||||
/* Point 0 is loaded without any twiddle factor. */
T1 = ri[0];
|
||||
TR = ii[0];
|
||||
/* Points 1 and 6: apply W[0..1] and W[10..11], then form sums and differences. */
{
|
||||
E T6, TA, Tb, TB;
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 1)];
|
||||
T5 = ii[WS(rs, 1)];
|
||||
T2 = W[0];
|
||||
T4 = W[1];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
TA = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = ri[WS(rs, 6)];
|
||||
Ta = ii[WS(rs, 6)];
|
||||
T7 = W[10];
|
||||
T9 = W[11];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
TB = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
Tc = T6 + Tb;
|
||||
TS = Tb - T6;
|
||||
TC = TA - TB;
|
||||
TO = TA + TB;
|
||||
}
|
||||
/* Points 2 and 5: apply W[2..3] and W[8..9], then form sums and differences. */
{
|
||||
E Th, TG, Tm, TH;
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = ri[WS(rs, 2)];
|
||||
Tg = ii[WS(rs, 2)];
|
||||
Td = W[2];
|
||||
Tf = W[3];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
TG = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
{
|
||||
E Tj, Tl, Ti, Tk;
|
||||
Tj = ri[WS(rs, 5)];
|
||||
Tl = ii[WS(rs, 5)];
|
||||
Ti = W[8];
|
||||
Tk = W[9];
|
||||
Tm = FMA(Ti, Tj, Tk * Tl);
|
||||
TH = FNMS(Tk, Tj, Ti * Tl);
|
||||
}
|
||||
Tn = Th + Tm;
|
||||
TT = Tm - Th;
|
||||
TI = TG - TH;
|
||||
TP = TG + TH;
|
||||
}
|
||||
/* Points 3 and 4: apply W[4..5] and W[6..7], then form sums and differences. */
{
|
||||
E Ts, TD, Tx, TE;
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 3)];
|
||||
Tr = ii[WS(rs, 3)];
|
||||
To = W[4];
|
||||
Tq = W[5];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
TD = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Tu, Tw, Tt, Tv;
|
||||
Tu = ri[WS(rs, 4)];
|
||||
Tw = ii[WS(rs, 4)];
|
||||
Tt = W[6];
|
||||
Tv = W[7];
|
||||
Tx = FMA(Tt, Tu, Tv * Tw);
|
||||
TE = FNMS(Tv, Tu, Tt * Tw);
|
||||
}
|
||||
Ty = Ts + Tx;
|
||||
TU = Tx - Ts;
|
||||
TF = TD - TE;
|
||||
TQ = TD + TE;
|
||||
}
|
||||
/* Output 0: point 0 plus the three pair sums, for the real and imaginary parts. */
ri[0] = T1 + Tc + Tn + Ty;
|
||||
ii[0] = TO + TP + TQ + TR;
|
||||
/* Outputs 2 and 5 (real and imaginary parts). */
{
|
||||
E TJ, Tz, TX, TY;
|
||||
TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
|
||||
Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
|
||||
ri[WS(rs, 5)] = Tz - TJ;
|
||||
ri[WS(rs, 2)] = Tz + TJ;
|
||||
TX = FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT);
|
||||
TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO);
|
||||
ii[WS(rs, 2)] = TX + TY;
|
||||
ii[WS(rs, 5)] = TY - TX;
|
||||
}
|
||||
/* Outputs 1 and 6 (real and imaginary parts). */
{
|
||||
E TL, TK, TV, TW;
|
||||
TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
|
||||
TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
|
||||
ri[WS(rs, 6)] = TK - TL;
|
||||
ri[WS(rs, 1)] = TK + TL;
|
||||
TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU);
|
||||
TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP);
|
||||
ii[WS(rs, 1)] = TV + TW;
|
||||
ii[WS(rs, 6)] = TW - TV;
|
||||
}
|
||||
/* Outputs 3 and 4 (real and imaginary parts). */
{
|
||||
E TN, TM, TZ, T10;
|
||||
TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
|
||||
TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
|
||||
ri[WS(rs, 4)] = TM - TN;
|
||||
ri[WS(rs, 3)] = TM + TN;
|
||||
TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT);
|
||||
T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO);
|
||||
ii[WS(rs, 3)] = TZ + T10;
|
||||
ii[WS(rs, 4)] = T10 - TZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the t1_7 descriptor (desc) below.
 * The kernel above reads W[0..11] per pass, i.e. six twiddle pairs.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of the two numeric fields
 * is defined outside this file -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to kdft_dit_register for the non-FMA t1_7 kernel: the value 7
 * (presumably the radix), the codelet name, the twiddle table and &GENUS.
 * The first three entries of { 36, 24, 36, 0 } match the additions /
 * multiplications / fused multiply-add counts in the generator comment above;
 * NOTE(review): the remaining fields are not explained in this file -- confirm against ct_desc. */
static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, { 36, 24, 36, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration hook: hands the t1_7 kernel and its descriptor to
 * kdft_dit_register for planner p. X() is a name-wrapping macro defined elsewhere. */
void X(codelet_t1_7) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_7, &desc);
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,376 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 36 FP multiplications,
|
||||
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
|
||||
* 34 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_8 (FMA variant): size-8 twiddle kernel working in place on the split
 * real/imaginary arrays ri and ii.
 *
 * For every m in [mb, me) it loads the eight points ri/ii[WS(rs, k)], k = 0..7,
 * combines points 1..7 with the twiddle pairs W[2k-2], W[2k-1] (point 0 is used
 * as loaded), and stores the eight results back into the same locations.
 * After each pass ri and ii advance by ms and W advances by 14.
 *
 * NOTE(review): FMA, FNMS, DK, WS and MAKE_VOLATILE_STRIDE are macros from the
 * included headers; FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c)
 * presumably c - a*b -- confirm there.
 */
static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* 0.707106781... is numerically 1/sqrt(2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by mb * 14; each pass then steps ri/ii by ms and W by 14. */
for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
|
||||
E TX, Ty, TZ, TV, T10;
|
||||
/* Point 0 is loaded without any twiddle factor. */
T1 = ri[0];
|
||||
T1m = ii[0];
|
||||
/* Point 4 combined with W[6..7]. */
{
|
||||
E T3, T6, T4, T1k, T2, T5;
|
||||
T3 = ri[WS(rs, 4)];
|
||||
T6 = ii[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = T2 * T3;
|
||||
T1k = T2 * T6;
|
||||
T5 = W[7];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1l = FNMS(T5, T3, T1k);
|
||||
}
|
||||
/* Point 6 combined with W[10..11]. */
{
|
||||
E Tg, Tj, Th, TR, Tf, Ti;
|
||||
Tg = ri[WS(rs, 6)];
|
||||
Tj = ii[WS(rs, 6)];
|
||||
Tf = W[10];
|
||||
Th = Tf * Tg;
|
||||
TR = Tf * Tj;
|
||||
Ti = W[11];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TS = FNMS(Ti, Tg, TR);
|
||||
}
|
||||
/* Point 2 combined with W[2..3]. */
{
|
||||
E Ta, Td, Tb, TP, T9, Tc;
|
||||
Ta = ri[WS(rs, 2)];
|
||||
Td = ii[WS(rs, 2)];
|
||||
T9 = W[2];
|
||||
Tb = T9 * Ta;
|
||||
TP = T9 * Td;
|
||||
Tc = W[3];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
TQ = FNMS(Tc, Ta, TP);
|
||||
}
|
||||
/* Points 7 and 3 combined with W[12..13] and W[4..5]; their differences go to T12 / T17. */
{
|
||||
E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
|
||||
TB = ri[WS(rs, 7)];
|
||||
TE = ii[WS(rs, 7)];
|
||||
TA = W[12];
|
||||
TC = TA * TB;
|
||||
T13 = TA * TE;
|
||||
TH = ri[WS(rs, 3)];
|
||||
TK = ii[WS(rs, 3)];
|
||||
TG = W[4];
|
||||
TI = TG * TH;
|
||||
T15 = TG * TK;
|
||||
TD = W[13];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T14 = FNMS(TD, TB, T13);
|
||||
TJ = W[5];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T16 = FNMS(TJ, TH, T15);
|
||||
T12 = TF - TL;
|
||||
T17 = T14 - T16;
|
||||
}
|
||||
/* Points 1 and 5 combined with W[0..1] and W[8..9]; their differences go to TV / T10. */
{
|
||||
E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
|
||||
To = ri[WS(rs, 1)];
|
||||
Tr = ii[WS(rs, 1)];
|
||||
Tn = W[0];
|
||||
Tp = Tn * To;
|
||||
TW = Tn * Tr;
|
||||
Tu = ri[WS(rs, 5)];
|
||||
Tx = ii[WS(rs, 5)];
|
||||
Tt = W[8];
|
||||
Tv = Tt * Tu;
|
||||
TY = Tt * Tx;
|
||||
Tq = W[1];
|
||||
Ts = FMA(Tq, Tr, Tp);
|
||||
TX = FNMS(Tq, To, TW);
|
||||
Tw = W[9];
|
||||
Ty = FMA(Tw, Tx, Tv);
|
||||
TZ = FNMS(Tw, Tu, TY);
|
||||
TV = Ts - Ty;
|
||||
T10 = TX - TZ;
|
||||
}
|
||||
/* Odd-indexed outputs 1, 3, 5, 7: built from the pair differences and KP707106781. */
{
|
||||
E TU, T1a, T1t, T1v, T19, T1w, T1d, T1u;
|
||||
{
|
||||
E TO, TT, T1r, T1s;
|
||||
TO = T1 - T7;
|
||||
TT = TQ - TS;
|
||||
TU = TO + TT;
|
||||
T1a = TO - TT;
|
||||
T1r = T1m - T1l;
|
||||
T1s = Te - Tk;
|
||||
T1t = T1r - T1s;
|
||||
T1v = T1s + T1r;
|
||||
}
|
||||
{
|
||||
E T11, T18, T1b, T1c;
|
||||
T11 = TV + T10;
|
||||
T18 = T12 - T17;
|
||||
T19 = T11 + T18;
|
||||
T1w = T18 - T11;
|
||||
T1b = T10 - TV;
|
||||
T1c = T12 + T17;
|
||||
T1d = T1b - T1c;
|
||||
T1u = T1b + T1c;
|
||||
}
|
||||
ri[WS(rs, 5)] = FNMS(KP707106781, T19, TU);
|
||||
ii[WS(rs, 5)] = FNMS(KP707106781, T1u, T1t);
|
||||
ri[WS(rs, 1)] = FMA(KP707106781, T19, TU);
|
||||
ii[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
|
||||
ri[WS(rs, 7)] = FNMS(KP707106781, T1d, T1a);
|
||||
ii[WS(rs, 7)] = FNMS(KP707106781, T1w, T1v);
|
||||
ri[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
|
||||
ii[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
|
||||
}
|
||||
/* Even-indexed outputs 0, 2, 4, 6: built from the pair sums using additions and subtractions only. */
{
|
||||
E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
|
||||
{
|
||||
E T8, Tl, T1j, T1n;
|
||||
T8 = T1 + T7;
|
||||
Tl = Te + Tk;
|
||||
Tm = T8 + Tl;
|
||||
T1e = T8 - Tl;
|
||||
T1j = TQ + TS;
|
||||
T1n = T1l + T1m;
|
||||
T1o = T1j + T1n;
|
||||
T1q = T1n - T1j;
|
||||
}
|
||||
{
|
||||
E Tz, TM, T1f, T1g;
|
||||
Tz = Ts + Ty;
|
||||
TM = TF + TL;
|
||||
TN = Tz + TM;
|
||||
T1p = TM - Tz;
|
||||
T1f = TX + TZ;
|
||||
T1g = T14 + T16;
|
||||
T1h = T1f - T1g;
|
||||
T1i = T1f + T1g;
|
||||
}
|
||||
ri[WS(rs, 4)] = Tm - TN;
|
||||
ii[WS(rs, 4)] = T1o - T1i;
|
||||
ri[0] = Tm + TN;
|
||||
ii[0] = T1i + T1o;
|
||||
ri[WS(rs, 6)] = T1e - T1h;
|
||||
ii[WS(rs, 6)] = T1q - T1p;
|
||||
ri[WS(rs, 2)] = T1e + T1h;
|
||||
ii[WS(rs, 2)] = T1p + T1q;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the t1_8 descriptor (desc) below.
 * The kernel above reads W[0..13] per pass, i.e. seven twiddle pairs.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of the two numeric fields
 * is defined outside this file -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to kdft_dit_register for the FMA t1_8 kernel: the value 8
 * (presumably the radix), the codelet name, the twiddle table and &GENUS.
 * The first three entries of { 44, 14, 22, 0 } match the additions /
 * multiplications / fused multiply-add counts in the generator comment above;
 * NOTE(review): the remaining fields are not explained in this file -- confirm against ct_desc. */
static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, { 44, 14, 22, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration hook: hands the t1_8 kernel and its descriptor to
 * kdft_dit_register for planner p. X() is a name-wrapping macro defined elsewhere. */
void X(codelet_t1_8) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_8, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 32 FP multiplications,
|
||||
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 28 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_8 (non-FMA variant, compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): size-8 twiddle kernel working in
 * place on the split real/imaginary arrays ri and ii.
 *
 * For every m in [mb, me) it loads the eight points ri/ii[WS(rs, k)], k = 0..7,
 * combines points 1..7 with the twiddle pairs W[2k-2], W[2k-1] (point 0 is used
 * as loaded), and stores the eight results back into the same locations.
 * After each pass ri and ii advance by ms and W advances by 14.
 *
 * NOTE(review): FMA, FNMS, DK, WS and MAKE_VOLATILE_STRIDE are macros from the
 * included headers; their exact expansion is not visible here.
 */
static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* 0.707106781... is numerically 1/sqrt(2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by mb * 14; each pass then steps ri/ii by ms and W by 14. */
for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
|
||||
E TP;
|
||||
/* Points 0 and 4 (point 4 uses W[6..7]): sums and differences. */
{
|
||||
E T1, T18, T6, T17;
|
||||
T1 = ri[0];
|
||||
T18 = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 4)];
|
||||
T5 = ii[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = W[7];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T17 = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 + T6;
|
||||
T1e = T18 - T17;
|
||||
TH = T1 - T6;
|
||||
T19 = T17 + T18;
|
||||
}
|
||||
/* Points 7 and 3 (W[12..13] and W[4..5]): sums and differences. */
{
|
||||
E Tz, TS, TE, TT;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = ri[WS(rs, 7)];
|
||||
Ty = ii[WS(rs, 7)];
|
||||
Tv = W[12];
|
||||
Tx = W[13];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
TS = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = ri[WS(rs, 3)];
|
||||
TD = ii[WS(rs, 3)];
|
||||
TA = W[4];
|
||||
TC = W[5];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
TT = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
TF = Tz + TE;
|
||||
T13 = TS + TT;
|
||||
TR = Tz - TE;
|
||||
TU = TS - TT;
|
||||
}
|
||||
/* Points 2 and 6 (W[2..3] and W[10..11]): sums and differences. */
{
|
||||
E Tc, TI, Th, TJ;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = ri[WS(rs, 2)];
|
||||
Tb = ii[WS(rs, 2)];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
TI = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = ri[WS(rs, 6)];
|
||||
Tg = ii[WS(rs, 6)];
|
||||
Td = W[10];
|
||||
Tf = W[11];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
TJ = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
Ti = Tc + Th;
|
||||
T1f = Tc - Th;
|
||||
TK = TI - TJ;
|
||||
T16 = TI + TJ;
|
||||
}
|
||||
/* Points 1 and 5 (W[0..1] and W[8..9]): sums and differences. */
{
|
||||
E To, TN, Tt, TO;
|
||||
{
|
||||
E Tl, Tn, Tk, Tm;
|
||||
Tl = ri[WS(rs, 1)];
|
||||
Tn = ii[WS(rs, 1)];
|
||||
Tk = W[0];
|
||||
Tm = W[1];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
TN = FNMS(Tm, Tl, Tk * Tn);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tp, Tr;
|
||||
Tq = ri[WS(rs, 5)];
|
||||
Ts = ii[WS(rs, 5)];
|
||||
Tp = W[8];
|
||||
Tr = W[9];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
TO = FNMS(Tr, Tq, Tp * Ts);
|
||||
}
|
||||
Tu = To + Tt;
|
||||
T12 = TN + TO;
|
||||
TM = To - Tt;
|
||||
TP = TN - TO;
|
||||
}
|
||||
/* Output stage: even-indexed outputs 0, 2, 4, 6 first, then the odd-indexed
 * outputs, which additionally use KP707106781. */
{
|
||||
E Tj, TG, T1b, T1c;
|
||||
Tj = T7 + Ti;
|
||||
TG = Tu + TF;
|
||||
ri[WS(rs, 4)] = Tj - TG;
|
||||
ri[0] = Tj + TG;
|
||||
{
|
||||
E T15, T1a, T11, T14;
|
||||
T15 = T12 + T13;
|
||||
T1a = T16 + T19;
|
||||
ii[0] = T15 + T1a;
|
||||
ii[WS(rs, 4)] = T1a - T15;
|
||||
T11 = T7 - Ti;
|
||||
T14 = T12 - T13;
|
||||
ri[WS(rs, 6)] = T11 - T14;
|
||||
ri[WS(rs, 2)] = T11 + T14;
|
||||
}
|
||||
T1b = TF - Tu;
|
||||
T1c = T19 - T16;
|
||||
ii[WS(rs, 2)] = T1b + T1c;
|
||||
ii[WS(rs, 6)] = T1c - T1b;
|
||||
/* Writes ri[7], ri[3] and ii[5], ii[1]. */
{
|
||||
E TX, T1g, T10, T1d, TY, TZ;
|
||||
TX = TH - TK;
|
||||
T1g = T1e - T1f;
|
||||
TY = TP - TM;
|
||||
TZ = TR + TU;
|
||||
T10 = KP707106781 * (TY - TZ);
|
||||
T1d = KP707106781 * (TY + TZ);
|
||||
ri[WS(rs, 7)] = TX - T10;
|
||||
ii[WS(rs, 5)] = T1g - T1d;
|
||||
ri[WS(rs, 3)] = TX + T10;
|
||||
ii[WS(rs, 1)] = T1d + T1g;
|
||||
}
|
||||
/* Writes ri[5], ri[1] and ii[7], ii[3]. */
{
|
||||
E TL, T1i, TW, T1h, TQ, TV;
|
||||
TL = TH + TK;
|
||||
T1i = T1f + T1e;
|
||||
TQ = TM + TP;
|
||||
TV = TR - TU;
|
||||
TW = KP707106781 * (TQ + TV);
|
||||
T1h = KP707106781 * (TV - TQ);
|
||||
ri[WS(rs, 5)] = TL - TW;
|
||||
ii[WS(rs, 7)] = T1i - T1h;
|
||||
ri[WS(rs, 1)] = TL + TW;
|
||||
ii[WS(rs, 3)] = T1h + T1i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the t1_8 descriptor (desc) below.
 * The kernel above reads W[0..13] per pass, i.e. seven twiddle pairs.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of the two numeric fields
 * is defined outside this file -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to kdft_dit_register for the non-FMA t1_8 kernel: the value 8
 * (presumably the radix), the codelet name, the twiddle table and &GENUS.
 * The first three entries of { 52, 18, 14, 0 } match the additions /
 * multiplications / fused multiply-add counts in the generator comment above;
 * NOTE(review): the remaining fields are not explained in this file -- confirm against ct_desc. */
static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, { 52, 18, 14, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration hook: hands the t1_8 kernel and its descriptor to
 * kdft_dit_register for planner p. X() is a name-wrapping macro defined elsewhere. */
void X(codelet_t1_8) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_8, &desc);
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,487 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 88 FP multiplications,
|
||||
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
|
||||
* 55 stack variables, 10 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_9 (FMA variant): size-9 twiddle kernel working in place on the split
 * real/imaginary arrays ri and ii.
 *
 * For every m in [mb, me) it loads the nine points ri/ii[WS(rs, k)], k = 0..8,
 * combines points 1..8 with the twiddle pairs W[2k-2], W[2k-1] (point 0 is used
 * as loaded), and stores the nine results back into the same locations.
 * After each pass ri and ii advance by ms and W advances by 16.
 *
 * NOTE(review): FMA, FNMS, DK, WS and MAKE_VOLATILE_STRIDE are macros from the
 * included headers; FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c)
 * presumably c - a*b -- confirm there.
 */
static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants declared through DK; 0.866025403... is numerically sqrt(3)/2. */
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
|
||||
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
|
||||
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
|
||||
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
|
||||
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
|
||||
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by mb * 16; each pass then steps ri/ii by ms and W by 16. */
for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
|
||||
E T1, T1R, Te, T1W, T10, T1Q, T1l, T1r, Ty, T1p, Tl, T1o, T1g, T1q, T1a;
|
||||
E T1d, TS, T18, TF, T13, T19, T1c;
|
||||
/* Point 0 is loaded without any twiddle factor. */
T1 = ri[0];
|
||||
T1R = ii[0];
|
||||
/* Points 3 and 6: apply W[4..5] and W[10..11], then form sums and differences. */
{
|
||||
E T3, T6, T4, TW, T9, Tc, Ta, TY, T2, T8;
|
||||
T3 = ri[WS(rs, 3)];
|
||||
T6 = ii[WS(rs, 3)];
|
||||
T2 = W[4];
|
||||
T4 = T2 * T3;
|
||||
TW = T2 * T6;
|
||||
T9 = ri[WS(rs, 6)];
|
||||
Tc = ii[WS(rs, 6)];
|
||||
T8 = W[10];
|
||||
Ta = T8 * T9;
|
||||
TY = T8 * Tc;
|
||||
{
|
||||
E T7, TX, Td, TZ, T5, Tb;
|
||||
T5 = W[5];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TX = FNMS(T5, T3, TW);
|
||||
Tb = W[11];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
TZ = FNMS(Tb, T9, TY);
|
||||
Te = T7 + Td;
|
||||
T1W = Td - T7;
|
||||
T10 = TX - TZ;
|
||||
T1Q = TX + TZ;
|
||||
}
|
||||
}
|
||||
/* Points 1, 7 and 4: apply W[0..1], W[12..13] and W[6..7], then combine the three. */
{
|
||||
E Th, Tk, Ti, T1n, Tx, T1i, Tr, T1k, Tg, Tj;
|
||||
Th = ri[WS(rs, 1)];
|
||||
Tk = ii[WS(rs, 1)];
|
||||
Tg = W[0];
|
||||
Ti = Tg * Th;
|
||||
T1n = Tg * Tk;
|
||||
{
|
||||
E Tt, Tw, Tu, T1h, Ts, Tv;
|
||||
Tt = ri[WS(rs, 7)];
|
||||
Tw = ii[WS(rs, 7)];
|
||||
Ts = W[12];
|
||||
Tu = Ts * Tt;
|
||||
T1h = Ts * Tw;
|
||||
Tv = W[13];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1i = FNMS(Tv, Tt, T1h);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1j, Tm, Tp;
|
||||
Tn = ri[WS(rs, 4)];
|
||||
Tq = ii[WS(rs, 4)];
|
||||
Tm = W[6];
|
||||
To = Tm * Tn;
|
||||
T1j = Tm * Tq;
|
||||
Tp = W[7];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1k = FNMS(Tp, Tn, T1j);
|
||||
}
|
||||
T1l = T1i - T1k;
|
||||
T1r = Tr - Tx;
|
||||
Ty = Tr + Tx;
|
||||
T1p = T1k + T1i;
|
||||
Tj = W[1];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T1o = FNMS(Tj, Th, T1n);
|
||||
T1g = FNMS(KP500000000, Ty, Tl);
|
||||
T1q = FNMS(KP500000000, T1p, T1o);
|
||||
}
|
||||
/* Points 2, 8 and 5: apply W[2..3], W[14..15] and W[8..9], then combine the three. */
{
|
||||
E TB, TE, TC, T12, TR, T17, TL, T15, TA, TD;
|
||||
TB = ri[WS(rs, 2)];
|
||||
TE = ii[WS(rs, 2)];
|
||||
TA = W[2];
|
||||
TC = TA * TB;
|
||||
T12 = TA * TE;
|
||||
{
|
||||
E TN, TQ, TO, T16, TM, TP;
|
||||
TN = ri[WS(rs, 8)];
|
||||
TQ = ii[WS(rs, 8)];
|
||||
TM = W[14];
|
||||
TO = TM * TN;
|
||||
T16 = TM * TQ;
|
||||
TP = W[15];
|
||||
TR = FMA(TP, TQ, TO);
|
||||
T17 = FNMS(TP, TN, T16);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T14, TG, TJ;
|
||||
TH = ri[WS(rs, 5)];
|
||||
TK = ii[WS(rs, 5)];
|
||||
TG = W[8];
|
||||
TI = TG * TH;
|
||||
T14 = TG * TK;
|
||||
TJ = W[9];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T15 = FNMS(TJ, TH, T14);
|
||||
}
|
||||
T1a = TR - TL;
|
||||
T1d = T15 - T17;
|
||||
TS = TL + TR;
|
||||
T18 = T15 + T17;
|
||||
TD = W[3];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T13 = FNMS(TD, TB, T12);
|
||||
T19 = FNMS(KP500000000, T18, T13);
|
||||
T1c = FNMS(KP500000000, TS, TF);
|
||||
}
|
||||
/* Outputs 0, 3 and 6 (real and imaginary parts). */
{
|
||||
E Tf, T1S, TU, T1U, T1O, T1P, T1L, T1T;
|
||||
Tf = T1 + Te;
|
||||
T1S = T1Q + T1R;
|
||||
{
|
||||
E Tz, TT, T1M, T1N;
|
||||
Tz = Tl + Ty;
|
||||
TT = TF + TS;
|
||||
TU = Tz + TT;
|
||||
T1U = TT - Tz;
|
||||
T1M = T1o + T1p;
|
||||
T1N = T13 + T18;
|
||||
T1O = T1M - T1N;
|
||||
T1P = T1M + T1N;
|
||||
}
|
||||
ri[0] = Tf + TU;
|
||||
ii[0] = T1P + T1S;
|
||||
T1L = FNMS(KP500000000, TU, Tf);
|
||||
ri[WS(rs, 6)] = FNMS(KP866025403, T1O, T1L);
|
||||
ri[WS(rs, 3)] = FMA(KP866025403, T1O, T1L);
|
||||
T1T = FNMS(KP500000000, T1P, T1S);
|
||||
ii[WS(rs, 3)] = FMA(KP866025403, T1U, T1T);
|
||||
ii[WS(rs, 6)] = FNMS(KP866025403, T1U, T1T);
|
||||
}
|
||||
/* Outputs 1, 2, 4, 5, 7 and 8 (real and imaginary parts). */
{
|
||||
E T11, T1z, T1X, T21, T1f, T1w, T1t, T1x, T1u, T1Y, T1C, T1I, T1F, T1J, T1G;
|
||||
E T22, TV, T1V;
|
||||
TV = FNMS(KP500000000, Te, T1);
|
||||
T11 = FMA(KP866025403, T10, TV);
|
||||
T1z = FNMS(KP866025403, T10, TV);
|
||||
T1V = FNMS(KP500000000, T1Q, T1R);
|
||||
T1X = FMA(KP866025403, T1W, T1V);
|
||||
T21 = FNMS(KP866025403, T1W, T1V);
|
||||
{
|
||||
E T1b, T1e, T1m, T1s;
|
||||
T1b = FMA(KP866025403, T1a, T19);
|
||||
T1e = FMA(KP866025403, T1d, T1c);
|
||||
T1f = FMA(KP176326980, T1e, T1b);
|
||||
T1w = FNMS(KP176326980, T1b, T1e);
|
||||
T1m = FNMS(KP866025403, T1l, T1g);
|
||||
T1s = FNMS(KP866025403, T1r, T1q);
|
||||
T1t = FMA(KP839099631, T1s, T1m);
|
||||
T1x = FNMS(KP839099631, T1m, T1s);
|
||||
}
|
||||
T1u = FMA(KP777861913, T1t, T1f);
|
||||
T1Y = FNMS(KP777861913, T1x, T1w);
|
||||
{
|
||||
E T1A, T1B, T1D, T1E;
|
||||
T1A = FMA(KP866025403, T1r, T1q);
|
||||
T1B = FMA(KP866025403, T1l, T1g);
|
||||
T1C = FMA(KP176326980, T1B, T1A);
|
||||
T1I = FNMS(KP176326980, T1A, T1B);
|
||||
T1D = FNMS(KP866025403, T1d, T1c);
|
||||
T1E = FNMS(KP866025403, T1a, T19);
|
||||
T1F = FNMS(KP363970234, T1E, T1D);
|
||||
T1J = FMA(KP363970234, T1D, T1E);
|
||||
}
|
||||
T1G = FNMS(KP954188894, T1F, T1C);
|
||||
T22 = FMA(KP954188894, T1J, T1I);
|
||||
ri[WS(rs, 1)] = FMA(KP984807753, T1u, T11);
|
||||
ii[WS(rs, 1)] = FNMS(KP984807753, T1Y, T1X);
|
||||
ri[WS(rs, 2)] = FMA(KP984807753, T1G, T1z);
|
||||
ii[WS(rs, 2)] = FNMS(KP984807753, T22, T21);
|
||||
/* Outputs 4 and 7. */
{
|
||||
E T1v, T1y, T1Z, T20;
|
||||
T1v = FNMS(KP492403876, T1u, T11);
|
||||
T1y = FMA(KP777861913, T1x, T1w);
|
||||
ri[WS(rs, 4)] = FMA(KP852868531, T1y, T1v);
|
||||
ri[WS(rs, 7)] = FNMS(KP852868531, T1y, T1v);
|
||||
T1Z = FMA(KP492403876, T1Y, T1X);
|
||||
T20 = FNMS(KP777861913, T1t, T1f);
|
||||
ii[WS(rs, 4)] = FMA(KP852868531, T20, T1Z);
|
||||
ii[WS(rs, 7)] = FNMS(KP852868531, T20, T1Z);
|
||||
}
|
||||
/* Outputs 5 and 8. */
{
|
||||
E T1H, T1K, T23, T24;
|
||||
T1H = FNMS(KP492403876, T1G, T1z);
|
||||
T1K = FNMS(KP954188894, T1J, T1I);
|
||||
ri[WS(rs, 5)] = FNMS(KP852868531, T1K, T1H);
|
||||
ri[WS(rs, 8)] = FMA(KP852868531, T1K, T1H);
|
||||
T23 = FMA(KP492403876, T22, T21);
|
||||
T24 = FMA(KP954188894, T1F, T1C);
|
||||
ii[WS(rs, 5)] = FNMS(KP852868531, T24, T23);
|
||||
ii[WS(rs, 8)] = FMA(KP852868531, T24, T23);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the t1_9 descriptor (desc) below.
 * The kernel above reads W[0..15] per pass, i.e. eight twiddle pairs.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of the two numeric fields
 * is defined outside this file -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to kdft_dit_register for the FMA t1_9 kernel: the value 9
 * (presumably the radix), the codelet name, the twiddle table and &GENUS.
 * The first three entries of { 24, 16, 72, 0 } match the additions /
 * multiplications / fused multiply-add counts in the generator comment above;
 * NOTE(review): the remaining fields are not explained in this file -- confirm against ct_desc. */
static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, { 24, 16, 72, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration hook: hands the t1_9 kernel and its descriptor to
 * kdft_dit_register for planner p. X() is a name-wrapping macro defined elsewhere. */
void X(codelet_t1_9) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_9, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 72 FP multiplications,
|
||||
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
|
||||
* 41 stack variables, 8 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_9 (non-FMA variant): size-9 twiddle codelet, per the generator command
 * line recorded above (gen_twiddle -n 9 -name t1_9).
 *
 *   ri, ii  arrays that are read and then overwritten at offsets 0 and
 *           WS(rs, 1) .. WS(rs, 8); both pointers advance by ms per pass
 *   W       read-only multipliers; 16 values (W[0] .. W[15]) are consumed
 *           per pass, starting at W + mb * 16
 *   rs      stride argument handed to WS() to address the nine elements
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1
 *   ms      increment applied to ri and ii after each pass
 *
 * NOTE(review): comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b as defined in the project headers -- confirm.
 */
static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; the values match cos/sin of 20, 40 and 80 degrees, 1/2 and sqrt(3)/2. */
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
|
||||
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
|
||||
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
|
||||
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; ri/ii move by ms and W by 16 between passes. */
for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
|
||||
E T1, T1B, TQ, T1G, Tc, TN, T1A, T1H, TL, T1x, T17, T1o, T1c, T1n, Tu;
|
||||
E T1w, TW, T1k, T11, T1l;
|
||||
/* Elements 0, 3, 6: element 0 is used as is, 3 and 6 are scaled by W[4..5] and W[10..11]. */
{
|
||||
E T6, TO, Tb, TP;
|
||||
T1 = ri[0];
|
||||
T1B = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 3)];
|
||||
T5 = ii[WS(rs, 3)];
|
||||
T2 = W[4];
|
||||
T4 = W[5];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
TO = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = ri[WS(rs, 6)];
|
||||
Ta = ii[WS(rs, 6)];
|
||||
T7 = W[10];
|
||||
T9 = W[11];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
TP = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
TQ = KP866025403 * (TO - TP);
|
||||
T1G = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
TN = FNMS(KP500000000, Tc, T1);
|
||||
T1A = TO + TP;
|
||||
T1H = FNMS(KP500000000, T1A, T1B);
|
||||
}
|
||||
/* Elements 2, 5, 8, scaled by W[2..3], W[8..9] and W[14..15]. */
{
|
||||
E Tz, T19, TE, T14, TJ, T15, TK, T1a;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = ri[WS(rs, 2)];
|
||||
Ty = ii[WS(rs, 2)];
|
||||
Tv = W[2];
|
||||
Tx = W[3];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T19 = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = ri[WS(rs, 5)];
|
||||
TD = ii[WS(rs, 5)];
|
||||
TA = W[8];
|
||||
TC = W[9];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T14 = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
{
|
||||
E TG, TI, TF, TH;
|
||||
TG = ri[WS(rs, 8)];
|
||||
TI = ii[WS(rs, 8)];
|
||||
TF = W[14];
|
||||
TH = W[15];
|
||||
TJ = FMA(TF, TG, TH * TI);
|
||||
T15 = FNMS(TH, TG, TF * TI);
|
||||
}
|
||||
TK = TE + TJ;
|
||||
T1a = T14 + T15;
|
||||
TL = Tz + TK;
|
||||
T1x = T19 + T1a;
|
||||
{
|
||||
E T13, T16, T18, T1b;
|
||||
T13 = FNMS(KP500000000, TK, Tz);
|
||||
T16 = KP866025403 * (T14 - T15);
|
||||
T17 = T13 + T16;
|
||||
T1o = T13 - T16;
|
||||
T18 = KP866025403 * (TJ - TE);
|
||||
T1b = FNMS(KP500000000, T1a, T19);
|
||||
T1c = T18 + T1b;
|
||||
T1n = T1b - T18;
|
||||
}
|
||||
}
|
||||
/* Elements 1, 4, 7, scaled by W[0..1], W[6..7] and W[12..13]. */
{
|
||||
E Ti, TY, Tn, TT, Ts, TU, Tt, TZ;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = ri[WS(rs, 1)];
|
||||
Th = ii[WS(rs, 1)];
|
||||
Te = W[0];
|
||||
Tg = W[1];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
TY = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = ri[WS(rs, 4)];
|
||||
Tm = ii[WS(rs, 4)];
|
||||
Tj = W[6];
|
||||
Tl = W[7];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
TT = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 7)];
|
||||
Tr = ii[WS(rs, 7)];
|
||||
To = W[12];
|
||||
Tq = W[13];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
TU = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
Tt = Tn + Ts;
|
||||
TZ = TT + TU;
|
||||
Tu = Ti + Tt;
|
||||
T1w = TY + TZ;
|
||||
{
|
||||
E TS, TV, TX, T10;
|
||||
TS = FNMS(KP500000000, Tt, Ti);
|
||||
TV = KP866025403 * (TT - TU);
|
||||
TW = TS + TV;
|
||||
T1k = TS - TV;
|
||||
TX = KP866025403 * (Ts - Tn);
|
||||
T10 = FNMS(KP500000000, TZ, TY);
|
||||
T11 = TX + T10;
|
||||
T1l = T10 - TX;
|
||||
}
|
||||
}
|
||||
/* Store ri at offsets 0, 3, 6. */
{
|
||||
E T1y, Td, TM, T1v;
|
||||
T1y = KP866025403 * (T1w - T1x);
|
||||
Td = T1 + Tc;
|
||||
TM = Tu + TL;
|
||||
T1v = FNMS(KP500000000, TM, Td);
|
||||
ri[0] = Td + TM;
|
||||
ri[WS(rs, 3)] = T1v + T1y;
|
||||
ri[WS(rs, 6)] = T1v - T1y;
|
||||
}
|
||||
/* Store ii at offsets 0, 3, 6. */
{
|
||||
E T1D, T1z, T1C, T1E;
|
||||
T1D = KP866025403 * (TL - Tu);
|
||||
T1z = T1w + T1x;
|
||||
T1C = T1A + T1B;
|
||||
T1E = FNMS(KP500000000, T1z, T1C);
|
||||
ii[0] = T1z + T1C;
|
||||
ii[WS(rs, 6)] = T1E - T1D;
|
||||
ii[WS(rs, 3)] = T1D + T1E;
|
||||
}
|
||||
/* Store ri/ii at offsets 1, 4, 7. */
{
|
||||
E TR, T1I, T1e, T1J, T1i, T1F, T1f, T1K;
|
||||
TR = TN + TQ;
|
||||
T1I = T1G + T1H;
|
||||
{
|
||||
E T12, T1d, T1g, T1h;
|
||||
T12 = FMA(KP766044443, TW, KP642787609 * T11);
|
||||
T1d = FMA(KP173648177, T17, KP984807753 * T1c);
|
||||
T1e = T12 + T1d;
|
||||
T1J = KP866025403 * (T1d - T12);
|
||||
T1g = FNMS(KP642787609, TW, KP766044443 * T11);
|
||||
T1h = FNMS(KP984807753, T17, KP173648177 * T1c);
|
||||
T1i = KP866025403 * (T1g - T1h);
|
||||
T1F = T1g + T1h;
|
||||
}
|
||||
ri[WS(rs, 1)] = TR + T1e;
|
||||
ii[WS(rs, 1)] = T1F + T1I;
|
||||
T1f = FNMS(KP500000000, T1e, TR);
|
||||
ri[WS(rs, 7)] = T1f - T1i;
|
||||
ri[WS(rs, 4)] = T1f + T1i;
|
||||
T1K = FNMS(KP500000000, T1F, T1I);
|
||||
ii[WS(rs, 4)] = T1J + T1K;
|
||||
ii[WS(rs, 7)] = T1K - T1J;
|
||||
}
|
||||
/* Store ri/ii at offsets 2, 5, 8. */
{
|
||||
E T1j, T1M, T1q, T1N, T1u, T1L, T1r, T1O;
|
||||
T1j = TN - TQ;
|
||||
T1M = T1H - T1G;
|
||||
{
|
||||
E T1m, T1p, T1s, T1t;
|
||||
T1m = FMA(KP173648177, T1k, KP984807753 * T1l);
|
||||
T1p = FNMS(KP939692620, T1o, KP342020143 * T1n);
|
||||
T1q = T1m + T1p;
|
||||
T1N = KP866025403 * (T1p - T1m);
|
||||
T1s = FNMS(KP984807753, T1k, KP173648177 * T1l);
|
||||
T1t = FMA(KP342020143, T1o, KP939692620 * T1n);
|
||||
T1u = KP866025403 * (T1s + T1t);
|
||||
T1L = T1s - T1t;
|
||||
}
|
||||
ri[WS(rs, 2)] = T1j + T1q;
|
||||
ii[WS(rs, 2)] = T1L + T1M;
|
||||
T1r = FNMS(KP500000000, T1q, T1j);
|
||||
ri[WS(rs, 8)] = T1r - T1u;
|
||||
ri[WS(rs, 5)] = T1r + T1u;
|
||||
T1O = FNMS(KP500000000, T1L, T1M);
|
||||
ii[WS(rs, 5)] = T1N + T1O;
|
||||
ii[WS(rs, 8)] = T1O - T1N;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the non-FMA t1_9.  The inner { 60, 36, 36, 0 } repeats the
 * counts from the generated header comment (60 additions, 36
 * multiplications, 36 fused multiply/add).
 */
static const ct_desc desc = {
     9,
     "t1_9",
     twinstr,
     &GENUS,
     { 60, 36, 36, 0 },
     0, 0, 0
};
|
||||
|
||||
/*
 * Registers the non-FMA t1_9 codelet, together with its descriptor, with
 * planner `p` by calling kdft_dit_register.
 */
void X(codelet_t1_9)(planner *p)
{
     const ct_desc *d = &desc;

     X(kdft_dit_register)(p, t1_9, d);
}
|
||||
#endif
|
||||
@@ -0,0 +1,509 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:37 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 114 FP additions, 94 FP multiplications,
|
||||
* (or, 48 additions, 28 multiplications, 66 fused multiply/add),
|
||||
* 63 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_10 (FMA variant): size-10 twiddle codelet, per the generator command
 * line recorded above (gen_twiddle -fma -twiddle-log3 -precompute-twiddles
 * -n 10 -name t2_10).
 *
 *   ri, ii  arrays that are read and then overwritten at offsets 0 and
 *           WS(rs, 1) .. WS(rs, 9); both pointers advance by ms per pass
 *   W       read-only multipliers; only 6 values (W[0] .. W[5]) are loaded
 *           per pass, starting at W + mb * 6; the multipliers for the other
 *           elements are built from products of those six
 *   rs      stride argument handed to WS() to address the ten elements
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1
 *   ms      increment applied to ri and ii after each pass
 *
 * NOTE(review): comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b as defined in the project headers -- confirm.
 */
static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; the values match sin(72 deg), sqrt(5)/4, (sqrt(5)-1)/2 and 1/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; ri/ii move by ms and W by 6 between passes. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T2, T3, T8, Tc, T5, T6, Tl, T7, TB, TF, T12, TY, To, Ts, Tw;
|
||||
E Tb, Td, Th;
|
||||
/* Load W[0..5] and derive the multiplier pairs used later: (T7,Tb), (Tl,To), (Td,Th), (Ts,Tw), (TB,TF), (TY,T12). */
{
|
||||
E TA, TX, TE, T11, Ta, T4;
|
||||
T2 = W[0];
|
||||
T3 = W[2];
|
||||
T4 = T2 * T3;
|
||||
T8 = W[4];
|
||||
TA = T2 * T8;
|
||||
TX = T3 * T8;
|
||||
Tc = W[5];
|
||||
TE = T2 * Tc;
|
||||
T11 = T3 * Tc;
|
||||
T5 = W[1];
|
||||
T6 = W[3];
|
||||
Ta = T2 * T6;
|
||||
Tl = FMA(T5, T6, T4);
|
||||
T7 = FNMS(T5, T6, T4);
|
||||
TB = FMA(T5, Tc, TA);
|
||||
TF = FNMS(T5, T8, TE);
|
||||
T12 = FNMS(T6, T8, T11);
|
||||
TY = FMA(T6, Tc, TX);
|
||||
{
|
||||
E Tr, Tv, T9, Tg;
|
||||
Tr = Tl * T8;
|
||||
Tv = Tl * Tc;
|
||||
To = FNMS(T5, T3, Ta);
|
||||
Ts = FMA(To, Tc, Tr);
|
||||
Tw = FNMS(To, T8, Tv);
|
||||
T9 = T7 * T8;
|
||||
Tg = T7 * Tc;
|
||||
Tb = FMA(T5, T3, Ta);
|
||||
Td = FMA(Tb, Tc, T9);
|
||||
Th = FNMS(Tb, T8, Tg);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tk, T1c, T24, T2d, TW, T19, T1a, T1P, T1Q, T1Z, T1g, T1h, T1i, T1C, T1H;
|
||||
E T2f, Tz, TM, TN, T1S, T1T, T1Y, T1d, T1e, T1f, T1r, T1w, T2e;
|
||||
/* Elements 0 and 5 (element 5 scaled by Td/Th): sum and difference. */
{
|
||||
E T1, T23, Te, Tf, Ti, T21, Tj, T22;
|
||||
T1 = ri[0];
|
||||
T23 = ii[0];
|
||||
Te = ri[WS(rs, 5)];
|
||||
Tf = Td * Te;
|
||||
Ti = ii[WS(rs, 5)];
|
||||
T21 = Td * Ti;
|
||||
Tj = FMA(Th, Ti, Tf);
|
||||
Tk = T1 - Tj;
|
||||
T1c = T1 + Tj;
|
||||
T22 = FNMS(Th, Te, T21);
|
||||
T24 = T22 + T23;
|
||||
T2d = T23 - T22;
|
||||
}
|
||||
/* Elements 4, 1, 9, 6 scaled by (T7,Tb), (T2,T5), (T8,Tc), (TY,T12). */
{
|
||||
E TR, T1z, T18, T1G, TV, T1B, T14, T1E;
|
||||
{
|
||||
E TO, TP, TQ, T1y;
|
||||
TO = ri[WS(rs, 4)];
|
||||
TP = T7 * TO;
|
||||
TQ = ii[WS(rs, 4)];
|
||||
T1y = T7 * TQ;
|
||||
TR = FMA(Tb, TQ, TP);
|
||||
T1z = FNMS(Tb, TO, T1y);
|
||||
}
|
||||
{
|
||||
E T15, T16, T17, T1F;
|
||||
T15 = ri[WS(rs, 1)];
|
||||
T16 = T2 * T15;
|
||||
T17 = ii[WS(rs, 1)];
|
||||
T1F = T2 * T17;
|
||||
T18 = FMA(T5, T17, T16);
|
||||
T1G = FNMS(T5, T15, T1F);
|
||||
}
|
||||
{
|
||||
E TS, TT, TU, T1A;
|
||||
TS = ri[WS(rs, 9)];
|
||||
TT = T8 * TS;
|
||||
TU = ii[WS(rs, 9)];
|
||||
T1A = T8 * TU;
|
||||
TV = FMA(Tc, TU, TT);
|
||||
T1B = FNMS(Tc, TS, T1A);
|
||||
}
|
||||
{
|
||||
E TZ, T10, T13, T1D;
|
||||
TZ = ri[WS(rs, 6)];
|
||||
T10 = TY * TZ;
|
||||
T13 = ii[WS(rs, 6)];
|
||||
T1D = TY * T13;
|
||||
T14 = FMA(T12, T13, T10);
|
||||
T1E = FNMS(T12, TZ, T1D);
|
||||
}
|
||||
TW = TR - TV;
|
||||
T19 = T14 - T18;
|
||||
T1a = TW + T19;
|
||||
T1P = T1z + T1B;
|
||||
T1Q = T1E + T1G;
|
||||
T1Z = T1P + T1Q;
|
||||
T1g = TR + TV;
|
||||
T1h = T14 + T18;
|
||||
T1i = T1g + T1h;
|
||||
T1C = T1z - T1B;
|
||||
T1H = T1E - T1G;
|
||||
T2f = T1C + T1H;
|
||||
}
|
||||
/* Elements 2, 3, 7, 8 scaled by (Tl,To), (T3,T6), (Ts,Tw), (TB,TF). */
{
|
||||
E Tq, T1o, TL, T1v, Ty, T1q, TH, T1t;
|
||||
{
|
||||
E Tm, Tn, Tp, T1n;
|
||||
Tm = ri[WS(rs, 2)];
|
||||
Tn = Tl * Tm;
|
||||
Tp = ii[WS(rs, 2)];
|
||||
T1n = Tl * Tp;
|
||||
Tq = FMA(To, Tp, Tn);
|
||||
T1o = FNMS(To, Tm, T1n);
|
||||
}
|
||||
{
|
||||
E TI, TJ, TK, T1u;
|
||||
TI = ri[WS(rs, 3)];
|
||||
TJ = T3 * TI;
|
||||
TK = ii[WS(rs, 3)];
|
||||
T1u = T3 * TK;
|
||||
TL = FMA(T6, TK, TJ);
|
||||
T1v = FNMS(T6, TI, T1u);
|
||||
}
|
||||
{
|
||||
E Tt, Tu, Tx, T1p;
|
||||
Tt = ri[WS(rs, 7)];
|
||||
Tu = Ts * Tt;
|
||||
Tx = ii[WS(rs, 7)];
|
||||
T1p = Ts * Tx;
|
||||
Ty = FMA(Tw, Tx, Tu);
|
||||
T1q = FNMS(Tw, Tt, T1p);
|
||||
}
|
||||
{
|
||||
E TC, TD, TG, T1s;
|
||||
TC = ri[WS(rs, 8)];
|
||||
TD = TB * TC;
|
||||
TG = ii[WS(rs, 8)];
|
||||
T1s = TB * TG;
|
||||
TH = FMA(TF, TG, TD);
|
||||
T1t = FNMS(TF, TC, T1s);
|
||||
}
|
||||
Tz = Tq - Ty;
|
||||
TM = TH - TL;
|
||||
TN = Tz + TM;
|
||||
T1S = T1o + T1q;
|
||||
T1T = T1t + T1v;
|
||||
T1Y = T1S + T1T;
|
||||
T1d = Tq + Ty;
|
||||
T1e = TH + TL;
|
||||
T1f = T1d + T1e;
|
||||
T1r = T1o - T1q;
|
||||
T1w = T1t - T1v;
|
||||
T2e = T1r + T1w;
|
||||
}
|
||||
/* Store ri at offsets 5, 7, 3, 9, 1. */
{
|
||||
E T1l, T1b, T1k, T1J, T1L, T1x, T1I, T1K, T1m;
|
||||
T1l = TN - T1a;
|
||||
T1b = TN + T1a;
|
||||
T1k = FNMS(KP250000000, T1b, Tk);
|
||||
T1x = T1r - T1w;
|
||||
T1I = T1C - T1H;
|
||||
T1J = FMA(KP618033988, T1I, T1x);
|
||||
T1L = FNMS(KP618033988, T1x, T1I);
|
||||
ri[WS(rs, 5)] = Tk + T1b;
|
||||
T1K = FNMS(KP559016994, T1l, T1k);
|
||||
ri[WS(rs, 7)] = FNMS(KP951056516, T1L, T1K);
|
||||
ri[WS(rs, 3)] = FMA(KP951056516, T1L, T1K);
|
||||
T1m = FMA(KP559016994, T1l, T1k);
|
||||
ri[WS(rs, 9)] = FNMS(KP951056516, T1J, T1m);
|
||||
ri[WS(rs, 1)] = FMA(KP951056516, T1J, T1m);
|
||||
}
|
||||
/* Store ii at offsets 5, 3, 7, 1, 9. */
{
|
||||
E T2i, T2g, T2h, T2m, T2o, T2k, T2l, T2n, T2j;
|
||||
T2i = T2e - T2f;
|
||||
T2g = T2e + T2f;
|
||||
T2h = FNMS(KP250000000, T2g, T2d);
|
||||
T2k = Tz - TM;
|
||||
T2l = TW - T19;
|
||||
T2m = FMA(KP618033988, T2l, T2k);
|
||||
T2o = FNMS(KP618033988, T2k, T2l);
|
||||
ii[WS(rs, 5)] = T2g + T2d;
|
||||
T2n = FNMS(KP559016994, T2i, T2h);
|
||||
ii[WS(rs, 3)] = FNMS(KP951056516, T2o, T2n);
|
||||
ii[WS(rs, 7)] = FMA(KP951056516, T2o, T2n);
|
||||
T2j = FMA(KP559016994, T2i, T2h);
|
||||
ii[WS(rs, 1)] = FNMS(KP951056516, T2m, T2j);
|
||||
ii[WS(rs, 9)] = FMA(KP951056516, T2m, T2j);
|
||||
}
|
||||
/* Store ri at offsets 0, 4, 6, 2, 8. */
{
|
||||
E T1N, T1j, T1M, T1V, T1X, T1R, T1U, T1W, T1O;
|
||||
T1N = T1f - T1i;
|
||||
T1j = T1f + T1i;
|
||||
T1M = FNMS(KP250000000, T1j, T1c);
|
||||
T1R = T1P - T1Q;
|
||||
T1U = T1S - T1T;
|
||||
T1V = FNMS(KP618033988, T1U, T1R);
|
||||
T1X = FMA(KP618033988, T1R, T1U);
|
||||
ri[0] = T1c + T1j;
|
||||
T1W = FMA(KP559016994, T1N, T1M);
|
||||
ri[WS(rs, 4)] = FNMS(KP951056516, T1X, T1W);
|
||||
ri[WS(rs, 6)] = FMA(KP951056516, T1X, T1W);
|
||||
T1O = FNMS(KP559016994, T1N, T1M);
|
||||
ri[WS(rs, 2)] = FNMS(KP951056516, T1V, T1O);
|
||||
ri[WS(rs, 8)] = FMA(KP951056516, T1V, T1O);
|
||||
}
|
||||
/* Store ii at offsets 0, 4, 6, 2, 8. */
{
|
||||
E T26, T20, T25, T2a, T2c, T28, T29, T2b, T27;
|
||||
T26 = T1Y - T1Z;
|
||||
T20 = T1Y + T1Z;
|
||||
T25 = FNMS(KP250000000, T20, T24);
|
||||
T28 = T1g - T1h;
|
||||
T29 = T1d - T1e;
|
||||
T2a = FNMS(KP618033988, T29, T28);
|
||||
T2c = FMA(KP618033988, T28, T29);
|
||||
ii[0] = T20 + T24;
|
||||
T2b = FMA(KP559016994, T26, T25);
|
||||
ii[WS(rs, 4)] = FMA(KP951056516, T2c, T2b);
|
||||
ii[WS(rs, 6)] = FNMS(KP951056516, T2c, T2b);
|
||||
T27 = FNMS(KP559016994, T26, T25);
|
||||
ii[WS(rs, 2)] = FMA(KP951056516, T2a, T27);
|
||||
ii[WS(rs, 8)] = FNMS(KP951056516, T2a, T27);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the FMA t2_10.  The inner { 48, 28, 66, 0 } repeats the
 * counts from the generated header comment (48 additions, 28
 * multiplications, 66 fused multiply/add).
 */
static const ct_desc desc = {
     10,
     "t2_10",
     twinstr,
     &GENUS,
     { 48, 28, 66, 0 },
     0, 0, 0
};
|
||||
|
||||
/*
 * Registers the FMA t2_10 codelet, together with its descriptor, with
 * planner `p` by calling kdft_dit_register.
 */
void X(codelet_t2_10)(planner *p)
{
     const ct_desc *d = &desc;

     X(kdft_dit_register)(p, t2_10, d);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 114 FP additions, 80 FP multiplications,
|
||||
* (or, 76 additions, 42 multiplications, 38 fused multiply/add),
|
||||
* 63 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_10 (non-FMA variant): size-10 twiddle codelet, per the generator
 * command line recorded above (gen_twiddle -twiddle-log3
 * -precompute-twiddles -n 10 -name t2_10).
 *
 *   ri, ii  arrays that are read and then overwritten at offsets 0 and
 *           WS(rs, 1) .. WS(rs, 9); both pointers advance by ms per pass
 *   W       read-only multipliers; only 6 values (W[0] .. W[5]) are loaded
 *           per pass, starting at W + mb * 6; the multipliers for the other
 *           elements are built from products of those six
 *   rs      stride argument handed to WS() to address the ten elements
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1
 *   ms      increment applied to ri and ii after each pass
 *
 * NOTE(review): comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b as defined in the project headers -- confirm.
 */
static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; the values match sin(36 deg), sin(72 deg), 1/4 and sqrt(5)/4. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; ri/ii move by ms and W by 6 between passes. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T2, T5, T3, T6, T8, Tm, Tc, Tk, T9, Td, Te, TM, TO, Tg, Tp;
|
||||
E Tv, Tx, Tr;
|
||||
/* Load W[0..5] and derive the multiplier pairs used later: (T8,Tc), (Tk,Tm), (Te,Tg), (TM,TO), (Tp,Tr), (Tv,Tx). */
{
|
||||
E T4, Tb, T7, Ta;
|
||||
T2 = W[0];
|
||||
T5 = W[1];
|
||||
T3 = W[2];
|
||||
T6 = W[3];
|
||||
T4 = T2 * T3;
|
||||
Tb = T5 * T3;
|
||||
T7 = T5 * T6;
|
||||
Ta = T2 * T6;
|
||||
T8 = T4 - T7;
|
||||
Tm = Ta - Tb;
|
||||
Tc = Ta + Tb;
|
||||
Tk = T4 + T7;
|
||||
T9 = W[4];
|
||||
Td = W[5];
|
||||
Te = FMA(T8, T9, Tc * Td);
|
||||
TM = FMA(T3, T9, T6 * Td);
|
||||
TO = FNMS(T6, T9, T3 * Td);
|
||||
Tg = FNMS(Tc, T9, T8 * Td);
|
||||
Tp = FMA(Tk, T9, Tm * Td);
|
||||
Tv = FMA(T2, T9, T5 * Td);
|
||||
Tx = FNMS(T5, T9, T2 * Td);
|
||||
Tr = FNMS(Tm, T9, Tk * Td);
|
||||
}
|
||||
{
|
||||
E Tj, T1S, TX, T1G, TL, TU, TV, T1s, T1t, T1C, T11, T12, T13, T1h, T1k;
|
||||
E T1Q, Tu, TD, TE, T1v, T1w, T1B, TY, TZ, T10, T1a, T1d, T1P;
|
||||
/* Elements 0 and 5 (element 5 scaled by Te/Tg): sum and difference. */
{
|
||||
E T1, T1F, Ti, T1E, Tf, Th;
|
||||
T1 = ri[0];
|
||||
T1F = ii[0];
|
||||
Tf = ri[WS(rs, 5)];
|
||||
Th = ii[WS(rs, 5)];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
T1E = FNMS(Tg, Tf, Te * Th);
|
||||
Tj = T1 - Ti;
|
||||
T1S = T1F - T1E;
|
||||
TX = T1 + Ti;
|
||||
T1G = T1E + T1F;
|
||||
}
|
||||
/* Elements 4, 1, 9, 6 scaled by (T8,Tc), (T2,T5), (T9,Td), (TM,TO). */
{
|
||||
E TH, T1f, TT, T1j, TK, T1g, TQ, T1i;
|
||||
{
|
||||
E TF, TG, TR, TS;
|
||||
TF = ri[WS(rs, 4)];
|
||||
TG = ii[WS(rs, 4)];
|
||||
TH = FMA(T8, TF, Tc * TG);
|
||||
T1f = FNMS(Tc, TF, T8 * TG);
|
||||
TR = ri[WS(rs, 1)];
|
||||
TS = ii[WS(rs, 1)];
|
||||
TT = FMA(T2, TR, T5 * TS);
|
||||
T1j = FNMS(T5, TR, T2 * TS);
|
||||
}
|
||||
{
|
||||
E TI, TJ, TN, TP;
|
||||
TI = ri[WS(rs, 9)];
|
||||
TJ = ii[WS(rs, 9)];
|
||||
TK = FMA(T9, TI, Td * TJ);
|
||||
T1g = FNMS(Td, TI, T9 * TJ);
|
||||
TN = ri[WS(rs, 6)];
|
||||
TP = ii[WS(rs, 6)];
|
||||
TQ = FMA(TM, TN, TO * TP);
|
||||
T1i = FNMS(TO, TN, TM * TP);
|
||||
}
|
||||
TL = TH - TK;
|
||||
TU = TQ - TT;
|
||||
TV = TL + TU;
|
||||
T1s = T1f + T1g;
|
||||
T1t = T1i + T1j;
|
||||
T1C = T1s + T1t;
|
||||
T11 = TH + TK;
|
||||
T12 = TQ + TT;
|
||||
T13 = T11 + T12;
|
||||
T1h = T1f - T1g;
|
||||
T1k = T1i - T1j;
|
||||
T1Q = T1h + T1k;
|
||||
}
|
||||
/* Elements 2, 3, 7, 8 scaled by (Tk,Tm), (T3,T6), (Tp,Tr), (Tv,Tx). */
{
|
||||
E To, T18, TC, T1c, Tt, T19, Tz, T1b;
|
||||
{
|
||||
E Tl, Tn, TA, TB;
|
||||
Tl = ri[WS(rs, 2)];
|
||||
Tn = ii[WS(rs, 2)];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
T18 = FNMS(Tm, Tl, Tk * Tn);
|
||||
TA = ri[WS(rs, 3)];
|
||||
TB = ii[WS(rs, 3)];
|
||||
TC = FMA(T3, TA, T6 * TB);
|
||||
T1c = FNMS(T6, TA, T3 * TB);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tw, Ty;
|
||||
Tq = ri[WS(rs, 7)];
|
||||
Ts = ii[WS(rs, 7)];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
T19 = FNMS(Tr, Tq, Tp * Ts);
|
||||
Tw = ri[WS(rs, 8)];
|
||||
Ty = ii[WS(rs, 8)];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T1b = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
Tu = To - Tt;
|
||||
TD = Tz - TC;
|
||||
TE = Tu + TD;
|
||||
T1v = T18 + T19;
|
||||
T1w = T1b + T1c;
|
||||
T1B = T1v + T1w;
|
||||
TY = To + Tt;
|
||||
TZ = Tz + TC;
|
||||
T10 = TY + TZ;
|
||||
T1a = T18 - T19;
|
||||
T1d = T1b - T1c;
|
||||
T1P = T1a + T1d;
|
||||
}
|
||||
/* Store ri at offsets 5, 7, 3, 9, 1. */
{
|
||||
E T15, TW, T16, T1m, T1o, T1e, T1l, T1n, T17;
|
||||
T15 = KP559016994 * (TE - TV);
|
||||
TW = TE + TV;
|
||||
T16 = FNMS(KP250000000, TW, Tj);
|
||||
T1e = T1a - T1d;
|
||||
T1l = T1h - T1k;
|
||||
T1m = FMA(KP951056516, T1e, KP587785252 * T1l);
|
||||
T1o = FNMS(KP587785252, T1e, KP951056516 * T1l);
|
||||
ri[WS(rs, 5)] = Tj + TW;
|
||||
T1n = T16 - T15;
|
||||
ri[WS(rs, 7)] = T1n - T1o;
|
||||
ri[WS(rs, 3)] = T1n + T1o;
|
||||
T17 = T15 + T16;
|
||||
ri[WS(rs, 9)] = T17 - T1m;
|
||||
ri[WS(rs, 1)] = T17 + T1m;
|
||||
}
|
||||
/* Store ii at offsets 5, 3, 7, 1, 9. */
{
|
||||
E T1R, T1T, T1U, T1Y, T20, T1W, T1X, T1Z, T1V;
|
||||
T1R = KP559016994 * (T1P - T1Q);
|
||||
T1T = T1P + T1Q;
|
||||
T1U = FNMS(KP250000000, T1T, T1S);
|
||||
T1W = Tu - TD;
|
||||
T1X = TL - TU;
|
||||
T1Y = FMA(KP951056516, T1W, KP587785252 * T1X);
|
||||
T20 = FNMS(KP587785252, T1W, KP951056516 * T1X);
|
||||
ii[WS(rs, 5)] = T1T + T1S;
|
||||
T1Z = T1U - T1R;
|
||||
ii[WS(rs, 3)] = T1Z - T20;
|
||||
ii[WS(rs, 7)] = T20 + T1Z;
|
||||
T1V = T1R + T1U;
|
||||
ii[WS(rs, 1)] = T1V - T1Y;
|
||||
ii[WS(rs, 9)] = T1Y + T1V;
|
||||
}
|
||||
/* Store ri at offsets 0, 4, 6, 2, 8. */
{
|
||||
E T1q, T14, T1p, T1y, T1A, T1u, T1x, T1z, T1r;
|
||||
T1q = KP559016994 * (T10 - T13);
|
||||
T14 = T10 + T13;
|
||||
T1p = FNMS(KP250000000, T14, TX);
|
||||
T1u = T1s - T1t;
|
||||
T1x = T1v - T1w;
|
||||
T1y = FNMS(KP587785252, T1x, KP951056516 * T1u);
|
||||
T1A = FMA(KP951056516, T1x, KP587785252 * T1u);
|
||||
ri[0] = TX + T14;
|
||||
T1z = T1q + T1p;
|
||||
ri[WS(rs, 4)] = T1z - T1A;
|
||||
ri[WS(rs, 6)] = T1z + T1A;
|
||||
T1r = T1p - T1q;
|
||||
ri[WS(rs, 2)] = T1r - T1y;
|
||||
ri[WS(rs, 8)] = T1r + T1y;
|
||||
}
|
||||
/* Store ii at offsets 0, 4, 6, 2, 8. */
{
|
||||
E T1L, T1D, T1K, T1J, T1N, T1H, T1I, T1O, T1M;
|
||||
T1L = KP559016994 * (T1B - T1C);
|
||||
T1D = T1B + T1C;
|
||||
T1K = FNMS(KP250000000, T1D, T1G);
|
||||
T1H = T11 - T12;
|
||||
T1I = TY - TZ;
|
||||
T1J = FNMS(KP587785252, T1I, KP951056516 * T1H);
|
||||
T1N = FMA(KP951056516, T1I, KP587785252 * T1H);
|
||||
ii[0] = T1D + T1G;
|
||||
T1O = T1L + T1K;
|
||||
ii[WS(rs, 4)] = T1N + T1O;
|
||||
ii[WS(rs, 6)] = T1O - T1N;
|
||||
T1M = T1K - T1L;
|
||||
ii[WS(rs, 2)] = T1J + T1M;
|
||||
ii[WS(rs, 8)] = T1M - T1J;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the non-FMA t2_10.  The inner { 76, 42, 38, 0 } repeats
 * the counts from the generated header comment (76 additions, 42
 * multiplications, 38 fused multiply/add).
 */
static const ct_desc desc = {
     10,
     "t2_10",
     twinstr,
     &GENUS,
     { 76, 42, 38, 0 },
     0, 0, 0
};
|
||||
|
||||
/*
 * Registers the non-FMA t2_10 codelet, together with its descriptor, with
 * planner `p` by calling kdft_dit_register.
 */
void X(codelet_t2_10)(planner *p)
{
     const ct_desc *d = &desc;

     X(kdft_dit_register)(p, t2_10, d);
}
|
||||
#endif
|
||||
@@ -0,0 +1,836 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 134 FP multiplications,
|
||||
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
|
||||
* 90 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_16 (FMA variant): size-16 twiddle codelet, per the generator command
 * line recorded above (gen_twiddle -fma -twiddle-log3 -precompute-twiddles
 * -n 16 -name t2_16).
 *
 *   ri, ii  arrays that are read and then overwritten at offsets 0 and
 *           WS(rs, 1) .. WS(rs, 15); both pointers advance by ms per pass
 *   W       read-only multipliers; only 8 values (W[0] .. W[7]) are loaded
 *           per pass, starting at W + mb * 8; the multipliers for the other
 *           elements are built from products of those eight
 *   rs      stride argument handed to WS() to address the sixteen elements
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1
 *   ms      increment applied to ri and ii after each pass
 *
 * NOTE(review): comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b as defined in the project headers -- confirm.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; the values match cos(22.5 deg), sqrt(2) - 1 and sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; ri/ii move by ms and W by 8 between passes. */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
|
||||
E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
|
||||
/* Load W[0..7] and derive every other multiplier pair from their products. */
{
|
||||
E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
|
||||
T2 = W[0];
|
||||
Tf = W[2];
|
||||
Tg = T2 * Tf;
|
||||
TM = W[6];
|
||||
TN = T2 * TM;
|
||||
TO = W[7];
|
||||
TS = T2 * TO;
|
||||
T3 = W[4];
|
||||
T4 = T2 * T3;
|
||||
Tp = Tf * T3;
|
||||
T6 = W[5];
|
||||
Ta = T2 * T6;
|
||||
Tt = Tf * T6;
|
||||
T5 = W[1];
|
||||
Th = W[3];
|
||||
Tl = T2 * Th;
|
||||
Tz = FMA(T5, Th, Tg);
|
||||
Ti = FNMS(T5, Th, Tg);
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TZ = FNMS(Th, T3, Tt);
|
||||
TT = FNMS(T5, TM, TS);
|
||||
Tq = FNMS(Th, T6, Tp);
|
||||
TW = FMA(Th, T6, Tp);
|
||||
Tb = FNMS(T5, T3, Ta);
|
||||
Tu = FMA(Th, T3, Tt);
|
||||
TP = FMA(T5, TO, TN);
|
||||
TI = FMA(T5, T3, Ta);
|
||||
TF = FNMS(T5, T6, T4);
|
||||
{
|
||||
E T1y, T1C, T1e, T1i;
|
||||
T1y = Tz * T3;
|
||||
T1C = Tz * T6;
|
||||
TC = FNMS(T5, Tf, Tl);
|
||||
T1z = FMA(TC, T6, T1y);
|
||||
T1O = FMA(TC, T3, T1C);
|
||||
T1D = FNMS(TC, T3, T1C);
|
||||
T1L = FNMS(TC, T6, T1y);
|
||||
T1e = Ti * T3;
|
||||
T1i = Ti * T6;
|
||||
Tm = FMA(T5, Tf, Tl);
|
||||
T1f = FMA(Tm, T6, T1e);
|
||||
T1p = FMA(Tm, T3, T1i);
|
||||
T1j = FNMS(Tm, T3, T1i);
|
||||
T1m = FNMS(Tm, T6, T1e);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Te, T1U, T3A, T3L, T1G, T2D, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M, T1Z;
|
||||
E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
|
||||
E T2d, T38;
|
||||
/* Elements 0 and 8 (element 8 scaled by T7/Tb): sum and difference. */
{
|
||||
E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
|
||||
T1 = ri[0];
|
||||
T3z = ii[0];
|
||||
T8 = ri[WS(rs, 8)];
|
||||
T9 = T7 * T8;
|
||||
Tc = ii[WS(rs, 8)];
|
||||
T3x = T7 * Tc;
|
||||
Td = FMA(Tb, Tc, T9);
|
||||
Te = T1 + Td;
|
||||
T1U = T1 - Td;
|
||||
T3y = FNMS(Tb, T8, T3x);
|
||||
T3A = T3y + T3z;
|
||||
T3L = T3z - T3y;
|
||||
}
|
||||
/* Elements 15 and 7, scaled by (TM,TO) and (T1z,T1D). */
{
|
||||
E T1u, T1v, T1w, T2w, T1A, T1B, T1E, T2y;
|
||||
T1u = ri[WS(rs, 15)];
|
||||
T1v = TM * T1u;
|
||||
T1w = ii[WS(rs, 15)];
|
||||
T2w = TM * T1w;
|
||||
T1A = ri[WS(rs, 7)];
|
||||
T1B = T1z * T1A;
|
||||
T1E = ii[WS(rs, 7)];
|
||||
T2y = T1z * T1E;
|
||||
{
|
||||
E T1x, T1F, T2x, T2z;
|
||||
T1x = FMA(TO, T1w, T1v);
|
||||
T1F = FMA(T1D, T1E, T1B);
|
||||
T1G = T1x + T1F;
|
||||
T2D = T1x - T1F;
|
||||
T2x = FNMS(TO, T1u, T2w);
|
||||
T2z = FNMS(T1D, T1A, T2y);
|
||||
T2A = T2x - T2z;
|
||||
T3h = T2x + T2z;
|
||||
}
|
||||
}
|
||||
/* Elements 3 and 11, scaled by (Tf,Th) and (T1L,T1O). */
{
|
||||
E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
|
||||
T1H = ri[WS(rs, 3)];
|
||||
T1I = Tf * T1H;
|
||||
T1J = ii[WS(rs, 3)];
|
||||
T2E = Tf * T1J;
|
||||
T1M = ri[WS(rs, 11)];
|
||||
T1N = T1L * T1M;
|
||||
T1P = ii[WS(rs, 11)];
|
||||
T2G = T1L * T1P;
|
||||
{
|
||||
E T1K, T1Q, T2F, T2H;
|
||||
T1K = FMA(Th, T1J, T1I);
|
||||
T1Q = FMA(T1O, T1P, T1N);
|
||||
T1R = T1K + T1Q;
|
||||
T2B = T1K - T1Q;
|
||||
T2F = FNMS(Th, T1H, T2E);
|
||||
T2H = FNMS(T1O, T1M, T2G);
|
||||
T2I = T2F - T2H;
|
||||
T3i = T2F + T2H;
|
||||
}
|
||||
}
|
||||
/* Elements 4 and 12, scaled by (Ti,Tm) and (Tq,Tu). */
{
|
||||
E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
|
||||
Tj = ri[WS(rs, 4)];
|
||||
Tk = Ti * Tj;
|
||||
Tn = ii[WS(rs, 4)];
|
||||
T1V = Ti * Tn;
|
||||
Tr = ri[WS(rs, 12)];
|
||||
Ts = Tq * Tr;
|
||||
Tv = ii[WS(rs, 12)];
|
||||
T1X = Tq * Tv;
|
||||
{
|
||||
E To, Tw, T1W, T1Y;
|
||||
To = FMA(Tm, Tn, Tk);
|
||||
Tw = FMA(Tu, Tv, Ts);
|
||||
Tx = To + Tw;
|
||||
T3M = To - Tw;
|
||||
T1W = FNMS(Tm, Tj, T1V);
|
||||
T1Y = FNMS(Tu, Tr, T1X);
|
||||
T1Z = T1W - T1Y;
|
||||
T3w = T1W + T1Y;
|
||||
}
|
||||
}
|
||||
/* Elements 2 and 10, scaled by (Tz,TC) and (TF,TI). */
{
|
||||
E TA, TB, TD, T21, TG, TH, TJ, T23;
|
||||
TA = ri[WS(rs, 2)];
|
||||
TB = Tz * TA;
|
||||
TD = ii[WS(rs, 2)];
|
||||
T21 = Tz * TD;
|
||||
TG = ri[WS(rs, 10)];
|
||||
TH = TF * TG;
|
||||
TJ = ii[WS(rs, 10)];
|
||||
T23 = TF * TJ;
|
||||
{
|
||||
E TE, TK, T22, T24;
|
||||
TE = FMA(TC, TD, TB);
|
||||
TK = FMA(TI, TJ, TH);
|
||||
TL = TE + TK;
|
||||
T26 = TE - TK;
|
||||
T22 = FNMS(TC, TA, T21);
|
||||
T24 = FNMS(TI, TG, T23);
|
||||
T25 = T22 - T24;
|
||||
T37 = T22 + T24;
|
||||
}
|
||||
}
|
||||
/* Elements 1 and 9, scaled by (T2,T5) and (T3,T6). */
{
|
||||
E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
|
||||
T15 = ri[WS(rs, 1)];
|
||||
T16 = T2 * T15;
|
||||
T17 = ii[WS(rs, 1)];
|
||||
T2h = T2 * T17;
|
||||
T19 = ri[WS(rs, 9)];
|
||||
T1a = T3 * T19;
|
||||
T1b = ii[WS(rs, 9)];
|
||||
T2j = T3 * T1b;
|
||||
{
|
||||
E T18, T1c, T2i, T2k;
|
||||
T18 = FMA(T5, T17, T16);
|
||||
T1c = FMA(T6, T1b, T1a);
|
||||
T1d = T18 + T1c;
|
||||
T2o = T18 - T1c;
|
||||
T2i = FNMS(T5, T15, T2h);
|
||||
T2k = FNMS(T6, T19, T2j);
|
||||
T2l = T2i - T2k;
|
||||
T3c = T2i + T2k;
|
||||
}
|
||||
}
|
||||
/* Elements 5 and 13, scaled by (T1f,T1j) and (T1m,T1p). */
{
|
||||
E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
|
||||
T1g = ri[WS(rs, 5)];
|
||||
T1h = T1f * T1g;
|
||||
T1k = ii[WS(rs, 5)];
|
||||
T2p = T1f * T1k;
|
||||
T1n = ri[WS(rs, 13)];
|
||||
T1o = T1m * T1n;
|
||||
T1q = ii[WS(rs, 13)];
|
||||
T2r = T1m * T1q;
|
||||
{
|
||||
E T1l, T1r, T2q, T2s;
|
||||
T1l = FMA(T1j, T1k, T1h);
|
||||
T1r = FMA(T1p, T1q, T1o);
|
||||
T1s = T1l + T1r;
|
||||
T2m = T1l - T1r;
|
||||
T2q = FNMS(T1j, T1g, T2p);
|
||||
T2s = FNMS(T1p, T1n, T2r);
|
||||
T2t = T2q - T2s;
|
||||
T3d = T2q + T2s;
|
||||
}
|
||||
}
|
||||
/* Elements 14 and 6, scaled by (TP,TT) and (TW,TZ). */
{
|
||||
E TQ, TR, TU, T29, TX, TY, T10, T2b;
|
||||
TQ = ri[WS(rs, 14)];
|
||||
TR = TP * TQ;
|
||||
TU = ii[WS(rs, 14)];
|
||||
T29 = TP * TU;
|
||||
TX = ri[WS(rs, 6)];
|
||||
TY = TW * TX;
|
||||
T10 = ii[WS(rs, 6)];
|
||||
T2b = TW * T10;
|
||||
{
|
||||
E TV, T11, T2a, T2c;
|
||||
TV = FMA(TT, TU, TR);
|
||||
T11 = FMA(TZ, T10, TY);
|
||||
T12 = TV + T11;
|
||||
T28 = TV - T11;
|
||||
T2a = FNMS(TT, TQ, T29);
|
||||
T2c = FNMS(TZ, TX, T2b);
|
||||
T2d = T2a - T2c;
|
||||
T38 = T2a + T2c;
|
||||
}
|
||||
}
|
||||
/* Store ri/ii at offsets 0, 4, 8, 12. */
{
|
||||
E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
|
||||
{
|
||||
E Ty, T13, T3v, T3B;
|
||||
Ty = Te + Tx;
|
||||
T13 = TL + T12;
|
||||
T14 = Ty + T13;
|
||||
T3q = Ty - T13;
|
||||
T3v = T37 + T38;
|
||||
T3B = T3w + T3A;
|
||||
T3C = T3v + T3B;
|
||||
T3E = T3B - T3v;
|
||||
}
|
||||
{
|
||||
E T1t, T1S, T3r, T3s;
|
||||
T1t = T1d + T1s;
|
||||
T1S = T1G + T1R;
|
||||
T1T = T1t + T1S;
|
||||
T3D = T1S - T1t;
|
||||
T3r = T3c + T3d;
|
||||
T3s = T3h + T3i;
|
||||
T3t = T3r - T3s;
|
||||
T3u = T3r + T3s;
|
||||
}
|
||||
ri[WS(rs, 8)] = T14 - T1T;
|
||||
ii[WS(rs, 8)] = T3C - T3u;
|
||||
ri[0] = T14 + T1T;
|
||||
ii[0] = T3u + T3C;
|
||||
ri[WS(rs, 12)] = T3q - T3t;
|
||||
ii[WS(rs, 12)] = T3E - T3D;
|
||||
ri[WS(rs, 4)] = T3q + T3t;
|
||||
ii[WS(rs, 4)] = T3D + T3E;
|
||||
}
|
||||
/* Store ri/ii at offsets 2, 6, 10, 14. */
{
|
||||
E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
|
||||
{
|
||||
E T36, T39, T3F, T3G;
|
||||
T36 = Te - Tx;
|
||||
T39 = T37 - T38;
|
||||
T3a = T36 + T39;
|
||||
T3m = T36 - T39;
|
||||
T3F = T12 - TL;
|
||||
T3G = T3A - T3w;
|
||||
T3H = T3F + T3G;
|
||||
T3J = T3G - T3F;
|
||||
}
|
||||
{
|
||||
E T3b, T3e, T3g, T3j;
|
||||
T3b = T1d - T1s;
|
||||
T3e = T3c - T3d;
|
||||
T3f = T3b + T3e;
|
||||
T3n = T3e - T3b;
|
||||
T3g = T1G - T1R;
|
||||
T3j = T3h - T3i;
|
||||
T3k = T3g - T3j;
|
||||
T3o = T3g + T3j;
|
||||
}
|
||||
{
|
||||
E T3l, T3I, T3p, T3K;
|
||||
T3l = T3f + T3k;
|
||||
ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a);
|
||||
ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
|
||||
T3I = T3n + T3o;
|
||||
ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
|
||||
ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H);
|
||||
T3p = T3n - T3o;
|
||||
ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m);
|
||||
ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
|
||||
T3K = T3k - T3f;
|
||||
ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
|
||||
ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J);
|
||||
}
|
||||
}
|
||||
/* Store ri/ii at the odd offsets 1, 3, 5, 7, 9, 11, 13, 15. */
{
|
||||
E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
|
||||
E T2O;
|
||||
{
|
||||
E T27, T2e, T2n, T2u;
|
||||
T20 = T1U - T1Z;
|
||||
T3N = T3L - T3M;
|
||||
T3T = T3M + T3L;
|
||||
T2Q = T1U + T1Z;
|
||||
T27 = T25 - T26;
|
||||
T2e = T28 + T2d;
|
||||
T2f = T27 - T2e;
|
||||
T3O = T27 + T2e;
|
||||
{
|
||||
E T2Y, T2Z, T2R, T2S;
|
||||
T2Y = T2D + T2I;
|
||||
T2Z = T2A - T2B;
|
||||
T30 = FNMS(KP414213562, T2Z, T2Y);
|
||||
T34 = FMA(KP414213562, T2Y, T2Z);
|
||||
T2R = T26 + T25;
|
||||
T2S = T28 - T2d;
|
||||
T2T = T2R + T2S;
|
||||
T3U = T2S - T2R;
|
||||
}
|
||||
T2n = T2l + T2m;
|
||||
T2u = T2o - T2t;
|
||||
T2v = FMA(KP414213562, T2u, T2n);
|
||||
T2N = FNMS(KP414213562, T2n, T2u);
|
||||
{
|
||||
E T2V, T2W, T2C, T2J;
|
||||
T2V = T2o + T2t;
|
||||
T2W = T2l - T2m;
|
||||
T2X = FMA(KP414213562, T2W, T2V);
|
||||
T33 = FNMS(KP414213562, T2V, T2W);
|
||||
T2C = T2A + T2B;
|
||||
T2J = T2D - T2I;
|
||||
T2K = FNMS(KP414213562, T2J, T2C);
|
||||
T2O = FMA(KP414213562, T2C, T2J);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2g, T2L, T3V, T3W;
|
||||
T2g = FMA(KP707106781, T2f, T20);
|
||||
T2L = T2v - T2K;
|
||||
ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g);
|
||||
ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
|
||||
T3V = FMA(KP707106781, T3U, T3T);
|
||||
T3W = T2O - T2N;
|
||||
ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
|
||||
ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V);
|
||||
}
|
||||
{
|
||||
E T2M, T2P, T3X, T3Y;
|
||||
T2M = FNMS(KP707106781, T2f, T20);
|
||||
T2P = T2N + T2O;
|
||||
ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
|
||||
ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M);
|
||||
T3X = FNMS(KP707106781, T3U, T3T);
|
||||
T3Y = T2v + T2K;
|
||||
ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X);
|
||||
ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X);
|
||||
}
|
||||
{
|
||||
E T2U, T31, T3P, T3Q;
|
||||
T2U = FMA(KP707106781, T2T, T2Q);
|
||||
T31 = T2X + T30;
|
||||
ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U);
|
||||
ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
|
||||
T3P = FMA(KP707106781, T3O, T3N);
|
||||
T3Q = T33 + T34;
|
||||
ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
|
||||
ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P);
|
||||
}
|
||||
{
|
||||
E T32, T35, T3R, T3S;
|
||||
T32 = FNMS(KP707106781, T2T, T2Q);
|
||||
T35 = T33 - T34;
|
||||
ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32);
|
||||
ri[WS(rs, 5)] = FMA(KP923879532, T35, T32);
|
||||
T3R = FNMS(KP707106781, T3O, T3N);
|
||||
T3S = T30 - T2X;
|
||||
ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
|
||||
ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 9 },
|
||||
{ TW_CEXP, 0, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 104, 42, 92, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_16) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_16, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 108 FP multiplications,
|
||||
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
|
||||
* 82 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_16: in-place 16-point twiddle codelet (variant generated without -fma;
 * see the "Generated by" comment preceding this function).
 *
 * ri, ii   data arrays; element k of the current transform is accessed as
 *          ri[WS(rs, k)] and ii[WS(rs, k)] for k = 0..15, and every one of
 *          those 32 locations is overwritten with a result
 * W        table of multipliers; offset by mb * 8 on entry and advanced by
 *          8 values after every iteration
 * rs       stride handed to WS() to address the 16 elements
 * mb, me   iterations run for m = mb, mb + 1, ..., me - 1
 * ms       amount added to ri and ii after every iteration
 *
 * Each iteration loads four multiplier pairs from W, derives eleven further
 * pairs from their products, scales inputs 1..15 by one pair each, and then
 * combines the scaled inputs with sums, differences and the three constants
 * declared below.
 *
 * NOTE(review): the comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b; the macros are defined outside this section.
 * The order of the floating-point operations is kept exactly as generated.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP382683432, +0.382683432365089771728459984030398866761344562);
    DK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;
        for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
            E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
            E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
            /*
             * Multiplier setup.  Loaded pairs: (T2, T5) = W[0..1] for input 1,
             * (Tg, Ti) = W[2..3] for input 3, (T3, T6) = W[4..5] for input 9,
             * (TN, TO) = W[6..7] for input 15.  All other pairs are built
             * from products of these.
             */
            {
                E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
                {
                    E Th, Tn, Tj, Tm;
                    T2 = W[0];
                    T5 = W[1];
                    Tg = W[2];
                    Ti = W[3];
                    Th = T2 * Tg;
                    Tn = T5 * Tg;
                    Tj = T5 * Ti;
                    Tm = T2 * Ti;
                    Tk = Th - Tj;       /* (Tk, To): pair for input 4 */
                    To = Tm + Tn;
                    TE = Tm - Tn;       /* (TC, TE): pair for input 2 */
                    TC = Th + Tj;
                    T6 = W[5];
                    T7 = T5 * T6;
                    Tv = Tg * T6;
                    Ta = T2 * T6;
                    Ts = Ti * T6;
                    T3 = W[4];
                    T4 = T2 * T3;
                    Tw = Ti * T3;
                    Tb = T5 * T3;
                    Tr = Tg * T3;
                }
                T8 = T4 + T7;           /* (T8, Tc): pair for input 8 */
                TW = Tv - Tw;           /* (TU, TW): pair for input 6 */
                TJ = Ta + Tb;           /* (TH, TJ): pair for input 10 */
                Tt = Tr - Ts;           /* (Tt, Tx): pair for input 12 */
                TU = Tr + Ts;
                Tc = Ta - Tb;
                Tx = Tv + Tw;
                TH = T4 - T7;
                TN = W[6];
                TO = W[7];
                TP = FMA(T2, TN, T5 * TO);      /* (TP, TR): pair for input 14 */
                TR = FNMS(T5, TN, T2 * TO);
                {
                    E T1d, T1e, T19, T1a;
                    T1d = Tk * T6;
                    T1e = To * T3;
                    T1f = T1d - T1e;    /* (T1b, T1f): pair for input 5 */
                    T1k = T1d + T1e;    /* (T1i, T1k): pair for input 13 */
                    T19 = Tk * T3;
                    T1a = To * T6;
                    T1b = T19 + T1a;
                    T1i = T19 - T1a;
                }
                {
                    E T1w, T1x, T1s, T1t;
                    T1w = TC * T6;
                    T1x = TE * T3;
                    T1y = T1w - T1x;    /* (T1u, T1y): pair for input 7 */
                    T1H = T1w + T1x;    /* (T1F, T1H): pair for input 11 */
                    T1s = TC * T3;
                    T1t = TE * T6;
                    T1u = T1s + T1t;
                    T1F = T1s - T1t;
                }
            }
            {
                E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
                E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
                E T2S, T2T, T28, T2A, T2d, T2B;
                /* Inputs 0 and 8 (input 0 is used unscaled). */
                {
                    E T1, T3d, Te, T3c, T9, Td;
                    T1 = ri[0];
                    T3d = ii[0];
                    T9 = ri[WS(rs, 8)];
                    Td = ii[WS(rs, 8)];
                    Te = FMA(T8, T9, Tc * Td);
                    T3c = FNMS(Tc, T9, T8 * Td);
                    Tf = T1 + Te;
                    T3r = T3d - T3c;
                    T1N = T1 - Te;
                    T3e = T3c + T3d;
                }
                /* Inputs 4 and 12. */
                {
                    E Tq, T1O, Tz, T1P;
                    {
                        E Tl, Tp, Tu, Ty;
                        Tl = ri[WS(rs, 4)];
                        Tp = ii[WS(rs, 4)];
                        Tq = FMA(Tk, Tl, To * Tp);
                        T1O = FNMS(To, Tl, Tk * Tp);
                        Tu = ri[WS(rs, 12)];
                        Ty = ii[WS(rs, 12)];
                        Tz = FMA(Tt, Tu, Tx * Ty);
                        T1P = FNMS(Tx, Tu, Tt * Ty);
                    }
                    TA = Tq + Tz;
                    T3s = Tq - Tz;
                    T1Q = T1O - T1P;
                    T3b = T1O + T1P;
                }
                /* Inputs 2 and 10. */
                {
                    E TG, T1S, TL, T1T, T1U, T1V;
                    {
                        E TD, TF, TI, TK;
                        TD = ri[WS(rs, 2)];
                        TF = ii[WS(rs, 2)];
                        TG = FMA(TC, TD, TE * TF);
                        T1S = FNMS(TE, TD, TC * TF);
                        TI = ri[WS(rs, 10)];
                        TK = ii[WS(rs, 10)];
                        TL = FMA(TH, TI, TJ * TK);
                        T1T = FNMS(TJ, TI, TH * TK);
                    }
                    TM = TG + TL;
                    T2M = T1S + T1T;
                    T1U = T1S - T1T;
                    T1V = TG - TL;
                    T1W = T1U - T1V;
                    T2w = T1V + T1U;
                }
                /* Inputs 14 and 6. */
                {
                    E TT, T1Y, TY, T1Z, T1X, T20;
                    {
                        E TQ, TS, TV, TX;
                        TQ = ri[WS(rs, 14)];
                        TS = ii[WS(rs, 14)];
                        TT = FMA(TP, TQ, TR * TS);
                        T1Y = FNMS(TR, TQ, TP * TS);
                        TV = ri[WS(rs, 6)];
                        TX = ii[WS(rs, 6)];
                        TY = FMA(TU, TV, TW * TX);
                        T1Z = FNMS(TW, TV, TU * TX);
                    }
                    TZ = TT + TY;
                    T2N = T1Y + T1Z;
                    T1X = TT - TY;
                    T20 = T1Y - T1Z;
                    T21 = T1X + T20;
                    T2x = T1X - T20;
                }
                /* Inputs 15, 11, 7 and 3. */
                {
                    E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
                    {
                        E T1p, T1q, T1G, T1I;
                        T1p = ri[WS(rs, 15)];
                        T1q = ii[WS(rs, 15)];
                        T1r = FMA(TN, T1p, TO * T1q);
                        T2k = FNMS(TO, T1p, TN * T1q);
                        T1G = ri[WS(rs, 11)];
                        T1I = ii[WS(rs, 11)];
                        T1J = FMA(T1F, T1G, T1H * T1I);
                        T2h = FNMS(T1H, T1G, T1F * T1I);
                    }
                    {
                        E T1v, T1z, T1C, T1D;
                        T1v = ri[WS(rs, 7)];
                        T1z = ii[WS(rs, 7)];
                        T1A = FMA(T1u, T1v, T1y * T1z);
                        T2l = FNMS(T1y, T1v, T1u * T1z);
                        T1C = ri[WS(rs, 3)];
                        T1D = ii[WS(rs, 3)];
                        T1E = FMA(Tg, T1C, Ti * T1D);
                        T2g = FNMS(Ti, T1C, Tg * T1D);
                    }
                    T1B = T1r + T1A;
                    T1K = T1E + T1J;
                    T2V = T1B - T1K;
                    T2W = T2k + T2l;
                    T2X = T2g + T2h;
                    T2Y = T2W - T2X;
                    {
                        E T2f, T2i, T2m, T2n;
                        T2f = T1r - T1A;
                        T2i = T2g - T2h;
                        T2j = T2f - T2i;
                        T2D = T2f + T2i;
                        T2m = T2k - T2l;
                        T2n = T1E - T1J;
                        T2o = T2m + T2n;
                        T2E = T2m - T2n;
                    }
                }
                /* Inputs 1, 13, 9 and 5. */
                {
                    E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
                    {
                        E T12, T13, T1j, T1l;
                        T12 = ri[WS(rs, 1)];
                        T13 = ii[WS(rs, 1)];
                        T14 = FMA(T2, T12, T5 * T13);
                        T24 = FNMS(T5, T12, T2 * T13);
                        T1j = ri[WS(rs, 13)];
                        T1l = ii[WS(rs, 13)];
                        T1m = FMA(T1i, T1j, T1k * T1l);
                        T2b = FNMS(T1k, T1j, T1i * T1l);
                    }
                    {
                        E T15, T16, T1c, T1g;
                        T15 = ri[WS(rs, 9)];
                        T16 = ii[WS(rs, 9)];
                        T17 = FMA(T3, T15, T6 * T16);
                        T25 = FNMS(T6, T15, T3 * T16);
                        T1c = ri[WS(rs, 5)];
                        T1g = ii[WS(rs, 5)];
                        T1h = FMA(T1b, T1c, T1f * T1g);
                        T2a = FNMS(T1f, T1c, T1b * T1g);
                    }
                    T18 = T14 + T17;
                    T1n = T1h + T1m;
                    T2Q = T18 - T1n;
                    T2R = T24 + T25;
                    T2S = T2a + T2b;
                    T2T = T2R - T2S;
                    {
                        E T26, T27, T29, T2c;
                        T26 = T24 - T25;
                        T27 = T1h - T1m;
                        T28 = T26 + T27;
                        T2A = T26 - T27;
                        T29 = T14 - T17;
                        T2c = T2a - T2b;
                        T2d = T29 - T2c;
                        T2B = T29 + T2c;
                    }
                }
                /* Outputs 3, 7, 11 and 15. */
                {
                    E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
                    {
                        E T1R, T22, T3y, T3z;
                        T1R = T1N - T1Q;
                        T22 = KP707106781 * (T1W - T21);
                        T23 = T1R + T22;
                        T2r = T1R - T22;
                        T3y = KP707106781 * (T2x - T2w);
                        T3z = T3s + T3r;
                        T3A = T3y + T3z;
                        T3C = T3z - T3y;
                    }
                    {
                        E T2e, T2p, T2s, T2t;
                        T2e = FMA(KP923879532, T28, KP382683432 * T2d);
                        T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
                        T2q = T2e + T2p;
                        T3B = T2p - T2e;
                        T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
                        T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
                        T2u = T2s - T2t;
                        T3x = T2s + T2t;
                    }
                    ri[WS(rs, 11)] = T23 - T2q;
                    ii[WS(rs, 11)] = T3A - T3x;
                    ri[WS(rs, 3)] = T23 + T2q;
                    ii[WS(rs, 3)] = T3x + T3A;
                    ri[WS(rs, 15)] = T2r - T2u;
                    ii[WS(rs, 15)] = T3C - T3B;
                    ri[WS(rs, 7)] = T2r + T2u;
                    ii[WS(rs, 7)] = T3B + T3C;
                }
                /* Outputs 2, 6, 10 and 14. */
                {
                    E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
                    {
                        E T2L, T2O, T3k, T3l;
                        T2L = Tf - TA;
                        T2O = T2M - T2N;
                        T2P = T2L + T2O;
                        T31 = T2L - T2O;
                        T3k = TZ - TM;
                        T3l = T3e - T3b;
                        T3m = T3k + T3l;
                        T3o = T3l - T3k;
                    }
                    {
                        E T2U, T2Z, T32, T33;
                        T2U = T2Q + T2T;
                        T2Z = T2V - T2Y;
                        T30 = KP707106781 * (T2U + T2Z);
                        T3n = KP707106781 * (T2Z - T2U);
                        T32 = T2T - T2Q;
                        T33 = T2V + T2Y;
                        T34 = KP707106781 * (T32 - T33);
                        T3j = KP707106781 * (T32 + T33);
                    }
                    ri[WS(rs, 10)] = T2P - T30;
                    ii[WS(rs, 10)] = T3m - T3j;
                    ri[WS(rs, 2)] = T2P + T30;
                    ii[WS(rs, 2)] = T3j + T3m;
                    ri[WS(rs, 14)] = T31 - T34;
                    ii[WS(rs, 14)] = T3o - T3n;
                    ri[WS(rs, 6)] = T31 + T34;
                    ii[WS(rs, 6)] = T3n + T3o;
                }
                /* Outputs 1, 5, 9 and 13. */
                {
                    E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
                    {
                        E T2v, T2y, T3q, T3t;
                        T2v = T1N + T1Q;
                        T2y = KP707106781 * (T2w + T2x);
                        T2z = T2v + T2y;
                        T2H = T2v - T2y;
                        T3q = KP707106781 * (T1W + T21);
                        T3t = T3r - T3s;
                        T3u = T3q + T3t;
                        T3w = T3t - T3q;
                    }
                    {
                        E T2C, T2F, T2I, T2J;
                        T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
                        T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
                        T2G = T2C + T2F;
                        T3v = T2F - T2C;
                        T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
                        T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
                        T2K = T2I - T2J;
                        T3p = T2I + T2J;
                    }
                    ri[WS(rs, 9)] = T2z - T2G;
                    ii[WS(rs, 9)] = T3u - T3p;
                    ri[WS(rs, 1)] = T2z + T2G;
                    ii[WS(rs, 1)] = T3p + T3u;
                    ri[WS(rs, 13)] = T2H - T2K;
                    ii[WS(rs, 13)] = T3w - T3v;
                    ri[WS(rs, 5)] = T2H + T2K;
                    ii[WS(rs, 5)] = T3v + T3w;
                }
                /* Outputs 0, 4, 8 and 12. */
                {
                    E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
                    {
                        E TB, T10, T3a, T3f;
                        TB = Tf + TA;
                        T10 = TM + TZ;
                        T11 = TB + T10;
                        T35 = TB - T10;
                        T3a = T2M + T2N;
                        T3f = T3b + T3e;
                        T3g = T3a + T3f;
                        T3i = T3f - T3a;
                    }
                    {
                        E T1o, T1L, T36, T37;
                        T1o = T18 + T1n;
                        T1L = T1B + T1K;
                        T1M = T1o + T1L;
                        T3h = T1L - T1o;
                        T36 = T2R + T2S;
                        T37 = T2W + T2X;
                        T38 = T36 - T37;
                        T39 = T36 + T37;
                    }
                    ri[WS(rs, 8)] = T11 - T1M;
                    ii[WS(rs, 8)] = T3g - T39;
                    ri[0] = T11 + T1M;
                    ii[0] = T39 + T3g;
                    ri[WS(rs, 12)] = T35 - T38;
                    ii[WS(rs, 12)] = T3i - T3h;
                    ri[WS(rs, 4)] = T35 + T38;
                    ii[WS(rs, 4)] = T3h + T3i;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 9 },
|
||||
{ TW_CEXP, 0, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 156, 68, 40, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_16) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_16, &desc);
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,200 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_4: in-place 4-point twiddle codelet (variant generated with -fma; see
 * the "Generated by" comment preceding this function).
 *
 * ri, ii   data arrays; element k of the current transform is accessed as
 *          ri[WS(rs, k)] and ii[WS(rs, k)] for k = 0..3, and all eight
 *          locations are overwritten with a result
 * W        table of multipliers; offset by mb * 4 on entry and advanced by
 *          4 values after every iteration
 * rs       stride handed to WS() to address the 4 elements
 * mb, me   iterations run for m = mb, mb + 1, ..., me - 1
 * ms       amount added to ri and ii after every iteration
 *
 * NOTE(review): the comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b; the macros are defined outside this section.
 * The order of the floating-point operations is kept exactly as generated.
 */
static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    {
        INT m;
        for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
            E T2, T6, T3, T5, T7, Tb, T4, Ta;
            /*
             * (T2, T5) = W[0..1] scales input 1, (T3, T6) = W[2..3] scales
             * input 3; (T7, Tb), derived from both, scales input 2.
             */
            T2 = W[0];
            T6 = W[3];
            T3 = W[2];
            T4 = T2 * T3;
            Ta = T2 * T6;
            T5 = W[1];
            T7 = FMA(T5, T6, T4);
            Tb = FNMS(T5, T3, Ta);
            {
                E T1, Tx, Td, Tw, Ti, Tq, Tm, Ts;
                /* Input 0 is used unscaled. */
                T1 = ri[0];
                Tx = ii[0];
                /* Input 2. */
                {
                    E T8, T9, Tc, Tv;
                    T8 = ri[WS(rs, 2)];
                    T9 = T7 * T8;
                    Tc = ii[WS(rs, 2)];
                    Tv = T7 * Tc;
                    Td = FMA(Tb, Tc, T9);
                    Tw = FNMS(Tb, T8, Tv);
                }
                /* Input 1. */
                {
                    E Tf, Tg, Th, Tp;
                    Tf = ri[WS(rs, 1)];
                    Tg = T2 * Tf;
                    Th = ii[WS(rs, 1)];
                    Tp = T2 * Th;
                    Ti = FMA(T5, Th, Tg);
                    Tq = FNMS(T5, Tf, Tp);
                }
                /* Input 3. */
                {
                    E Tj, Tk, Tl, Tr;
                    Tj = ri[WS(rs, 3)];
                    Tk = T3 * Tj;
                    Tl = ii[WS(rs, 3)];
                    Tr = T3 * Tl;
                    Tm = FMA(T6, Tl, Tk);
                    Ts = FNMS(T6, Tj, Tr);
                }
                /* Outputs 0 and 2. */
                {
                    E Te, Tn, Tu, Ty;
                    Te = T1 + Td;
                    Tn = Ti + Tm;
                    ri[WS(rs, 2)] = Te - Tn;
                    ri[0] = Te + Tn;
                    Tu = Tq + Ts;
                    Ty = Tw + Tx;
                    ii[0] = Tu + Ty;
                    ii[WS(rs, 2)] = Ty - Tu;
                }
                /* Outputs 1 and 3. */
                {
                    E To, Tt, Tz, TA;
                    To = T1 - Td;
                    Tt = Tq - Ts;
                    ri[WS(rs, 3)] = To - Tt;
                    ri[WS(rs, 1)] = To + Tt;
                    Tz = Tx - Tw;
                    TA = Ti - Tm;
                    ii[WS(rs, 1)] = Tz - TA;
                    ii[WS(rs, 3)] = TA + Tz;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_4) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_4, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_4: in-place 4-point twiddle codelet (variant generated without -fma;
 * see the "Generated by" comment preceding this function).
 *
 * ri, ii   data arrays; element k of the current transform is accessed as
 *          ri[WS(rs, k)] and ii[WS(rs, k)] for k = 0..3, and all eight
 *          locations are overwritten with a result
 * W        table of multipliers; offset by mb * 4 on entry and advanced by
 *          4 values after every iteration
 * rs       stride handed to WS() to address the 4 elements
 * mb, me   iterations run for m = mb, mb + 1, ..., me - 1
 * ms       amount added to ri and ii after every iteration
 *
 * NOTE(review): the comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b; the macros are defined outside this section.
 * The order of the floating-point operations is kept exactly as generated.
 */
static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    {
        INT m;
        for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
            E T2, T4, T3, T5, T6, T8;
            /*
             * (T2, T4) = W[0..1] scales input 1, (T3, T5) = W[2..3] scales
             * input 3; (T6, T8), derived from both, scales input 2.
             */
            T2 = W[0];
            T4 = W[1];
            T3 = W[2];
            T5 = W[3];
            T6 = FMA(T2, T3, T4 * T5);
            T8 = FNMS(T4, T3, T2 * T5);
            {
                E T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
                /* Input 0 (unscaled) and input 2. */
                T1 = ri[0];
                Tp = ii[0];
                T7 = ri[WS(rs, 2)];
                T9 = ii[WS(rs, 2)];
                Ta = FMA(T6, T7, T8 * T9);
                To = FNMS(T8, T7, T6 * T9);
                /* Inputs 1 and 3. */
                {
                    E Tc, Td, Tf, Tg;
                    Tc = ri[WS(rs, 1)];
                    Td = ii[WS(rs, 1)];
                    Te = FMA(T2, Tc, T4 * Td);
                    Tk = FNMS(T4, Tc, T2 * Td);
                    Tf = ri[WS(rs, 3)];
                    Tg = ii[WS(rs, 3)];
                    Th = FMA(T3, Tf, T5 * Tg);
                    Tl = FNMS(T5, Tf, T3 * Tg);
                }
                /* Outputs 0 and 2. */
                {
                    E Tb, Ti, Tn, Tq;
                    Tb = T1 + Ta;
                    Ti = Te + Th;
                    ri[WS(rs, 2)] = Tb - Ti;
                    ri[0] = Tb + Ti;
                    Tn = Tk + Tl;
                    Tq = To + Tp;
                    ii[0] = Tn + Tq;
                    ii[WS(rs, 2)] = Tq - Tn;
                }
                /* Outputs 1 and 3. */
                {
                    E Tj, Tm, Tr, Ts;
                    Tj = T1 - Ta;
                    Tm = Tk - Tl;
                    ri[WS(rs, 3)] = Tj - Tm;
                    ri[WS(rs, 1)] = Tj + Tm;
                    Tr = Tp - To;
                    Ts = Te - Th;
                    ii[WS(rs, 1)] = Tr - Ts;
                    ii[WS(rs, 3)] = Ts + Tr;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_4) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_4, &desc);
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,264 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:37 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 44 FP additions, 40 FP multiplications,
|
||||
* (or, 14 additions, 10 multiplications, 30 fused multiply/add),
|
||||
* 38 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_5: in-place 5-point twiddle codelet (variant generated with -fma; see
 * the "Generated by" comment preceding this function).
 *
 * ri, ii   data arrays; element k of the current transform is accessed as
 *          ri[WS(rs, k)] and ii[WS(rs, k)] for k = 0..4, and all ten
 *          locations are overwritten with a result
 * W        table of multipliers; offset by mb * 4 on entry and advanced by
 *          4 values after every iteration
 * rs       stride handed to WS() to address the 5 elements
 * mb, me   iterations run for m = mb, mb + 1, ..., me - 1
 * ms       amount added to ri and ii after every iteration
 *
 * NOTE(review): the comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b; the macros are defined outside this section.
 * The order of the floating-point operations is kept exactly as generated.
 */
static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP618033988, +0.618033988749894848204586834365638117720309180);
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    {
        INT m;
        for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
            E T2, Ta, T8, T5, Tb, Tm, Tf, Tj, T9, Te;
            /*
             * (T2, T5) = W[0..1] scales input 1, (T8, Ta) = W[2..3] scales
             * input 3.  Derived pairs: (Tb, Tf) for input 4 and (Tj, Tm)
             * for input 2.
             */
            T2 = W[0];
            Ta = W[3];
            T8 = W[2];
            T9 = T2 * T8;
            Te = T2 * Ta;
            T5 = W[1];
            Tb = FNMS(T5, Ta, T9);
            Tm = FNMS(T5, T8, Te);
            Tf = FMA(T5, T8, Te);
            Tj = FMA(T5, Ta, T9);
            {
                E T1, TO, T7, Th, Ti, Tz, TB, TL, To, Ts, Tt, TE, TG, TM;
                /* Input 0 is used unscaled. */
                T1 = ri[0];
                TO = ii[0];
                /* Inputs 1 and 4. */
                {
                    E T3, T4, T6, Ty, Tc, Td, Tg, TA;
                    T3 = ri[WS(rs, 1)];
                    T4 = T2 * T3;
                    T6 = ii[WS(rs, 1)];
                    Ty = T2 * T6;
                    Tc = ri[WS(rs, 4)];
                    Td = Tb * Tc;
                    Tg = ii[WS(rs, 4)];
                    TA = Tb * Tg;
                    T7 = FMA(T5, T6, T4);
                    Th = FMA(Tf, Tg, Td);
                    Ti = T7 + Th;
                    Tz = FNMS(T5, T3, Ty);
                    TB = FNMS(Tf, Tc, TA);
                    TL = Tz + TB;
                }
                /* Inputs 2 and 3. */
                {
                    E Tk, Tl, Tn, TD, Tp, Tq, Tr, TF;
                    Tk = ri[WS(rs, 2)];
                    Tl = Tj * Tk;
                    Tn = ii[WS(rs, 2)];
                    TD = Tj * Tn;
                    Tp = ri[WS(rs, 3)];
                    Tq = T8 * Tp;
                    Tr = ii[WS(rs, 3)];
                    TF = T8 * Tr;
                    To = FMA(Tm, Tn, Tl);
                    Ts = FMA(Ta, Tr, Tq);
                    Tt = To + Ts;
                    TE = FNMS(Tm, Tk, TD);
                    TG = FNMS(Ta, Tp, TF);
                    TM = TE + TG;
                }
                /* All five ri outputs. */
                {
                    E Tw, Tu, Tv, TI, TK, TC, TH, TJ, Tx;
                    Tw = Ti - Tt;
                    Tu = Ti + Tt;
                    Tv = FNMS(KP250000000, Tu, T1);
                    TC = Tz - TB;
                    TH = TE - TG;
                    TI = FMA(KP618033988, TH, TC);
                    TK = FNMS(KP618033988, TC, TH);
                    ri[0] = T1 + Tu;
                    TJ = FNMS(KP559016994, Tw, Tv);
                    ri[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
                    ri[WS(rs, 3)] = FMA(KP951056516, TK, TJ);
                    Tx = FMA(KP559016994, Tw, Tv);
                    ri[WS(rs, 4)] = FNMS(KP951056516, TI, Tx);
                    ri[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
                }
                /* All five ii outputs. */
                {
                    E TQ, TN, TP, TU, TW, TS, TT, TV, TR;
                    TQ = TL - TM;
                    TN = TL + TM;
                    TP = FNMS(KP250000000, TN, TO);
                    TS = T7 - Th;
                    TT = To - Ts;
                    TU = FMA(KP618033988, TT, TS);
                    TW = FNMS(KP618033988, TS, TT);
                    ii[0] = TN + TO;
                    TV = FNMS(KP559016994, TQ, TP);
                    ii[WS(rs, 2)] = FMA(KP951056516, TW, TV);
                    ii[WS(rs, 3)] = FNMS(KP951056516, TW, TV);
                    TR = FMA(KP559016994, TQ, TP);
                    ii[WS(rs, 1)] = FNMS(KP951056516, TU, TR);
                    ii[WS(rs, 4)] = FMA(KP951056516, TU, TR);
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 14, 10, 30, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_5) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_5, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 44 FP additions, 32 FP multiplications,
|
||||
* (or, 30 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 37 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_5: in-place 5-point twiddle codelet (variant generated without -fma;
 * see the "Generated by" comment preceding this function).
 *
 * ri, ii   data arrays; element k of the current transform is accessed as
 *          ri[WS(rs, k)] and ii[WS(rs, k)] for k = 0..4, and all ten
 *          locations are overwritten with a result
 * W        table of multipliers; offset by mb * 4 on entry and advanced by
 *          4 values after every iteration
 * rs       stride handed to WS() to address the 5 elements
 * mb, me   iterations run for m = mb, mb + 1, ..., me - 1
 * ms       amount added to ri and ii after every iteration
 *
 * NOTE(review): the comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b; the macros are defined outside this section.
 * The order of the floating-point operations is kept exactly as generated.
 */
static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP587785252, +0.587785252292473129168705954639072768597652438);
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    {
        INT m;
        for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
            E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
            /*
             * (T2, T4) = W[0..1] scales input 1, (T7, T9) = W[2..3] scales
             * input 3.  Derived pairs: (Tb, Tf) for input 4 and (Tj, Tl)
             * for input 2.
             */
            {
                E T8, Te, Ta, Td;
                T2 = W[0];
                T4 = W[1];
                T7 = W[2];
                T9 = W[3];
                T8 = T2 * T7;
                Te = T4 * T7;
                Ta = T4 * T9;
                Td = T2 * T9;
                Tb = T8 - Ta;
                Tl = Td - Te;
                Tf = Td + Te;
                Tj = T8 + Ta;
            }
            {
                E T1, TI, Ty, TB, TN, TM, TF, TG, TH, Ti, Tr, Ts;
                /* Input 0 is used unscaled. */
                T1 = ri[0];
                TI = ii[0];
                {
                    E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
                    /* Inputs 1 and 3. */
                    {
                        E T3, T5, To, Tp;
                        T3 = ri[WS(rs, 1)];
                        T5 = ii[WS(rs, 1)];
                        T6 = FMA(T2, T3, T4 * T5);
                        Tw = FNMS(T4, T3, T2 * T5);
                        To = ri[WS(rs, 3)];
                        Tp = ii[WS(rs, 3)];
                        Tq = FMA(T7, To, T9 * Tp);
                        TA = FNMS(T9, To, T7 * Tp);
                    }
                    /* Inputs 4 and 2. */
                    {
                        E Tc, Tg, Tk, Tm;
                        Tc = ri[WS(rs, 4)];
                        Tg = ii[WS(rs, 4)];
                        Th = FMA(Tb, Tc, Tf * Tg);
                        Tx = FNMS(Tf, Tc, Tb * Tg);
                        Tk = ri[WS(rs, 2)];
                        Tm = ii[WS(rs, 2)];
                        Tn = FMA(Tj, Tk, Tl * Tm);
                        Tz = FNMS(Tl, Tk, Tj * Tm);
                    }
                    Ty = Tw - Tx;
                    TB = Tz - TA;
                    TN = Tn - Tq;
                    TM = T6 - Th;
                    TF = Tw + Tx;
                    TG = Tz + TA;
                    TH = TF + TG;
                    Ti = T6 + Th;
                    Tr = Tn + Tq;
                    Ts = Ti + Tr;
                }
                /* Output 0. */
                ri[0] = T1 + Ts;
                ii[0] = TH + TI;
                /* ri outputs 1..4. */
                {
                    E TC, TE, Tv, TD, Tt, Tu;
                    TC = FMA(KP951056516, Ty, KP587785252 * TB);
                    TE = FNMS(KP587785252, Ty, KP951056516 * TB);
                    Tt = KP559016994 * (Ti - Tr);
                    Tu = FNMS(KP250000000, Ts, T1);
                    Tv = Tt + Tu;
                    TD = Tu - Tt;
                    ri[WS(rs, 4)] = Tv - TC;
                    ri[WS(rs, 3)] = TD + TE;
                    ri[WS(rs, 1)] = Tv + TC;
                    ri[WS(rs, 2)] = TD - TE;
                }
                /* ii outputs 1..4. */
                {
                    E TO, TP, TL, TQ, TJ, TK;
                    TO = FMA(KP951056516, TM, KP587785252 * TN);
                    TP = FNMS(KP587785252, TM, KP951056516 * TN);
                    TJ = KP559016994 * (TF - TG);
                    TK = FNMS(KP250000000, TH, TI);
                    TL = TJ + TK;
                    TQ = TK - TJ;
                    ii[WS(rs, 1)] = TL - TO;
                    ii[WS(rs, 3)] = TQ - TP;
                    ii[WS(rs, 4)] = TO + TL;
                    ii[WS(rs, 2)] = TP + TQ;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 30, 18, 14, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_5) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_5, &desc);
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,390 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 50 FP multiplications,
|
||||
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
|
||||
* 48 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_8: in-place 8-point twiddle codelet (variant generated with -fma; see
 * the "Generated by" comment preceding this function).
 *
 * ri, ii   data arrays; element k of the current transform is accessed as
 *          ri[WS(rs, k)] and ii[WS(rs, k)] for k = 0..7, and all sixteen
 *          locations are overwritten with a result
 * W        table of multipliers; offset by mb * 6 on entry and advanced by
 *          6 values after every iteration
 * rs       stride handed to WS() to address the 8 elements
 * mb, me   iterations run for m = mb, mb + 1, ..., me - 1
 * ms       amount added to ri and ii after every iteration
 *
 * NOTE(review): the comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b; the macros are defined outside this section.
 * The order of the floating-point operations is kept exactly as generated.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;
        for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
            E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
            /*
             * Multiplier setup.  Loaded pairs: (T2, T5) = W[0..1] for input 1,
             * (T3, T6) = W[2..3] for input 3, (Tl, Tn) = W[4..5] for input 7.
             * Derived pairs: (T7, Tb) for input 4, (To, Ts) for input 6,
             * (Tf, Ti) for input 2 and (TC, TG) for input 5.
             */
            {
                E T4, Tm, Tr, Ta, TB, TF;
                T2 = W[0];
                T3 = W[2];
                T4 = T2 * T3;
                Tl = W[4];
                Tm = T2 * Tl;
                Tn = W[5];
                Tr = T2 * Tn;
                T5 = W[1];
                T6 = W[3];
                Ta = T2 * T6;
                Tf = FMA(T5, T6, T4);
                T7 = FNMS(T5, T6, T4);
                Ts = FNMS(T5, Tl, Tr);
                Tb = FMA(T5, T3, Ta);
                To = FMA(T5, Tn, Tm);
                TB = Tf * Tl;
                TF = Tf * Tn;
                Ti = FNMS(T5, T3, Ta);
                TC = FMA(Ti, Tn, TB);
                TG = FNMS(Ti, Tl, TF);
            }
            {
                E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
                E TI, T11, T13, T15, T16;
                /* Input 0 is used unscaled. */
                T1 = ri[0];
                T1s = ii[0];
                /* Input 4. */
                {
                    E T8, T9, Tc, T1q;
                    T8 = ri[WS(rs, 4)];
                    T9 = T7 * T8;
                    Tc = ii[WS(rs, 4)];
                    T1q = T7 * Tc;
                    Td = FMA(Tb, Tc, T9);
                    T1r = FNMS(Tb, T8, T1q);
                }
                /* Input 6. */
                {
                    E Tp, Tq, Tt, TX;
                    Tp = ri[WS(rs, 6)];
                    Tq = To * Tp;
                    Tt = ii[WS(rs, 6)];
                    TX = To * Tt;
                    Tu = FMA(Ts, Tt, Tq);
                    TY = FNMS(Ts, Tp, TX);
                }
                /* Input 2. */
                {
                    E Tg, Th, Tj, TV;
                    Tg = ri[WS(rs, 2)];
                    Th = Tf * Tg;
                    Tj = ii[WS(rs, 2)];
                    TV = Tf * Tj;
                    Tk = FMA(Ti, Tj, Th);
                    TW = FNMS(Ti, Tg, TV);
                }
                /* Inputs 7 and 3. */
                {
                    E TK, TL, TM, T19, TO, TP, TQ, T1b;
                    TK = ri[WS(rs, 7)];
                    TL = Tl * TK;
                    TM = ii[WS(rs, 7)];
                    T19 = Tl * TM;
                    TO = ri[WS(rs, 3)];
                    TP = T3 * TO;
                    TQ = ii[WS(rs, 3)];
                    T1b = T3 * TQ;
                    TN = FMA(Tn, TM, TL);
                    TR = FMA(T6, TQ, TP);
                    T18 = TN - TR;
                    T1a = FNMS(Tn, TK, T19);
                    T1c = FNMS(T6, TO, T1b);
                    T1d = T1a - T1c;
                }
                /* Inputs 1 and 5. */
                {
                    E Tx, Ty, Tz, T12, TD, TE, TH, T14;
                    Tx = ri[WS(rs, 1)];
                    Ty = T2 * Tx;
                    Tz = ii[WS(rs, 1)];
                    T12 = T2 * Tz;
                    TD = ri[WS(rs, 5)];
                    TE = TC * TD;
                    TH = ii[WS(rs, 5)];
                    T14 = TC * TH;
                    TA = FMA(T5, Tz, Ty);
                    TI = FMA(TG, TH, TE);
                    T11 = TA - TI;
                    T13 = FNMS(T5, Tx, T12);
                    T15 = FNMS(TG, TD, T14);
                    T16 = T13 - T15;
                }
                /* Outputs 1, 3, 5 and 7. */
                {
                    E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
                    {
                        E TU, TZ, T1x, T1y;
                        TU = T1 - Td;
                        TZ = TW - TY;
                        T10 = TU + TZ;
                        T1g = TU - TZ;
                        T1x = T1s - T1r;
                        T1y = Tk - Tu;
                        T1z = T1x - T1y;
                        T1B = T1y + T1x;
                    }
                    {
                        E T17, T1e, T1h, T1i;
                        T17 = T11 + T16;
                        T1e = T18 - T1d;
                        T1f = T17 + T1e;
                        T1C = T1e - T17;
                        T1h = T16 - T11;
                        T1i = T18 + T1d;
                        T1j = T1h - T1i;
                        T1A = T1h + T1i;
                    }
                    ri[WS(rs, 5)] = FNMS(KP707106781, T1f, T10);
                    ii[WS(rs, 5)] = FNMS(KP707106781, T1A, T1z);
                    ri[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
                    ii[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
                    ri[WS(rs, 7)] = FNMS(KP707106781, T1j, T1g);
                    ii[WS(rs, 7)] = FNMS(KP707106781, T1C, T1B);
                    ri[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
                    ii[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
                }
                /* Outputs 0, 2, 4 and 6. */
                {
                    E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
                    {
                        E Te, Tv, T1p, T1t;
                        Te = T1 + Td;
                        Tv = Tk + Tu;
                        Tw = Te + Tv;
                        T1k = Te - Tv;
                        T1p = TW + TY;
                        T1t = T1r + T1s;
                        T1u = T1p + T1t;
                        T1w = T1t - T1p;
                    }
                    {
                        E TJ, TS, T1l, T1m;
                        TJ = TA + TI;
                        TS = TN + TR;
                        TT = TJ + TS;
                        T1v = TS - TJ;
                        T1l = T13 + T15;
                        T1m = T1a + T1c;
                        T1n = T1l - T1m;
                        T1o = T1l + T1m;
                    }
                    ri[WS(rs, 4)] = Tw - TT;
                    ii[WS(rs, 4)] = T1u - T1o;
                    ri[0] = Tw + TT;
                    ii[0] = T1o + T1u;
                    ri[WS(rs, 6)] = T1k - T1n;
                    ii[WS(rs, 6)] = T1w - T1v;
                    ri[WS(rs, 2)] = T1k + T1n;
                    ii[WS(rs, 2)] = T1v + T1w;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, { 44, 20, 30, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_8) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_8, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 44 FP multiplications,
|
||||
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
|
||||
* 42 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_8 (non-FMA variant): in-place 8-point kernel on split real/imaginary
 * arrays.
 *
 * For every m in [mb, me) the loop body
 *   - reads six reals W[0..5] and derives further coefficient pairs from them,
 *   - reads the eight points ri/ii[0], ri/ii[WS(rs, 1..7)],
 *   - writes the eight results back to the same locations,
 * then advances ri and ii by ms and W by 6.  W is pre-advanced by mb * 6
 * before the first iteration.
 *
 * All loads from ri/ii happen before any store, so statement order between
 * the load blocks and the store block must be kept.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /* 0.7071... = sqrt(1/2) */
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W += mb * 6; m < me; ++m, ri += ms, ii += ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;

               /* ---- coefficients: three loaded pairs, four derived pairs ---- */
               {
                    E T4, Tb, T7, Ta;
                    T2 = W[0];
                    T5 = W[1];
                    T3 = W[2];
                    T6 = W[3];
                    T4 = T2 * T3;
                    Tb = T5 * T3;
                    T7 = T5 * T6;
                    Ta = T2 * T6;
                    /* (T8, Tc) = (T2 + i*T5) * (T3 + i*T6) */
                    T8 = T4 - T7;
                    Tc = Ta + Tb;
                    /* (Tg, Ti) = (T2 - i*T5) * (T3 + i*T6) */
                    Tg = T4 + T7;
                    Ti = Ta - Tb;
                    Tl = W[4];
                    Tm = W[5];
                    /* (Tn, Tp) and (Tx, Tz) combine the pairs above with (Tl, Tm) */
                    Tn = FMA(T2, Tl, T5 * Tm);
                    Tz = FNMS(Ti, Tl, Tg * Tm);
                    Tp = FNMS(T5, Tl, T2 * Tm);
                    Tx = FMA(Tg, Tl, Ti * Tm);
               }
               {
                    E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ, TT;

                    /* ---- inputs 0 and 4 (input 4 scaled by (T8, Tc)) ---- */
                    {
                         E T1, T1c, Te, T1b, T9, Td;
                         T1 = ri[0];
                         T1c = ii[0];
                         T9 = ri[WS(rs, 4)];
                         Td = ii[WS(rs, 4)];
                         Te = FMA(T8, T9, Tc * Td);
                         T1b = FNMS(Tc, T9, T8 * Td);
                         Tf = T1 + Te;
                         T1i = T1c - T1b;
                         TL = T1 - Te;
                         T1d = T1b + T1c;
                    }

                    /* ---- inputs 7 and 3 (scaled by (Tl, Tm) and (T3, T6)) ---- */
                    {
                         E TF, TW, TI, TX;
                         {
                              E TD, TE, TG, TH;
                              TD = ri[WS(rs, 7)];
                              TE = ii[WS(rs, 7)];
                              TF = FMA(Tl, TD, Tm * TE);
                              TW = FNMS(Tm, TD, Tl * TE);
                              TG = ri[WS(rs, 3)];
                              TH = ii[WS(rs, 3)];
                              TI = FMA(T3, TG, T6 * TH);
                              TX = FNMS(T6, TG, T3 * TH);
                         }
                         TJ = TF + TI;
                         T17 = TW + TX;
                         TV = TF - TI;
                         TY = TW - TX;
                    }

                    /* ---- inputs 2 and 6 (scaled by (Tg, Ti) and (Tn, Tp)) ---- */
                    {
                         E Tk, TM, Tr, TN;
                         {
                              E Th, Tj, To, Tq;
                              Th = ri[WS(rs, 2)];
                              Tj = ii[WS(rs, 2)];
                              Tk = FMA(Tg, Th, Ti * Tj);
                              TM = FNMS(Ti, Th, Tg * Tj);
                              To = ri[WS(rs, 6)];
                              Tq = ii[WS(rs, 6)];
                              Tr = FMA(Tn, To, Tp * Tq);
                              TN = FNMS(Tp, To, Tn * Tq);
                         }
                         Ts = Tk + Tr;
                         T1j = Tk - Tr;
                         TO = TM - TN;
                         T1a = TM + TN;
                    }

                    /* ---- inputs 1 and 5 (scaled by (T2, T5) and (Tx, Tz)) ---- */
                    {
                         E Tw, TR, TB, TS;
                         {
                              E Tu, Tv, Ty, TA;
                              Tu = ri[WS(rs, 1)];
                              Tv = ii[WS(rs, 1)];
                              Tw = FMA(T2, Tu, T5 * Tv);
                              TR = FNMS(T5, Tu, T2 * Tv);
                              Ty = ri[WS(rs, 5)];
                              TA = ii[WS(rs, 5)];
                              TB = FMA(Tx, Ty, Tz * TA);
                              TS = FNMS(Tz, Ty, Tx * TA);
                         }
                         TC = Tw + TB;
                         T16 = TR + TS;
                         TQ = Tw - TB;
                         TT = TR - TS;
                    }

                    /* ---- combine and store; every input is already in a local ---- */
                    {
                         E Tt, TK, T1f, T1g;
                         Tt = Tf + Ts;
                         TK = TC + TJ;
                         ri[WS(rs, 4)] = Tt - TK;
                         ri[0] = Tt + TK;
                         {
                              E T19, T1e, T15, T18;
                              T19 = T16 + T17;
                              T1e = T1a + T1d;
                              ii[0] = T19 + T1e;
                              ii[WS(rs, 4)] = T1e - T19;
                              T15 = Tf - Ts;
                              T18 = T16 - T17;
                              ri[WS(rs, 6)] = T15 - T18;
                              ri[WS(rs, 2)] = T15 + T18;
                         }
                         T1f = TJ - TC;
                         T1g = T1d - T1a;
                         ii[WS(rs, 2)] = T1f + T1g;
                         ii[WS(rs, 6)] = T1g - T1f;

                         /* outputs that need the sqrt(1/2) factor */
                         {
                              E T11, T1k, T14, T1h, T12, T13;
                              T11 = TL - TO;
                              T1k = T1i - T1j;
                              T12 = TT - TQ;
                              T13 = TV + TY;
                              T14 = KP707106781 * (T12 - T13);
                              T1h = KP707106781 * (T12 + T13);
                              ri[WS(rs, 7)] = T11 - T14;
                              ii[WS(rs, 5)] = T1k - T1h;
                              ri[WS(rs, 3)] = T11 + T14;
                              ii[WS(rs, 1)] = T1h + T1k;
                         }
                         {
                              E TP, T1m, T10, T1l, TU, TZ;
                              TP = TL + TO;
                              T1m = T1j + T1i;
                              TU = TQ + TT;
                              TZ = TV - TY;
                              T10 = KP707106781 * (TU + TZ);
                              T1l = KP707106781 * (TZ - TU);
                              ri[WS(rs, 5)] = TP - T10;
                              ii[WS(rs, 7)] = T1m - T1l;
                              ri[WS(rs, 1)] = TP + T10;
                              ii[WS(rs, 3)] = T1l + T1m;
                         }
                    }
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner when t2_8 is registered.
 * NOTE(review): field meanings below are inferred -- confirm against the
 * ct_desc declaration.
 */
static const ct_desc desc = {
     8,                    /* presumably the radix */
     "t2_8",               /* codelet name */
     twinstr,              /* twiddle instruction list */
     &GENUS,               /* genus shared by this codelet family */
     { 56, 26, 18, 0 },    /* matches the header's 56 adds, 26 muls, 18 fma */
     0, 0, 0               /* presumably stride constraints; 0 = unconstrained */
};
|
||||
|
||||
/* Registers the t2_8 kernel and its descriptor with planner p. */
void X(codelet_t2_8) (planner *p)
{
     X(kdft_dit_register) (p, t2_8, &desc);
}
|
||||
#endif
|
||||
@@ -0,0 +1 @@
|
||||
#include "dft/scalar/t.h" /* same stuff, no need to duplicate */
|
||||
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
static int okp(const kdft_desc *d,
|
||||
const R *ri, const R *ii,
|
||||
const R *ro, const R *io,
|
||||
INT is, INT os, INT vl, INT ivs, INT ovs,
|
||||
const planner *plnr)
|
||||
{
|
||||
UNUSED(ri); UNUSED(ii); UNUSED(ro); UNUSED(io); UNUSED(vl); UNUSED(plnr);
|
||||
return (1
|
||||
&& (!d->is || (d->is == is))
|
||||
&& (!d->os || (d->os == os))
|
||||
&& (!d->ivs || (d->ivs == ivs))
|
||||
&& (!d->ovs || (d->ovs == ovs))
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Genus object for this codelet family: the applicability predicate okp
 * plus the constant 1.
 * NOTE(review): the meaning of the second field is not visible here --
 * confirm against the kdft_genus declaration.
 */
const kdft_genus GENUS = {
     okp,
     1
};
|
||||
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* GENUS expands to X(dft_n_genus); X() is defined elsewhere in the project
   (presumably a symbol-prefixing macro -- confirm). */
#define GENUS X(dft_n_genus)
|
||||
/* Declaration only; the object is defined in a separate translation unit. */
extern const kdft_genus GENUS;
|
||||
@@ -0,0 +1 @@
|
||||
#include "dft/scalar/t.h" /* same stuff, no need to duplicate */
|
||||
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
static int okp(const ct_desc *d,
|
||||
const R *rio, const R *iio,
|
||||
INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
||||
const planner *plnr)
|
||||
{
|
||||
UNUSED(rio); UNUSED(iio); UNUSED(m); UNUSED(mb); UNUSED(me); UNUSED(plnr);
|
||||
return (1
|
||||
&& (!d->rs || (d->rs == rs))
|
||||
&& (!d->vs || (d->vs == vs))
|
||||
&& (!d->ms || (d->ms == ms))
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Genus object for this codelet family: the applicability predicate okp
 * plus the constant 1.
 * NOTE(review): the meaning of the second field is not visible here --
 * confirm against the ct_genus declaration.
 */
const ct_genus GENUS = {
     okp,
     1
};
|
||||
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* GENUS expands to X(dft_t_genus); X() is defined elsewhere in the project
   (presumably a symbol-prefixing macro -- confirm). */
#define GENUS X(dft_t_genus)
|
||||
/* Declaration only; the object is defined in a separate translation unit. */
extern const ct_genus GENUS;
|
||||
@@ -0,0 +1,4 @@
|
||||
# Add the top of the source tree to the preprocessor include path.
AM_CPPFLAGS = -I $(top_srcdir)

# Subdirectories to recurse into, in this order.
SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec \
	vsx neon generic-simd128 generic-simd256

# Headers and make fragments that are distributed but not built here.
EXTRA_DIST = \
	n1b.h n1f.h n2b.h n2f.h n2s.h \
	q1b.h q1f.h \
	t1b.h t1bu.h t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h \
	codlist.mk simd.mk
|
||||
@@ -0,0 +1,666 @@
|
||||
# Makefile.in generated by automake 1.16.3 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
VPATH = @srcdir@
|
||||
# Shell fragment used as a condition: it fails when $(MAKELEVEL) is empty,
# succeeds when $(MAKE_HOST) is non-empty or when both $(MAKE_VERSION) and
# $(CURDIR) are non-empty, and fails otherwise.
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
subdir = dft/simd
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
|
||||
$(top_srcdir)/m4/acx_pthread.m4 \
|
||||
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
|
||||
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
|
||||
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_version.m4 \
|
||||
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
|
||||
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
|
||||
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
SOURCES =
|
||||
DIST_SOURCES =
|
||||
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
|
||||
ctags-recursive dvi-recursive html-recursive info-recursive \
|
||||
install-data-recursive install-dvi-recursive \
|
||||
install-exec-recursive install-html-recursive \
|
||||
install-info-recursive install-pdf-recursive \
|
||||
install-ps-recursive install-recursive installcheck-recursive \
|
||||
installdirs-recursive pdf-recursive ps-recursive \
|
||||
tags-recursive uninstall-recursive
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
|
||||
distclean-recursive maintainer-clean-recursive
|
||||
am__recursive_targets = \
|
||||
$(RECURSIVE_TARGETS) \
|
||||
$(RECURSIVE_CLEAN_TARGETS) \
|
||||
$(am__extra_recursive_targets)
|
||||
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
|
||||
distdir distdir-am
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
# Read a list of newline-separated strings from the standard input,
|
||||
# and print each of them once, without duplicates. Input order is
|
||||
# *not* preserved.
|
||||
am__uniquify_input = $(AWK) '\
|
||||
BEGIN { nonempty = 0; } \
|
||||
{ items[$$0] = 1; nonempty = 1; } \
|
||||
END { if (nonempty) { for (i in items) print i; }; } \
|
||||
'
|
||||
# Make sure the list of sources is unique. This is necessary because,
|
||||
# e.g., the same source file might be shared among _SOURCES variables
|
||||
# for different programs/libraries.
|
||||
am__define_uniq_tagged_files = \
|
||||
list='$(am__tagged_files)'; \
|
||||
unique=`for i in $$list; do \
|
||||
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
|
||||
done | $(am__uniquify_input)`
|
||||
ETAGS = etags
|
||||
CTAGS = ctags
|
||||
DIST_SUBDIRS = $(SUBDIRS)
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
am__relativize = \
|
||||
dir0=`pwd`; \
|
||||
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
|
||||
sed_rest='s,^[^/]*/*,,'; \
|
||||
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
|
||||
sed_butlast='s,/*[^/]*$$,,'; \
|
||||
while test -n "$$dir1"; do \
|
||||
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
|
||||
if test "$$first" != "."; then \
|
||||
if test "$$first" = ".."; then \
|
||||
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
|
||||
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
|
||||
else \
|
||||
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
|
||||
if test "$$first2" = "$$first"; then \
|
||||
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
|
||||
else \
|
||||
dir2="../$$dir2"; \
|
||||
fi; \
|
||||
dir0="$$dir0"/"$$first"; \
|
||||
fi; \
|
||||
fi; \
|
||||
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
|
||||
done; \
|
||||
reldir="$$dir2"
|
||||
ACLOCAL = @ACLOCAL@
|
||||
ALLOCA = @ALLOCA@
|
||||
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AS = @AS@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AVX2_CFLAGS = @AVX2_CFLAGS@
|
||||
AVX512_CFLAGS = @AVX512_CFLAGS@
|
||||
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
|
||||
AVX_CFLAGS = @AVX_CFLAGS@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CHECK_PL_OPTS = @CHECK_PL_OPTS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
|
||||
C_MPI_FINT = @C_MPI_FINT@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
F77 = @F77@
|
||||
FFLAGS = @FFLAGS@
|
||||
FGREP = @FGREP@
|
||||
FLIBS = @FLIBS@
|
||||
GREP = @GREP@
|
||||
INDENT = @INDENT@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
KCVI_CFLAGS = @KCVI_CFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBQUADMATH = @LIBQUADMATH@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
|
||||
MAINT = @MAINT@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
MPICC = @MPICC@
|
||||
MPILIBS = @MPILIBS@
|
||||
MPIRUN = @MPIRUN@
|
||||
NEON_CFLAGS = @NEON_CFLAGS@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OCAMLBUILD = @OCAMLBUILD@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
POW_LIB = @POW_LIB@
|
||||
PRECISION = @PRECISION@
|
||||
PREC_SUFFIX = @PREC_SUFFIX@
|
||||
PTHREAD_CC = @PTHREAD_CC@
|
||||
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
|
||||
PTHREAD_LIBS = @PTHREAD_LIBS@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
|
||||
SHELL = @SHELL@
|
||||
SSE2_CFLAGS = @SSE2_CFLAGS@
|
||||
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
|
||||
STRIP = @STRIP@
|
||||
THREADLIBS = @THREADLIBS@
|
||||
VERSION = @VERSION@
|
||||
VSX_CFLAGS = @VSX_CFLAGS@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
ac_ct_F77 = @ac_ct_F77@
|
||||
acx_pthread_config = @acx_pthread_config@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
runstatedir = @runstatedir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256
|
||||
EXTRA_DIST = n1b.h n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h \
|
||||
t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk
|
||||
|
||||
all: all-recursive
|
||||
|
||||
.SUFFIXES:
|
||||
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/simd/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu dft/simd/Makefile
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
|
||||
esac;
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
# Delete *.lo files in the current directory; the leading '-' makes make
# ignore a failing rm.
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
# Delete the .libs and _libs directories in the current directory; the
# leading '-' makes make ignore a failing rm.
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
|
||||
# This directory's subdirectories are mostly independent; you can cd
|
||||
# into them and run 'make' without going through this Makefile.
|
||||
# To change the values of 'make' variables: instead of editing Makefiles,
|
||||
# (1) if the variable is set in 'config.status', edit 'config.status'
|
||||
# (which will cause the Makefiles to be regenerated when you run 'make');
|
||||
# (2) otherwise, pass the desired values on the 'make' command line.
|
||||
$(am__recursive_targets):
|
||||
@fail=; \
|
||||
if $(am__make_keepgoing); then \
|
||||
failcom='fail=yes'; \
|
||||
else \
|
||||
failcom='exit 1'; \
|
||||
fi; \
|
||||
dot_seen=no; \
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
case "$@" in \
|
||||
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
|
||||
*) list='$(SUBDIRS)' ;; \
|
||||
esac; \
|
||||
for subdir in $$list; do \
|
||||
echo "Making $$target in $$subdir"; \
|
||||
if test "$$subdir" = "."; then \
|
||||
dot_seen=yes; \
|
||||
local_target="$$target-am"; \
|
||||
else \
|
||||
local_target="$$target"; \
|
||||
fi; \
|
||||
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|
||||
|| eval $$failcom; \
|
||||
done; \
|
||||
if test "$$dot_seen" = "no"; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
|
||||
fi; test -z "$$fail"
|
||||
|
||||
ID: $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); mkid -fID $$unique
|
||||
tags: tags-recursive
|
||||
TAGS: tags
|
||||
|
||||
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
set x; \
|
||||
here=`pwd`; \
|
||||
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
|
||||
include_option=--etags-include; \
|
||||
empty_fix=.; \
|
||||
else \
|
||||
include_option=--include; \
|
||||
empty_fix=; \
|
||||
fi; \
|
||||
list='$(SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
test ! -f $$subdir/TAGS || \
|
||||
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
|
||||
fi; \
|
||||
done; \
|
||||
$(am__define_uniq_tagged_files); \
|
||||
shift; \
|
||||
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
|
||||
test -n "$$unique" || unique=$$empty_fix; \
|
||||
if test $$# -gt 0; then \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
"$$@" $$unique; \
|
||||
else \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
$$unique; \
|
||||
fi; \
|
||||
fi
|
||||
ctags: ctags-recursive
|
||||
|
||||
CTAGS: ctags
|
||||
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); \
|
||||
test -z "$(CTAGS_ARGS)$$unique" \
|
||||
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
|
||||
$$unique
|
||||
|
||||
GTAGS:
|
||||
here=`$(am__cd) $(top_builddir) && pwd` \
|
||||
&& $(am__cd) $(top_srcdir) \
|
||||
&& gtags -i $(GTAGS_ARGS) "$$here"
|
||||
cscopelist: cscopelist-recursive
|
||||
|
||||
cscopelist-am: $(am__tagged_files)
|
||||
list='$(am__tagged_files)'; \
|
||||
case "$(srcdir)" in \
|
||||
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
|
||||
*) sdir=$(subdir)/$(srcdir) ;; \
|
||||
esac; \
|
||||
for i in $$list; do \
|
||||
if test -f "$$i"; then \
|
||||
echo "$(subdir)/$$i"; \
|
||||
else \
|
||||
echo "$$sdir/$$i"; \
|
||||
fi; \
|
||||
done >> $(top_builddir)/cscope.files
|
||||
|
||||
distclean-tags:
|
||||
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
|
||||
|
||||
distdir: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) distdir-am
|
||||
|
||||
distdir-am: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
$(am__make_dryrun) \
|
||||
|| test -d "$(distdir)/$$subdir" \
|
||||
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|
||||
|| exit 1; \
|
||||
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
|
||||
$(am__relativize); \
|
||||
new_distdir=$$reldir; \
|
||||
dir1=$$subdir; dir2="$(top_distdir)"; \
|
||||
$(am__relativize); \
|
||||
new_top_distdir=$$reldir; \
|
||||
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
|
||||
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
|
||||
($(am__cd) $$subdir && \
|
||||
$(MAKE) $(AM_MAKEFLAGS) \
|
||||
top_distdir="$$new_top_distdir" \
|
||||
distdir="$$new_distdir" \
|
||||
am__remove_distdir=: \
|
||||
am__skip_length_check=: \
|
||||
am__skip_mode_fix=: \
|
||||
distdir) \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: check-recursive
|
||||
all-am: Makefile
|
||||
installdirs: installdirs-recursive
|
||||
installdirs-am:
|
||||
install: install-recursive
|
||||
install-exec: install-exec-recursive
|
||||
install-data: install-data-recursive
|
||||
uninstall: uninstall-recursive
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-recursive
|
||||
# Re-run the 'install' target with INSTALL_PROGRAM and install_sh_PROGRAM
# replaced by $(INSTALL_STRIP_PROGRAM) and INSTALL_STRIP_FLAG=-s.  When
# $(STRIP) is non-empty it is additionally passed on through
# INSTALL_PROGRAM_ENV as STRIPPROG.
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
# Remove $(CONFIG_CLEAN_FILES) when that list is non-empty, and remove
# $(CONFIG_CLEAN_VPATH_FILES) only when $(srcdir) is not "." and that list is
# non-empty.  Both commands are '-'-prefixed, so failures are ignored.
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
clean: clean-recursive
|
||||
|
||||
clean-am: clean-generic clean-libtool mostlyclean-am
|
||||
|
||||
distclean: distclean-recursive
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-generic distclean-tags
|
||||
|
||||
dvi: dvi-recursive
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-recursive
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-recursive
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am:
|
||||
|
||||
install-dvi: install-dvi-recursive
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am:
|
||||
|
||||
install-html: install-html-recursive
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-recursive
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-recursive
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-recursive
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-recursive
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic
|
||||
|
||||
mostlyclean: mostlyclean-recursive
|
||||
|
||||
mostlyclean-am: mostlyclean-generic mostlyclean-libtool
|
||||
|
||||
pdf: pdf-recursive
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-recursive
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am:
|
||||
|
||||
.MAKE: $(am__recursive_targets) install-am install-strip
|
||||
|
||||
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \
|
||||
check-am clean clean-generic clean-libtool cscopelist-am ctags \
|
||||
ctags-am distclean distclean-generic distclean-libtool \
|
||||
distclean-tags distdir dvi dvi-am html html-am info info-am \
|
||||
install install-am install-data install-data-am install-dvi \
|
||||
install-dvi-am install-exec install-exec-am install-html \
|
||||
install-html-am install-info install-info-am install-man \
|
||||
install-pdf install-pdf-am install-ps install-ps-am \
|
||||
install-strip installcheck installcheck-am installdirs \
|
||||
installdirs-am maintainer-clean maintainer-clean-generic \
|
||||
mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \
|
||||
ps ps-am tags tags-am uninstall uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
@@ -0,0 +1,13 @@
|
||||
AM_CFLAGS = $(ALTIVEC_CFLAGS)
|
||||
SIMD_HEADER=simd-support/simd-altivec.h
|
||||
|
||||
include $(top_srcdir)/dft/simd/codlist.mk
|
||||
include $(top_srcdir)/dft/simd/simd.mk
|
||||
|
||||
if HAVE_ALTIVEC
|
||||
|
||||
BUILT_SOURCES = $(EXTRA_DIST)
|
||||
noinst_LTLIBRARIES = libdft_altivec_codelets.la
|
||||
libdft_altivec_codelets_la_SOURCES = $(BUILT_SOURCES)
|
||||
|
||||
endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/codlist.c"
|
||||
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/genus.c"
|
||||
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/n1bv_10.c"
|
||||
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/n1bv_11.c"
|
||||
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/n1bv_12.c"
|
||||
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/n1bv_128.c"
|
||||
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/n1bv_13.c"
|
||||
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/n1bv_14.c"
|
||||
@@ -0,0 +1,3 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
#define SIMD_HEADER "simd-support/simd-altivec.h"
|
||||
#include "../common/n1bv_15.c"
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user