This commit is contained in:
2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions
+10
View File
@@ -0,0 +1,10 @@
# Let the sources include project headers relative to the top of the
# source tree (for example "dft/dft.h").
AM_CPPFLAGS = -I $(top_srcdir)

# Subdirectories that are built recursively from this directory.
SUBDIRS = scalar simd

# libdft.la is listed under noinst_, so it is built but not installed.
noinst_LTLIBRARIES = libdft.la

# One file per line; the order is the same as in the original list.
libdft_la_SOURCES = \
    bluestein.c \
    buffered.c \
    conf.c \
    ct.c \
    dftw-direct.c \
    dftw-directsq.c \
    dftw-generic.c \
    dftw-genericbuf.c \
    direct.c \
    generic.c \
    indirect.c \
    indirect-transpose.c \
    kdft-dif.c \
    kdft-difsq.c \
    kdft-dit.c \
    kdft.c \
    nop.c \
    plan.c \
    problem.c \
    rader.c \
    rank-geq2.c \
    solve.c \
    vrank-geq1.c \
    zero.c \
    codelet-dft.h \
    ct.h \
    dft.h
+844
View File
@@ -0,0 +1,844 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
VPATH = @srcdir@
# Shell compound command that succeeds when the running make looks like
# GNU make.  The make variables are expanded into the quoted strings before
# the shell sees them:
#   - MAKELEVEL empty                       -> fail
#   - MAKE_HOST non-empty                   -> succeed
#   - MAKE_VERSION and CURDIR both non-empty -> succeed
#   - anything else                         -> fail
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
# Shell snippet that succeeds when make was invoked with the one-letter
# option named by the shell variable 'target_option' (anything other than a
# single character is reported as an internal error and exits with 1).
# The flag words come from MFLAGS under GNU make and from MAKEFLAGS
# otherwise (with backslash-escaped whitespace removed).  While scanning:
#   - words containing '=' and words starting with '--' are ignored;
#   - for -I, -O and -l the option letter and whatever follows it are cut
#     off the word, and if nothing followed the letter the next word is
#     skipped as the option's argument;
#   - a word that is exactly -d, -E, -D, -m, -J or -T causes the next word
#     to be skipped.
# The first remaining word that contains the target letter sets has_opt=yes.
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
subdir = dft
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
libdft_la_LIBADD =
am_libdft_la_OBJECTS = bluestein.lo buffered.lo conf.lo ct.lo \
dftw-direct.lo dftw-directsq.lo dftw-generic.lo \
dftw-genericbuf.lo direct.lo generic.lo indirect.lo \
indirect-transpose.lo kdft-dif.lo kdft-difsq.lo kdft-dit.lo \
kdft.lo nop.lo plan.lo problem.lo rader.lo rank-geq2.lo \
solve.lo vrank-geq1.lo zero.lo
libdft_la_OBJECTS = $(am_libdft_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__maybe_remake_depfiles = depfiles
am__depfiles_remade = ./$(DEPDIR)/bluestein.Plo \
./$(DEPDIR)/buffered.Plo ./$(DEPDIR)/conf.Plo \
./$(DEPDIR)/ct.Plo ./$(DEPDIR)/dftw-direct.Plo \
./$(DEPDIR)/dftw-directsq.Plo ./$(DEPDIR)/dftw-generic.Plo \
./$(DEPDIR)/dftw-genericbuf.Plo ./$(DEPDIR)/direct.Plo \
./$(DEPDIR)/generic.Plo ./$(DEPDIR)/indirect-transpose.Plo \
./$(DEPDIR)/indirect.Plo ./$(DEPDIR)/kdft-dif.Plo \
./$(DEPDIR)/kdft-difsq.Plo ./$(DEPDIR)/kdft-dit.Plo \
./$(DEPDIR)/kdft.Plo ./$(DEPDIR)/nop.Plo ./$(DEPDIR)/plan.Plo \
./$(DEPDIR)/problem.Plo ./$(DEPDIR)/rader.Plo \
./$(DEPDIR)/rank-geq2.Plo ./$(DEPDIR)/solve.Plo \
./$(DEPDIR)/vrank-geq1.Plo ./$(DEPDIR)/zero.Plo
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
SOURCES = $(libdft_la_SOURCES)
DIST_SOURCES = $(libdft_la_SOURCES)
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
ctags-recursive dvi-recursive html-recursive info-recursive \
install-data-recursive install-dvi-recursive \
install-exec-recursive install-html-recursive \
install-info-recursive install-pdf-recursive \
install-ps-recursive install-recursive installcheck-recursive \
installdirs-recursive pdf-recursive ps-recursive \
tags-recursive uninstall-recursive
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
distclean-recursive maintainer-clean-recursive
am__recursive_targets = \
$(RECURSIVE_TARGETS) \
$(RECURSIVE_CLEAN_TARGETS) \
$(am__extra_recursive_targets)
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
distdir distdir-am
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
DIST_SUBDIRS = $(SUBDIRS)
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
# Shell snippet used by the distdir recipe.  Inputs are the shell variables
# dir1 and dir2; the result is left in the shell variable reldir.
# dir1 is consumed one leading path component at a time:
#   - "."  components are ignored;
#   - ".." pops the last component of the tracked directory (dir0, which
#     starts at `pwd`) and prepends that component to dir2;
#   - any other component is stripped from the front of dir2 when dir2
#     starts with the same component, otherwise "../" is prepended to dir2.
# NOTE(review): judging from its use in distdir-am, reldir is dir2 expressed
# relative to dir1 -- confirm against the automake documentation.
am__relativize = \
dir0=`pwd`; \
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
sed_rest='s,^[^/]*/*,,'; \
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
sed_butlast='s,/*[^/]*$$,,'; \
while test -n "$$dir1"; do \
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
if test "$$first" != "."; then \
if test "$$first" = ".."; then \
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
else \
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
if test "$$first2" = "$$first"; then \
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
else \
dir2="../$$dir2"; \
fi; \
dir0="$$dir0"/"$$first"; \
fi; \
fi; \
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
done; \
reldir="$$dir2"
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
AM_CPPFLAGS = -I $(top_srcdir)
SUBDIRS = scalar simd
noinst_LTLIBRARIES = libdft.la
libdft_la_SOURCES = bluestein.c buffered.c conf.c ct.c dftw-direct.c \
dftw-directsq.c dftw-generic.c dftw-genericbuf.c direct.c generic.c \
indirect.c indirect-transpose.c kdft-dif.c kdft-difsq.c kdft-dit.c \
kdft.c nop.c plan.c problem.c rader.c rank-geq2.c solve.c vrank-geq1.c \
zero.c codelet-dft.h ct.h dft.h
all: all-recursive
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu dft/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
esac;
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
# Remove the libraries named in noinst_LTLIBRARIES.  In addition, for every
# directory that holds one of those libraries ("." for names without a
# slash), remove a file called so_locations from that directory.
clean-noinstLTLIBRARIES:
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
@list='$(noinst_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
libdft.la: $(libdft_la_OBJECTS) $(libdft_la_DEPENDENCIES) $(EXTRA_libdft_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) $(libdft_la_OBJECTS) $(libdft_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bluestein.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/conf.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-direct.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-directsq.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-generic.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dftw-genericbuf.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/generic.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect-transpose.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-dif.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-difsq.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft-dit.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdft.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rader.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/zero.Plo@am__quote@ # am--include-marker
# Create each listed dependency file as a placeholder: make sure its
# directory exists, write the single line '# dummy' to a temporary file
# with a '-t' suffix, then rename that file over the real name.
$(am__depfiles_remade):
@$(MKDIR_P) $(@D)
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
# Umbrella target that depends on all of the placeholder files above.
am--depfiles: $(am__depfiles_remade)
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
# This directory's subdirectories are mostly independent; you can cd
# into them and run 'make' without going through this Makefile.
# To change the values of 'make' variables: instead of editing Makefiles,
# (1) if the variable is set in 'config.status', edit 'config.status'
# (which will cause the Makefiles to be regenerated when you run 'make');
# (2) otherwise, pass the desired values on the 'make' command line.
# Shared recipe for every target listed in am__recursive_targets.
# The real target name is obtained by deleting "-recursive" from the rule
# target.  It is then made in each subdirectory: DIST_SUBDIRS for
# distclean-* and maintainer-clean-*, SUBDIRS for everything else.
# A "." entry means "this directory" and runs the "<target>-am" variant in
# place; if "." never appears in the list, "<target>-am" is run here after
# all subdirectories.
# When make runs with -k a failing subdirectory only sets fail=yes, so the
# loop continues and the recipe fails at the end; otherwise the first
# failure exits immediately.
$(am__recursive_targets):
@fail=; \
if $(am__make_keepgoing); then \
failcom='fail=yes'; \
else \
failcom='exit 1'; \
fi; \
dot_seen=no; \
target=`echo $@ | sed s/-recursive//`; \
case "$@" in \
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
*) list='$(SUBDIRS)' ;; \
esac; \
for subdir in $$list; do \
echo "Making $$target in $$subdir"; \
if test "$$subdir" = "."; then \
dot_seen=yes; \
local_target="$$target-am"; \
else \
local_target="$$target"; \
fi; \
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|| eval $$failcom; \
done; \
if test "$$dot_seen" = "no"; then \
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
fi; test -z "$$fail"
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-recursive
TAGS: tags
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
include_option=--etags-include; \
empty_fix=.; \
else \
include_option=--include; \
empty_fix=; \
fi; \
list='$(SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
test ! -f $$subdir/TAGS || \
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
fi; \
done; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-recursive
CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-recursive
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) distdir-am
distdir-am: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
$(am__make_dryrun) \
|| test -d "$(distdir)/$$subdir" \
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|| exit 1; \
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
$(am__relativize); \
new_distdir=$$reldir; \
dir1=$$subdir; dir2="$(top_distdir)"; \
$(am__relativize); \
new_top_distdir=$$reldir; \
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
($(am__cd) $$subdir && \
$(MAKE) $(AM_MAKEFLAGS) \
top_distdir="$$new_top_distdir" \
distdir="$$new_distdir" \
am__remove_distdir=: \
am__skip_length_check=: \
am__skip_mode_fix=: \
distdir) \
|| exit 1; \
fi; \
done
check-am: all-am
check: check-recursive
all-am: Makefile $(LTLIBRARIES)
installdirs: installdirs-recursive
installdirs-am:
install: install-recursive
install-exec: install-exec-recursive
install-data: install-data-recursive
uninstall: uninstall-recursive
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-recursive
# Re-run "install" with INSTALL_PROGRAM and install_sh_PROGRAM replaced by
# INSTALL_STRIP_PROGRAM and with INSTALL_STRIP_FLAG=-s.  When STRIP is
# non-empty its value is additionally handed down as STRIPPROG through
# INSTALL_PROGRAM_ENV.
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
mostlyclean-generic:
clean-generic:
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
clean: clean-recursive
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
mostlyclean-am
distclean: distclean-recursive
-rm -f ./$(DEPDIR)/bluestein.Plo
-rm -f ./$(DEPDIR)/buffered.Plo
-rm -f ./$(DEPDIR)/conf.Plo
-rm -f ./$(DEPDIR)/ct.Plo
-rm -f ./$(DEPDIR)/dftw-direct.Plo
-rm -f ./$(DEPDIR)/dftw-directsq.Plo
-rm -f ./$(DEPDIR)/dftw-generic.Plo
-rm -f ./$(DEPDIR)/dftw-genericbuf.Plo
-rm -f ./$(DEPDIR)/direct.Plo
-rm -f ./$(DEPDIR)/generic.Plo
-rm -f ./$(DEPDIR)/indirect-transpose.Plo
-rm -f ./$(DEPDIR)/indirect.Plo
-rm -f ./$(DEPDIR)/kdft-dif.Plo
-rm -f ./$(DEPDIR)/kdft-difsq.Plo
-rm -f ./$(DEPDIR)/kdft-dit.Plo
-rm -f ./$(DEPDIR)/kdft.Plo
-rm -f ./$(DEPDIR)/nop.Plo
-rm -f ./$(DEPDIR)/plan.Plo
-rm -f ./$(DEPDIR)/problem.Plo
-rm -f ./$(DEPDIR)/rader.Plo
-rm -f ./$(DEPDIR)/rank-geq2.Plo
-rm -f ./$(DEPDIR)/solve.Plo
-rm -f ./$(DEPDIR)/vrank-geq1.Plo
-rm -f ./$(DEPDIR)/zero.Plo
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
dvi: dvi-recursive
dvi-am:
html: html-recursive
html-am:
info: info-recursive
info-am:
install-data-am:
install-dvi: install-dvi-recursive
install-dvi-am:
install-exec-am:
install-html: install-html-recursive
install-html-am:
install-info: install-info-recursive
install-info-am:
install-man:
install-pdf: install-pdf-recursive
install-pdf-am:
install-ps: install-ps-recursive
install-ps-am:
installcheck-am:
maintainer-clean: maintainer-clean-recursive
-rm -f ./$(DEPDIR)/bluestein.Plo
-rm -f ./$(DEPDIR)/buffered.Plo
-rm -f ./$(DEPDIR)/conf.Plo
-rm -f ./$(DEPDIR)/ct.Plo
-rm -f ./$(DEPDIR)/dftw-direct.Plo
-rm -f ./$(DEPDIR)/dftw-directsq.Plo
-rm -f ./$(DEPDIR)/dftw-generic.Plo
-rm -f ./$(DEPDIR)/dftw-genericbuf.Plo
-rm -f ./$(DEPDIR)/direct.Plo
-rm -f ./$(DEPDIR)/generic.Plo
-rm -f ./$(DEPDIR)/indirect-transpose.Plo
-rm -f ./$(DEPDIR)/indirect.Plo
-rm -f ./$(DEPDIR)/kdft-dif.Plo
-rm -f ./$(DEPDIR)/kdft-difsq.Plo
-rm -f ./$(DEPDIR)/kdft-dit.Plo
-rm -f ./$(DEPDIR)/kdft.Plo
-rm -f ./$(DEPDIR)/nop.Plo
-rm -f ./$(DEPDIR)/plan.Plo
-rm -f ./$(DEPDIR)/problem.Plo
-rm -f ./$(DEPDIR)/rader.Plo
-rm -f ./$(DEPDIR)/rank-geq2.Plo
-rm -f ./$(DEPDIR)/solve.Plo
-rm -f ./$(DEPDIR)/vrank-geq1.Plo
-rm -f ./$(DEPDIR)/zero.Plo
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-recursive
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-recursive
pdf-am:
ps: ps-recursive
ps-am:
uninstall-am:
.MAKE: $(am__recursive_targets) install-am install-strip
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
am--depfiles check check-am clean clean-generic clean-libtool \
clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
distclean-compile distclean-generic distclean-libtool \
distclean-tags distdir dvi dvi-am html html-am info info-am \
install install-am install-data install-data-am install-dvi \
install-dvi-am install-exec install-exec-am install-html \
install-html-am install-info install-info-am install-man \
install-pdf install-pdf-am install-ps install-ps-am \
install-strip installcheck installcheck-am installdirs \
installdirs-am maintainer-clean maintainer-clean-generic \
mostlyclean mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
uninstall-am
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:
+250
View File
@@ -0,0 +1,250 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
/* Solver record for this file: it carries no state beyond the generic
   solver header. */
typedef struct {
solver super;
} S;
/* Plan record for one transform handled by this solver.  w and W are only
   allocated while the plan is awake (see awake() and mktwiddle()). */
typedef struct {
plan_dft super;
INT n; /* problem size */
INT nb; /* size of convolution */
R *w; /* lambda k . exp(2*pi*i*k^2/(2*n)); 2*n reals, interleaved re/im */
R *W; /* DFT(w), where w is first scaled by 1/nb, zero-padded to nb and
         mirrored (entry nb-i duplicates entry i); 2*nb reals */
plan *cldf; /* child plan for the size-nb DFT; applied in place on
               interleaved buffers */
INT is, os; /* input and output strides, copied from the problem's
               single dimension in mkplan() */
} P;
/*
 * Fill w with the n-entry sequence used by this solver.
 *
 * For k = 0 .. n-1 the trig generator (built for period 2*n) is asked for
 * its value at index k^2 reduced modulo 2*n, and stores it at w + 2*k.
 * w must therefore have room for 2*n reals.
 */
static void bluestein_sequence(enum wakefulness wakefulness, INT n, R *w)
{
     const INT period = 2 * n;
     triggen *gen = X(mktriggen)(wakefulness, period);
     INT idx;
     INT sq = 0;  /* idx^2, kept reduced so it never exceeds period */

     for (idx = 0; idx < n; ++idx) {
          gen->cexp(gen, sq, w + 2 * idx);

          /* (idx+1)^2 = idx^2 + 2*idx + 1: advance incrementally and
             reduce right away, so the full square is never formed
             (careful with overflow) */
          sq += 2 * idx + 1;
          while (sq > period)
               sq -= period;
     }

     X(triggen_destroy)(gen);
}
/*
 * Allocate and fill the two tables of plan p:
 *   p->w  the length-n sequence from bluestein_sequence();
 *   p->W  the size-nb child DFT of that sequence after it has been scaled
 *         by 1/nb, zero-padded to nb entries and mirrored so that entry
 *         nb-i equals entry i.
 * The child plan p->cldf must already be awake, because it is applied here.
 */
static void mktwiddle(enum wakefulness wakefulness, P *p)
{
     const INT n = p->n;
     const INT nb = p->nb;
     const E scale = (E)nb;
     R *seq = (R *) MALLOC(2 * n * sizeof(R), TWIDDLES);
     R *kern = (R *) MALLOC(2 * nb * sizeof(R), TWIDDLES);
     plan_dft *child = (plan_dft *)p->cldf;
     INT j;

     p->w = seq;
     p->W = kern;

     bluestein_sequence(wakefulness, n, seq);

     /* start from an all-zero kernel (2*nb reals) */
     for (j = 0; j < 2 * nb; ++j)
          kern[j] = K(0.0);

     /* entry 0 has no mirror image */
     kern[0] = seq[0] / scale;
     kern[1] = seq[1] / scale;

     /* entries 1..n-1 are stored twice: at j and at nb-j */
     for (j = 1; j < n; ++j) {
          const INT mirror = nb - j;
          kern[2*j] = kern[2*mirror] = seq[2*j] / scale;
          kern[2*j+1] = kern[2*mirror+1] = seq[2*j+1] / scale;
     }

     /* transform the kernel in place */
     child->apply(p->cldf, kern, kern+1, kern, kern+1);
}
/*
 * Execute the plan: read n complex inputs from (ri, ii) with stride is and
 * write n complex outputs to (ro, io) with stride os.
 *
 * All work happens in a temporary interleaved buffer of nb complex values,
 * so every input is read before any output is written.
 */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     const INT n = ego->n, nb = ego->nb;
     const INT istride = ego->is, ostride = ego->os;
     const R *seq = ego->w;
     const R *kern = ego->W;
     plan_dft *child = (plan_dft *)ego->cldf;
     R *buf = (R *) MALLOC(2 * nb * sizeof(R), BUFFERS);
     INT j;

     /* buf[j] = x[j] * conj(seq[j]) for the n real inputs ... */
     for (j = 0; j < n; ++j) {
          E xr = ri[j*istride], xi = ii[j*istride];
          E wr = seq[2*j], wi = seq[2*j+1];
          buf[2*j] = xr * wr + xi * wi;
          buf[2*j+1] = xi * wr - xr * wi;
     }

     /* ... and zero for the padding up to nb entries */
     for (j = 2 * n; j < 2 * nb; ++j)
          buf[j] = K(0.0);

     /* convolution, step 1: forward transform of the padded input */
     child->apply(ego->cldf, buf, buf+1, buf, buf+1);

     /* convolution, step 2: pointwise product with the kernel.  The
        result is stored with real and imaginary parts exchanged (slot
        2*j holds the imaginary part), which prepares step 3. */
     for (j = 0; j < nb; ++j) {
          E xr = buf[2*j], xi = buf[2*j+1];
          E wr = kern[2*j], wi = kern[2*j+1];
          buf[2*j] = xi * wr + xr * wi;
          buf[2*j+1] = xr * wr - xi * wi;
     }

     /* convolution, step 3: inverse transform, done by running the same
        forward plan on the swapped data */
     child->apply(ego->cldf, buf, buf+1, buf, buf+1);

     /* undo the swap while reading, multiply by conj(seq[j]) again and
        scatter to the strided output */
     for (j = 0; j < n; ++j) {
          E xi = buf[2*j], xr = buf[2*j+1];
          E wr = seq[2*j], wi = seq[2*j+1];
          ro[j*ostride] = xr * wr + xi * wi;
          io[j*ostride] = xi * wr - xr * wi;
     }

     X(ifree)(buf);
}
/*
 * Change the wakefulness of the plan.  The child plan is switched first.
 * Going to SLEEPY releases both tables and clears the pointers; any other
 * state builds them with mktwiddle(), after checking via A() that w is
 * not already set.
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *ego = (P *) ego_;

     X(plan_awake)(ego->cldf, wakefulness);

     if (wakefulness == SLEEPY) {
          X(ifree0)(ego->w);
          X(ifree0)(ego->W);
          ego->w = 0;
          ego->W = 0;
     } else {
          A(!ego->w);
          mktwiddle(wakefulness, ego);
     }
}
static int applicable(const solver *ego, const problem *p_,
const planner *plnr)
{
const problem_dft *p = (const problem_dft *) p_;
UNUSED(ego);
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
/* FIXME: allow other sizes */
&& X(is_prime)(p->sz->dims[0].n)
/* FIXME: avoid infinite recursion of bluestein with itself.
This works because all factors in child problems are 2, 3, 5 */
&& p->sz->dims[0].n > 16
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > BLUESTEIN_MAX_SLOW)
);
}
/*
 * Plan destructor hook: releases the child plan only.
 * NOTE(review): w and W are not freed here; they are released by awake()
 * on SLEEPY, so presumably a plan is always put to sleep before it is
 * destroyed -- confirm against the planner.
 */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *child = self->cldf;

     X(plan_destroy_internal)(child);
}
/*
 * Plan printer hook: emits the solver tag followed by n, nb and the child
 * plan, using the printer's own format directives.
 */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *)ego_;
     const INT n = self->n;
     const INT nb = self->nb;

     p->print(p, "(dft-bluestein-%D/%D%(%p%))", n, nb, self->cldf);
}
/*
 * Return the smallest size >= minsz that X(factors_into_small_primes)
 * accepts, searching upwards one step at a time.
 */
static INT choose_transform_size(INT minsz)
{
     INT sz;

     for (sz = minsz; !X(factors_into_small_primes)(sz); ++sz)
          ;  /* keep counting up */

     return sz;
}
/*
 * Build a bluestein plan for p_, or return a null plan when the problem
 * is not applicable or no child plan can be found.
 *
 * The child is an in-place, interleaved (stride 2) transform of length
 * nb, where nb is the size picked by choose_transform_size(2n - 1).
 * A scratch buffer is allocated only so that the child problem has
 * concrete pointers during planning; it is freed before returning.
 */
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const problem_dft *prob = (const problem_dft *) p_;
     const iodim *dim;
     P *pln;
     plan *child = 0;
     R *scratch = (R *) 0;
     INT n, nb;

     if (!applicable(ego, p_, plnr))
          return (plan *) 0;

     dim = prob->sz->dims;
     n = dim[0].n;
     nb = choose_transform_size(2 * n - 1);

     /* temporary buffer, used for planning only */
     scratch = (R *) MALLOC(2 * nb * sizeof(R), BUFFERS);

     child = X(mkplan_f_d)(plnr,
                           X(mkproblem_dft_d)(X(mktensor_1d)(nb, 2, 2),
                                              X(mktensor_1d)(1, 0, 0),
                                              scratch, scratch + 1,
                                              scratch, scratch + 1),
                           NO_SLOW, 0, 0);
     if (!child) {
          /* planning failed: undo everything */
          X(ifree0)(scratch);
          X(plan_destroy_internal)(child);
          return (plan *) 0;
     }

     X(ifree)(scratch);

     pln = MKPLAN_DFT(P, &padt, apply);
     pln->cldf = child;
     pln->n = n;
     pln->nb = nb;
     pln->is = dim[0].is;
     pln->os = dim[0].os;
     pln->w = 0;
     pln->W = 0;

     /* the child plan is counted twice, plus the pointwise passes */
     X(ops_add)(&child->ops, &child->ops, &pln->super.super.ops);
     pln->super.super.ops.add += 4 * n + 2 * nb;
     pln->super.super.ops.mul += 8 * n + 4 * nb;
     pln->super.super.ops.other += 6 * (n + nb);

     return &(pln->super.super);
}
/* Allocate a solver of type S bound to this file's mkplan(). */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *self;

     self = MKSOLVER(S, &sadt);
     return &(self->super);
}
/* Create the bluestein solver and register it with planner p. */
void X(dft_bluestein_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
+284
View File
@@ -0,0 +1,284 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
/* Solver state.  One solver is registered for each entry of maxnbufs[]
   (see X(dft_buffered_register)). */
typedef struct {
solver super;
size_t maxnbuf_ndx; /* index into maxnbufs[] used by this solver */
} S;
/* Candidate limits; the selected entry is passed to X(nbuf)() as its
   third argument. */
static const INT maxnbufs[] = { 8, 256 };
/* Plan state for the buffered DFT solver. */
typedef struct {
plan_dft super;
/* cld: transforms a batch from the input into the buffer;
   cldcpy: copies the buffer to the output;
   cldrest: applied once after the batched loop to the leftover vectors */
plan *cld, *cldcpy, *cldrest;
/* n: transform size; vl: vector length; nbuf: vectors per batch;
   bufdist: value of X(bufdist)(n, vl), used to size and stride the buffer */
INT n, vl, nbuf, bufdist;
/* pointer increments per batch: ivs * nbuf and ovs * nbuf */
INT ivs_by_nbuf, ovs_by_nbuf;
/* offsets (0 or 1) of the real and imaginary parts within the buffer */
INT roffset, ioffset;
} P;
/*
 * Execute the buffered plan: process the vector loop in batches of nbuf
 * transforms, each batch going input -> buffer (cld) and then
 * buffer -> output (cldcpy).  The buffer is released and the trailing
 * vectors are handled by cldrest directly on the advanced pointers.
 */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     const INT nbuf = self->nbuf, vl = self->vl;
     const INT istep = self->ivs_by_nbuf, ostep = self->ovs_by_nbuf;
     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * self->bufdist * 2, BUFFERS);
     R *bufr = bufs + self->roffset;
     R *bufi = bufs + self->ioffset;
     plan_dft *xform = (plan_dft *) self->cld;
     plan_dft *copy = (plan_dft *) self->cldcpy;
     plan_dft *rest = (plan_dft *) self->cldrest;
     INT done = nbuf;

     while (done <= vl) {
          /* transform to bufs: */
          xform->apply((plan *) xform, ri, ii, bufr, bufi);
          ri += istep;
          ii += istep;

          /* copy back */
          copy->apply((plan *) copy, bufr, bufi, ro, io);
          ro += ostep;
          io += ostep;

          done += nbuf;
     }

     X(ifree)(bufs);

     /* Do the remaining transforms, if any: */
     rest->apply((plan *) rest, ri, ii, ro, io);
}
/* Forward the wakefulness change to the three child plans, in the order
   cld, cldcpy, cldrest. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *children[3];
     int k;

     children[0] = self->cld;
     children[1] = self->cldcpy;
     children[2] = self->cldrest;

     for (k = 0; k < 3; ++k)
          X(plan_awake)(children[k], wakefulness);
}
/* Destroy the three child plans, in the order cldrest, cldcpy, cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *children[3];
     int k;

     children[0] = self->cldrest;
     children[1] = self->cldcpy;
     children[2] = self->cld;

     for (k = 0; k < 3; ++k)
          X(plan_destroy_internal)(children[k]);
}
/* Emit the plan description: n, nbuf, vl, bufdist modulo n, followed by
   the three child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;

     p->print(p,
              "(dft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
              self->n,
              self->nbuf,
              self->vl,
              self->bufdist % self->n,
              self->cld,
              self->cldcpy,
              self->cldrest);
}
/*
 * Structural applicability test for the buffered solver.
 * Requires a rank-1 transform with a vector loop of rank <= 1, then
 * applies the memory, redundancy and in-place/out-of-place checks below.
 * Returns nonzero when the solver may be used.
 */
static int applicable0(const S *ego, const problem *p_, const planner *plnr)
{
     const problem_dft *p = (const problem_dft *) p_;
     const iodim *d = p->sz->dims;
     INT vl, ivs, ovs;

     if (p->vecsz->rnk > 1 || p->sz->rnk != 1)
          return 0;

     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);

     if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
          return 0;

     /* if this solver is redundant, in the sense that a solver
        of lower index generates the same plan, then prune this
        solver */
     if (X(nbuf_redundant)(d[0].n, vl,
                           ego->maxnbuf_ndx,
                           maxnbufs, NELEM(maxnbufs)))
          return 0;

     /*
       In principle, the buffered transforms might be useful
       when working out of place.  However, in order to
       prevent infinite loops in the planner, we require
       that the output stride of the buffered transforms be
       greater than 2.
     */
     if (p->ri != p->ro)
          return (d[0].os > 2);

     /*
      * If the problem is in place, the input/output strides must
      * be the same or the whole thing must fit in the buffer.
      */
     if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
          return 1;

     /* fits into buffer: no vector loop at all ... */
     if (p->vecsz->rnk == 0)
          return 1;

     /* ... or a single batch covers the whole vector loop */
     return (X(nbuf)(d[0].n, p->vecsz->dims[0].n,
                     maxnbufs[ego->maxnbuf_ndx])
             == p->vecsz->dims[0].n);
}
/*
 * Full applicability test: planner flags on top of applicable0().
 * Rejected when buffering is disabled; under NO_UGLY the problem must
 * additionally be in place (ri == ro) and not X(toobig).
 */
static int applicable(const S *ego, const problem *p_, const planner *plnr)
{
     const problem_dft *p;

     if (NO_BUFFERINGP(plnr))
          return 0;
     if (!applicable0(ego, p_, plnr))
          return 0;
     if (!NO_UGLYP(plnr))
          return 1;

     p = (const problem_dft *) p_;
     return (p->ri == p->ro && !X(toobig)(p->sz->dims[0].n));
}
/*
 * Build a buffered plan for p_, or return a null plan.
 *
 * Three child plans are created:
 *   cld     - nbuf transforms of size n from the input into a buffer
 *   cldcpy  - rank-0 copy of the buffer contents to the output
 *   cldrest - the (vl % nbuf) transforms that do not fill a whole batch
 * On any failure control jumps to `nada', which releases the planning
 * buffer and whichever children were already created.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
P *pln;
const S *ego = (const S *)ego_;
plan *cld = (plan *) 0;
plan *cldcpy = (plan *) 0;
plan *cldrest = (plan *) 0;
const problem_dft *p = (const problem_dft *) p_;
R *bufs = (R *) 0;
INT nbuf = 0, bufdist, n, vl;
INT ivs, ovs, roffset, ioffset;
static const plan_adt padt = {
X(dft_solve), awake, print, destroy
};
if (!applicable(ego, p_, plnr))
goto nada;
n = X(tensor_sz)(p->sz);
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
bufdist = X(bufdist)(n, vl);
A(nbuf > 0);
/* attempt to keep real and imaginary part in the same order,
so as to allow optimizations in the copy plan */
roffset = (p->ri - p->ii > 0) ? (INT)1 : (INT)0;
ioffset = 1 - roffset;
/* initial allocation for the purpose of planning */
bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist * 2, BUFFERS);
/* allow destruction of input if problem is in place */
cld = X(mkplan_f_d)(plnr,
X(mkproblem_dft_d)(
X(mktensor_1d)(n, p->sz->dims[0].is, 2),
X(mktensor_1d)(nbuf, ivs, bufdist * 2),
TAINT(p->ri, ivs * nbuf),
TAINT(p->ii, ivs * nbuf),
bufs + roffset,
bufs + ioffset),
0, 0, (p->ri == p->ro) ? NO_DESTROY_INPUT : 0);
if (!cld)
goto nada;
/* copying back from the buffer is a rank-0 transform: */
cldcpy = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(
X(mktensor_0d)(),
X(mktensor_2d)(nbuf, bufdist * 2, ovs,
n, 2, p->sz->dims[0].os),
bufs + roffset,
bufs + ioffset,
TAINT(p->ro, ovs * nbuf),
TAINT(p->io, ovs * nbuf)));
if (!cldcpy)
goto nada;
/* deallocate buffers, let apply() allocate them for real */
X(ifree)(bufs);
bufs = 0;
/* plan the leftover transforms (cldrest): */
{
/* id/od: input/output offsets of the first vector not covered by a
   full batch, i.e. past nbuf * (vl / nbuf) vectors */
INT id = ivs * (nbuf * (vl / nbuf));
INT od = ovs * (nbuf * (vl / nbuf));
cldrest = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(
X(tensor_copy)(p->sz),
X(mktensor_1d)(vl % nbuf, ivs, ovs),
p->ri+id, p->ii+id, p->ro+od, p->io+od));
}
if (!cldrest)
goto nada;
pln = MKPLAN_DFT(P, &padt, apply);
pln->cld = cld;
pln->cldcpy = cldcpy;
pln->cldrest = cldrest;
pln->n = n;
pln->vl = vl;
pln->ivs_by_nbuf = ivs * nbuf;
pln->ovs_by_nbuf = ovs * nbuf;
pln->roffset = roffset;
pln->ioffset = ioffset;
pln->nbuf = nbuf;
pln->bufdist = bufdist;
{
/* total cost: (cld + cldcpy) per full batch, times the number of
   full batches, plus cldrest once */
opcnt t;
X(ops_add)(&cld->ops, &cldcpy->ops, &t);
X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
}
return &(pln->super.super);
nada:
/* failure path: bufs and the children may each still be null here */
X(ifree0)(bufs);
X(plan_destroy_internal)(cldrest);
X(plan_destroy_internal)(cldcpy);
X(plan_destroy_internal)(cld);
return (plan *) 0;
}
/* Allocate a buffered solver that uses entry maxnbuf_ndx of maxnbufs[]. */
static solver *mksolver(size_t maxnbuf_ndx)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *self;

     self = MKSOLVER(S, &sadt);
     self->maxnbuf_ndx = maxnbuf_ndx;
     return &(self->super);
}
/* Register one buffered solver per entry of maxnbufs[], in index order. */
void X(dft_buffered_register)(planner *p)
{
     size_t ndx = 0;

     while (ndx < NELEM(maxnbufs)) {
          REGISTER_SOLVER(p, mksolver(ndx));
          ++ndx;
     }
}
+112
View File
@@ -0,0 +1,112 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/*
* This header file must include every file or define every
* type or macro which is required to compile a codelet.
*/
#ifndef __DFT_CODELET_H__
#define __DFT_CODELET_H__
#include "kernel/ifftw.h"
/**************************************************************
* types of codelets
**************************************************************/
/* DFT codelets */
typedef struct kdft_desc_s kdft_desc;
/* Properties shared by a family ("genus") of DFT codelets. */
typedef struct {
/* applicability predicate: given the codelet descriptor, the data
   pointers, strides and vector-loop parameters, and the planner,
   returns nonzero if the codelet may be used */
int (*okp)(
const kdft_desc *desc,
const R *ri, const R *ii, const R *ro, const R *io,
INT is, INT os, INT vl, INT ivs, INT ovs,
const planner *plnr);
INT vl; /* presumably the vector length handled per call -- TODO confirm */
} kdft_genus;
/* Static descriptor attached to each DFT codelet. */
struct kdft_desc_s {
INT sz; /* size of transform computed */
const char *nam; /* codelet name */
opcnt ops; /* operation counts of the codelet */
const kdft_genus *genus;
/* NOTE(review): is/os/ivs/ovs look like fixed stride requirements of
   the codelet; their exact meaning is not visible here -- confirm */
INT is;
INT os;
INT ivs;
INT ovs;
};
/* Signature of a DFT codelet: split real/imaginary input (ri, ii) and
   output (ro, io), strides is/os, and a vector loop (vl, ivs, ovs). */
typedef void (*kdft) (const R *ri, const R *ii, R *ro, R *io,
stride is, stride os, INT vl, INT ivs, INT ovs);
void X(kdft_register)(planner *p, kdft codelet, const kdft_desc *desc);
/* Twiddle (Cooley-Tukey) codelets */
typedef struct ct_desc_s ct_desc;
/* Properties shared by a family ("genus") of twiddle codelets. */
typedef struct {
/* applicability predicate: given the descriptor, the in-place data
   pointers (rio, iio), strides rs/vs/ms, the size m, the index range
   [mb, me) and the planner, returns nonzero if the codelet may be used */
int (*okp)(
const struct ct_desc_s *desc,
const R *rio, const R *iio,
INT rs, INT vs, INT m, INT mb, INT me, INT ms,
const planner *plnr);
INT vl; /* used as a divisor of the iteration count when counting ops */
} ct_genus;
/* Static descriptor attached to each twiddle codelet. */
struct ct_desc_s {
INT radix; /* radix implemented by the codelet */
const char *nam; /* codelet name */
const tw_instr *tw; /* twiddle instructions, passed to X(twiddle_awake) */
const ct_genus *genus;
opcnt ops; /* operation counts of the codelet */
/* NOTE(review): rs/vs/ms look like fixed stride requirements of the
   codelet; their exact meaning is not visible here -- confirm */
INT rs;
INT vs;
INT ms;
};
/* Signature of an in-place twiddle codelet operating on (rioarray,
   iioarray) with twiddle table W over the index range [mb, me). */
typedef void (*kdftw) (R *rioarray, R *iioarray, const R *W,
stride ios, INT mb, INT me, INT ms);
void X(kdft_dit_register)(planner *p, kdftw codelet, const ct_desc *desc);
void X(kdft_dif_register)(planner *p, kdftw codelet, const ct_desc *desc);
/* Signature of the "sq" twiddle codelet variant, which takes two strides
   (is, vs) instead of one. */
typedef void (*kdftwsq) (R *rioarray, R *iioarray,
const R *W, stride is, stride vs,
INT mb, INT me, INT ms);
void X(kdft_difsq_register)(planner *p, kdftwsq codelet, const ct_desc *desc);
/* Solver tables defined elsewhere: the standard table plus one table per
   SIMD flavour.  X(dft_conf_standard) executes the ones enabled at
   build time. */
extern const solvtab X(solvtab_dft_standard);
extern const solvtab X(solvtab_dft_sse2);
extern const solvtab X(solvtab_dft_avx);
extern const solvtab X(solvtab_dft_avx_128_fma);
extern const solvtab X(solvtab_dft_avx2);
extern const solvtab X(solvtab_dft_avx2_128);
extern const solvtab X(solvtab_dft_avx512);
extern const solvtab X(solvtab_dft_kcvi);
extern const solvtab X(solvtab_dft_altivec);
extern const solvtab X(solvtab_dft_vsx);
extern const solvtab X(solvtab_dft_neon);
extern const solvtab X(solvtab_dft_generic_simd128);
extern const solvtab X(solvtab_dft_generic_simd256);
#endif /* __DFT_CODELET_H__ */
+88
View File
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
/* Table of the non-codelet DFT solvers; executed first by
   X(dft_conf_standard), in the order listed. */
static const solvtab s =
{
SOLVTAB(X(dft_indirect_register)),
SOLVTAB(X(dft_indirect_transpose_register)),
SOLVTAB(X(dft_rank_geq2_register)),
SOLVTAB(X(dft_vrank_geq1_register)),
SOLVTAB(X(dft_buffered_register)),
SOLVTAB(X(dft_generic_register)),
SOLVTAB(X(dft_rader_register)),
SOLVTAB(X(dft_bluestein_register)),
SOLVTAB(X(dft_nop_register)),
SOLVTAB(X(ct_generic_register)),
SOLVTAB(X(ct_genericbuf_register)),
SOLVTAB_END
};
/*
 * Install the standard set of DFT solvers into planner p.
 *
 * Order: the local table `s', then the standard codelet table, then one
 * table per SIMD flavour.  Each SIMD table is compiled in only when the
 * matching HAVE_* macro is set, and (except for the two generic SIMD
 * tables) executed only when the corresponding X(have_simd_*)() check
 * succeeds at run time.
 */
void X(dft_conf_standard)(planner *p)
{
X(solvtab_exec)(s, p);
X(solvtab_exec)(X(solvtab_dft_standard), p);
#if HAVE_SSE2
if (X(have_simd_sse2)())
X(solvtab_exec)(X(solvtab_dft_sse2), p);
#endif
#if HAVE_AVX
if (X(have_simd_avx)())
X(solvtab_exec)(X(solvtab_dft_avx), p);
#endif
#if HAVE_AVX_128_FMA
if (X(have_simd_avx_128_fma)())
X(solvtab_exec)(X(solvtab_dft_avx_128_fma), p);
#endif
#if HAVE_AVX2
/* HAVE_AVX2 covers both the avx2 and the avx2_128 tables, each with its
   own run-time check */
if (X(have_simd_avx2)())
X(solvtab_exec)(X(solvtab_dft_avx2), p);
if (X(have_simd_avx2_128)())
X(solvtab_exec)(X(solvtab_dft_avx2_128), p);
#endif
#if HAVE_AVX512
if (X(have_simd_avx512)())
X(solvtab_exec)(X(solvtab_dft_avx512), p);
#endif
#if HAVE_KCVI
if (X(have_simd_kcvi)())
X(solvtab_exec)(X(solvtab_dft_kcvi), p);
#endif
#if HAVE_ALTIVEC
if (X(have_simd_altivec)())
X(solvtab_exec)(X(solvtab_dft_altivec), p);
#endif
#if HAVE_VSX
if (X(have_simd_vsx)())
X(solvtab_exec)(X(solvtab_dft_vsx), p);
#endif
#if HAVE_NEON
if (X(have_simd_neon)())
X(solvtab_exec)(X(solvtab_dft_neon), p);
#endif
/* the generic SIMD tables are executed unconditionally when compiled in */
#if HAVE_GENERIC_SIMD128
X(solvtab_exec)(X(solvtab_dft_generic_simd128), p);
#endif
#if HAVE_GENERIC_SIMD256
X(solvtab_exec)(X(solvtab_dft_generic_simd256), p);
#endif
}
+255
View File
@@ -0,0 +1,255 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/ct.h"
/* Optional alternative solver constructor with the same parameter list
   as X(mksolver_ct); null unless something installs it. */
ct_solver *(*X(mksolver_ct_hook))(size_t, INT, int,
ct_mkinferior, ct_force_vrecursion) = 0;
/* Plan state for a Cooley-Tukey step. */
typedef struct {
plan_dft super;
plan *cld; /* child plan_dft, applied from (ri, ii) to (ro, io) */
plan *cldw; /* child plan_dftw (twiddle step), applied in place */
INT r; /* radix chosen by X(choose_radix) */
} P;
/* Decimation in time: run the child DFT from input to output first,
   then the twiddle step in place on the output arrays. */
static void apply_dit(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     plan_dft *child = (plan_dft *) self->cld;
     plan_dftw *tw = (plan_dftw *) self->cldw;

     child->apply(self->cld, ri, ii, ro, io);
     tw->apply(self->cldw, ro, io);
}
/* Decimation in frequency: run the twiddle step in place on the input
   arrays first, then the child DFT from input to output. */
static void apply_dif(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     plan_dftw *tw = (plan_dftw *) self->cldw;
     plan_dft *child = (plan_dft *) self->cld;

     tw->apply(self->cldw, ri, ii);
     child->apply(self->cld, ri, ii, ro, io);
}
/* Forward the wakefulness change to cld, then to cldw. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *children[2];
     int k;

     children[0] = self->cld;
     children[1] = self->cldw;

     for (k = 0; k < 2; ++k)
          X(plan_awake)(children[k], wakefulness);
}
/* Destroy cldw, then cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *children[2];
     int k;

     children[0] = self->cldw;
     children[1] = self->cld;

     for (k = 0; k < 2; ++k)
          X(plan_destroy_internal)(children[k]);
}
/* Emit the plan description: "dit" or "dif" (decided by which apply
   function the plan carries), the radix, then cldw and cld. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const char *kind;

     if (self->super.apply == apply_dit)
          kind = "dit";
     else
          kind = "dif";

     p->print(p, "(dft-ct-%s/%D%(%p%)%(%p%))",
              kind, self->r, self->cldw, self->cld);
}
/*
 * Structural applicability test for a Cooley-Tukey solver.
 * Requires a rank-1 transform with vector rank <= 1, a decomposition
 * that is allowed to touch the input, and a radix r chosen by
 * X(choose_radix) with 1 < r < n.
 */
static int applicable0(const ct_solver *ego, const problem *p_, planner *plnr)
{
     const problem_dft *p = (const problem_dft *) p_;
     INT n, r;

     if (p->sz->rnk != 1)
          return 0;
     if (p->vecsz->rnk > 1)
          return 0;

     /* DIF destroys the input and we don't like it */
     if (ego->dec != DECDIT
         && p->ri != p->ro
         && NO_DESTROY_INPUTP(plnr))
          return 0;

     n = p->sz->dims[0].n;
     r = X(choose_radix)(ego->r, n);
     return (r > 1 && n > r);
}
/*
 * Full applicability test: applicable0() plus the vector-recursion rule.
 * A problem that passes applicable0() is accepted if any of these hold:
 * the solver is DIF+TRANSPOSE, there is no vector loop, the planner does
 * not forbid vector recursion, or the solver's force_vrecursionp
 * callback (if any) returns nonzero.
 */
int X(ct_applicable)(const ct_solver *ego, const problem *p_, planner *plnr)
{
     const problem_dft *p;

     if (!applicable0(ego, p_, plnr))
          return 0;

     p = (const problem_dft *) p_;

     if (ego->dec == DECDIF+TRANSPOSE)
          return 1;
     if (p->vecsz->rnk == 0)
          return 1;
     if (!NO_VRECURSEP(plnr))
          return 1;
     if (ego->force_vrecursionp && ego->force_vrecursionp(ego, p))
          return 1;

     return 0;
}
/*
 * Build a Cooley-Tukey plan for p_, or return a null plan.
 *
 * The size n is split as n = r * m with r = X(choose_radix)(ego->r, n).
 * Two children are created: cldw, the twiddle step produced by the
 * solver's mkcldw callback, and cld, an ordinary DFT sub-plan.  For
 * DECDIT the twiddle step works on the output arrays; for DECDIF (with
 * or without TRANSPOSE) it works on the input arrays.  On failure
 * control jumps to `nada', which destroys whatever was created.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const ct_solver *ego = (const ct_solver *) ego_;
const problem_dft *p;
P *pln = 0;
plan *cld = 0, *cldw = 0;
INT n, r, m, v, ivs, ovs;
iodim *d;
static const plan_adt padt = {
X(dft_solve), awake, print, destroy
};
if ((NO_NONTHREADEDP(plnr)) || !X(ct_applicable)(ego, p_, plnr))
return (plan *) 0;
p = (const problem_dft *) p_;
d = p->sz->dims;
n = d[0].n;
r = X(choose_radix)(ego->r, n);
m = n / r;
X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);
switch (ego->dec) {
case DECDIT:
{
/* twiddle step: in place on (ro, io), using output strides only */
cldw = ego->mkcldw(ego,
r, m * d[0].os, m * d[0].os,
m, d[0].os,
v, ovs, ovs,
0, m,
p->ro, p->io, plnr);
if (!cldw) goto nada;
/* size-m transforms from input to output, looped over r and v */
cld = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(
X(mktensor_1d)(m, r * d[0].is, d[0].os),
X(mktensor_2d)(r, d[0].is, m * d[0].os,
v, ivs, ovs),
p->ri, p->ii, p->ro, p->io)
);
if (!cld) goto nada;
pln = MKPLAN_DFT(P, &padt, apply_dit);
break;
}
case DECDIF:
case DECDIF+TRANSPOSE:
{
INT cors, covs; /* cldw ors, ovs */
if (ego->dec == DECDIF+TRANSPOSE) {
cors = ivs;
covs = m * d[0].is;
/* ensure that we generate well-formed dftw subproblems */
/* FIXME: too conservative */
if (!(1
&& r == v
&& d[0].is == r * cors))
goto nada;
/* FIXME: allow in-place only for now, like in
fftw-3.[01] */
if (!(1
&& p->ri == p->ro
&& d[0].is == r * d[0].os
&& cors == d[0].os
&& covs == ovs
))
goto nada;
} else {
cors = m * d[0].is;
covs = ivs;
}
/* twiddle step: on (ri, ii), input strides in, cors/covs out */
cldw = ego->mkcldw(ego,
r, m * d[0].is, cors,
m, d[0].is,
v, ivs, covs,
0, m,
p->ri, p->ii, plnr);
if (!cldw) goto nada;
/* size-m transforms from input to output, looped over r and v */
cld = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(
X(mktensor_1d)(m, d[0].is, r * d[0].os),
X(mktensor_2d)(r, cors, d[0].os,
v, covs, ovs),
p->ri, p->ii, p->ro, p->io)
);
if (!cld) goto nada;
pln = MKPLAN_DFT(P, &padt, apply_dif);
break;
}
/* NOTE(review): pln stays null on this path; the code below relies on
   A(0) not returning -- confirm for builds where A() is disabled */
default: A(0);
}
pln->cld = cld;
pln->cldw = cldw;
pln->r = r;
X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);
/* inherit could_prune_now_p attribute from cldw */
pln->super.super.could_prune_now_p = cldw->could_prune_now_p;
return &(pln->super.super);
nada:
/* failure path: either child may still be null here */
X(plan_destroy_internal)(cldw);
X(plan_destroy_internal)(cld);
return (plan *) 0;
}
/*
 * Allocate a Cooley-Tukey solver of `size' bytes (so callers can embed
 * ct_solver in a larger struct) and fill in its radix, decomposition
 * kind and the two callbacks.
 */
ct_solver *X(mksolver_ct)(size_t size, INT r, int dec,
                          ct_mkinferior mkcldw,
                          ct_force_vrecursion force_vrecursionp)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     ct_solver *self;

     self = (ct_solver *)X(mksolver)(size, &sadt);
     self->mkcldw = mkcldw;
     self->force_vrecursionp = force_vrecursionp;
     self->dec = dec;
     self->r = r;
     return self;
}
/* Allocate a plan of `size' bytes through X(mkplan), install `apply' as
   its dftw entry point, and return it as a generic plan. */
plan *X(mkplan_dftw)(size_t size, const plan_adt *adt, dftwapply apply)
{
     plan_dftw *pln = (plan_dftw *) X(mkplan)(size, adt);

     pln->apply = apply;
     return &(pln->super);
}
+68
View File
@@ -0,0 +1,68 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
/* Entry point of a twiddle plan: operates in place on (rio, iio). */
typedef void (*dftwapply)(const plan *ego, R *rio, R *iio);
typedef struct ct_solver_s ct_solver;
/* Callback that builds the twiddle child plan ("cldw") for a
   Cooley-Tukey step; returns a null plan when it cannot. */
typedef plan *(*ct_mkinferior)(const ct_solver *ego,
INT r, INT irs, INT ors,
INT m, INT ms,
INT v, INT ivs, INT ovs,
INT mstart, INT mcount,
R *rio, R *iio, planner *plnr);
/* Optional predicate consulted by X(ct_applicable); a nonzero result
   makes the solver applicable regardless of the vector-recursion rule. */
typedef int (*ct_force_vrecursion)(const ct_solver *ego,
const problem_dft *p);
/* Base type of twiddle plans: a generic plan plus its apply function. */
typedef struct {
plan super;
dftwapply apply;
} plan_dftw;
extern plan *X(mkplan_dftw)(size_t size, const plan_adt *adt, dftwapply apply);
/* Allocate a plan of the given struct type via X(mkplan_dftw).  The
   expansion is a bare cast expression: parenthesize it before applying
   a postfix operator such as -> to the result. */
#define MKPLAN_DFTW(type, adt, apply) \
(type *)X(mkplan_dftw)(sizeof(type), adt, apply)
/* Base type of Cooley-Tukey solvers. */
struct ct_solver_s {
solver super;
INT r; /* radix request, passed to X(choose_radix) together with n */
int dec; /* decomposition: DECDIF, DECDIT, or DECDIF+TRANSPOSE */
# define DECDIF 0
# define DECDIT 1
# define TRANSPOSE 2
ct_mkinferior mkcldw; /* builds the twiddle child plan */
ct_force_vrecursion force_vrecursionp; /* may be null */
};
/* Returns nonzero if the Cooley-Tukey solver can handle the problem. */
int X(ct_applicable)(const ct_solver *, const problem *, planner *);
/* Allocates a ct_solver of `size' bytes and fills in the given fields. */
ct_solver *X(mksolver_ct)(size_t size, INT r, int dec,
ct_mkinferior mkcldw,
ct_force_vrecursion force_vrecursionp);
/* Optional alternative constructor with the same parameters as
   X(mksolver_ct); may be null. */
extern ct_solver *(*X(mksolver_ct_hook))(size_t, INT, int,
ct_mkinferior, ct_force_vrecursion);
/* Registration helpers for twiddle-codelet based solvers. */
void X(regsolver_ct_directw)(planner *plnr,
kdftw codelet, const ct_desc *desc, int dec);
void X(regsolver_ct_directwbuf)(planner *plnr,
kdftw codelet, const ct_desc *desc, int dec);
solver *X(mksolver_ctsq)(kdftwsq codelet, const ct_desc *desc, int dec);
void X(regsolver_ct_directwsq)(planner *plnr, kdftwsq codelet,
const ct_desc *desc, int dec);
+88
View File
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef __DFT_H__
#define __DFT_H__
#include "kernel/ifftw.h"
#include "dft/codelet-dft.h"
#ifdef __cplusplus
extern "C"
{
#endif /* __cplusplus */
/* problem.c: */
/* A complex DFT problem in split format. */
typedef struct {
problem super;
/* sz: dimensions of the transform itself;
   vecsz: dimensions of the vector loop around it */
tensor *sz, *vecsz;
/* real/imaginary input (ri, ii) and real/imaginary output (ro, io) */
R *ri, *ii, *ro, *io;
} problem_dft;
void X(dft_zerotens)(tensor *sz, R *ri, R *ii);
/* Two constructors with the same data arguments; the _d variant takes
   non-const tensors (presumably it takes ownership of them -- TODO
   confirm against problem.c). */
problem *X(mkproblem_dft)(const tensor *sz, const tensor *vecsz,
R *ri, R *ii, R *ro, R *io);
problem *X(mkproblem_dft_d)(tensor *sz, tensor *vecsz,
R *ri, R *ii, R *ro, R *io);
/* solve.c: */
/* Generic solve entry installed in the plan_adt of every DFT plan. */
void X(dft_solve)(const plan *ego_, const problem *p_);
/* plan.c: */
/* Entry point of a DFT plan: split real/imaginary input and output. */
typedef void (*dftapply) (const plan *ego, R *ri, R *ii, R *ro, R *io);
/* Base type of DFT plans: a generic plan plus its apply function. */
typedef struct {
plan super;
dftapply apply;
} plan_dft;
plan *X(mkplan_dft)(size_t size, const plan_adt *adt, dftapply apply);
/* Allocate a plan of the given struct type via X(mkplan_dft).  The
   expansion is a bare cast expression: parenthesize it before applying
   a postfix operator such as -> to the result. */
#define MKPLAN_DFT(type, adt, apply) \
(type *)X(mkplan_dft)(sizeof(type), adt, apply)
/* various solvers */
/* constructors for solvers wrapping a single DFT codelet */
solver *X(mksolver_dft_direct)(kdft k, const kdft_desc *desc);
solver *X(mksolver_dft_directbuf)(kdft k, const kdft_desc *desc);
/* registration functions: each adds its solver(s) to planner p */
void X(dft_rank0_register)(planner *p);
void X(dft_rank_geq2_register)(planner *p);
void X(dft_indirect_register)(planner *p);
void X(dft_indirect_transpose_register)(planner *p);
void X(dft_vrank_geq1_register)(planner *p);
void X(dft_vrank2_transpose_register)(planner *p);
void X(dft_vrank3_transpose_register)(planner *p);
void X(dft_buffered_register)(planner *p);
void X(dft_generic_register)(planner *p);
void X(dft_rader_register)(planner *p);
void X(dft_bluestein_register)(planner *p);
void X(dft_nop_register)(planner *p);
void X(ct_generic_register)(planner *p);
void X(ct_genericbuf_register)(planner *p);
/* configurations */
/* installs the standard set of DFT solvers into planner p */
void X(dft_conf_standard)(planner *p);
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* __DFT_H__ */
+332
View File
@@ -0,0 +1,332 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/ct.h"
/* Solver state: a Cooley-Tukey solver bound to one twiddle codelet. */
typedef struct {
ct_solver super;
const ct_desc *desc; /* descriptor of the codelet */
int bufferedp; /* nonzero: use the buffered execution path */
kdftw k; /* the codelet itself */
} S;
/* Plan state for a direct twiddle step. */
typedef struct {
plan_dftw super;
kdftw k; /* codelet, copied from the solver */
INT r; /* radix */
stride rs; /* stride object built from (r, irs) */
/* m, ms: size and stride of the m loop; v, vs: vector loop count and
   stride; [mb, me): index range handled by this plan; extra_iter: 0 or
   1, added to m when the twiddle table is requested */
INT m, ms, v, vs, mb, me, extra_iter;
stride brs; /* stride object for the batch buffer (buffered path) */
twid *td; /* twiddle table, managed by X(twiddle_awake) */
const S *slv; /* solver that created this plan */
} P;
/*************************************************************
Nonbuffered code
*************************************************************/
/* Unbuffered execution: call the codelet once per vector, over the index
   range [mb, me), advancing both arrays by vs between vectors. */
static void apply(const plan *ego_, R *rio, R *iio)
{
     const P *self = (const P *) ego_;
     INT iv;
     ASSERT_ALIGNED_DOUBLE;

     iv = 0;
     while (iv < self->v) {
          const INT first = self->mb, step = self->ms;
          const INT off = first * step;

          self->k(rio + off, iio + off, self->td->W,
                  self->rs, first, self->me, step);

          ++iv;
          rio += self->vs;
          iio += self->vs;
     }
}
/* Unbuffered execution with one extra iteration: per vector, the codelet
   first covers [mb, me-1) with stride ms, then is called a second time
   on the range [me-1, me+1) with stride 0. */
static void apply_extra_iter(const plan *ego_, R *rio, R *iio)
{
     const P *self = (const P *) ego_;
     const INT nvec = self->v, vstep = self->vs;
     const INT first = self->mb, step = self->ms;
     const INT last = self->me - 1;
     INT iv;
     ASSERT_ALIGNED_DOUBLE;

     for (iv = 0; iv < nvec; ++iv) {
          /* main range */
          self->k(rio + first * step, iio + first * step, self->td->W,
                  self->rs, first, last, step);
          /* trailing range */
          self->k(rio + last * step, iio + last * step, self->td->W,
                  self->rs, last, last + 2, 0);

          rio += vstep;
          iio += vstep;
     }
}
/*************************************************************
Buffered code
*************************************************************/
/* Process the index range [mb, me) of one vector through the buffer:
   copy the data into buf (interleaved, stride 2), run the codelet on the
   buffer, then copy the result back to the same place. */
static void dobatch(const P *ego, R *rA, R *iA, INT mb, INT me, R *buf)
{
     const INT bufrs = WS(ego->brs, 1);
     const INT datars = WS(ego->rs, 1);
     const INT ms = ego->ms;
     const INT count = me - mb;
     R *re = rA + mb*ms;
     R *im = iA + mb*ms;

     X(cpy2d_pair_ci)(re, im, buf, buf + 1,
                      ego->r, datars, bufrs,
                      count, ms, 2);

     ego->k(buf, buf + 1, ego->td->W, ego->brs, mb, me, 2);

     X(cpy2d_pair_co)(buf, buf + 1, re, im,
                      ego->r, bufrs, datars,
                      count, 2, ms);
}
/* Batch size for a given radix: the radix rounded up to a multiple of 4,
   plus 2.  The result must be even for SIMD alignment and should not be
   2^k, to avoid associativity conflicts. */
static INT compute_batchsize(INT radix)
{
     const INT rounded = (radix + 3) & -4;

     return rounded + 2;
}
/* Buffered execution: for each vector, walk [mb, me) in chunks of
   batchsz via dobatch(); the last chunk (possibly shorter) always runs
   up to me. */
static void apply_buf(const plan *ego_, R *rio, R *iio)
{
     const P *self = (const P *) ego_;
     INT iv, lo, nvec = self->v, r = self->r;
     INT batchsz = compute_batchsize(r);
     INT mb = self->mb, me = self->me;
     R *buf;
     size_t bufsz = r * batchsz * 2 * sizeof(R);

     BUF_ALLOC(R *, buf, bufsz);

     for (iv = 0; iv < nvec; ++iv) {
          lo = mb;
          while (lo + batchsz < me) {
               dobatch(self, rio, iio, lo, lo + batchsz, buf);
               lo += batchsz;
          }
          dobatch(self, rio, iio, lo, me, buf);

          rio += self->vs;
          iio += self->vs;
     }

     BUF_FREE(buf, bufsz);
}
/*************************************************************
common code
*************************************************************/
/* Let X(twiddle_awake) manage the twiddle table for size r*m, radix r,
   over m + extra_iter entries. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     const tw_instr *instr = self->slv->desc->tw;

     X(twiddle_awake)(wakefulness, &self->td, instr,
                      self->r * self->m, self->r,
                      self->m + self->extra_iter);
}
/* Release the two stride objects: brs first, then rs. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     stride batch_stride = self->brs;
     stride data_stride = self->rs;

     X(stride_destroy)(batch_stride);
     X(stride_destroy)(data_stride);
}
/* Emit the plan description; the buffered form additionally reports the
   batch size in front of the radix. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const ct_desc *desc = self->slv->desc;

     if (!self->slv->bufferedp) {
          p->print(p, "(dftw-direct-%D/%D%v \"%s\")",
                   self->r, X(twiddle_length)(self->r, desc->tw),
                   self->v, desc->nam);
          return;
     }

     p->print(p, "(dftw-directbuf/%D-%D/%D%v \"%s\")",
              compute_batchsize(self->r), self->r,
              X(twiddle_length)(self->r, desc->tw), self->v, desc->nam);
}
/*
 * Applicability test for the unbuffered path.
 *
 * Requires the codelet's radix and in-place strides, then asks the
 * codelet's okp predicate.  *extra_iter is written as a side effect of
 * the test: it is set to 0 and okp is tried on the whole range
 * [mb, me); if that fails it is set to 1 and the range is accepted only
 * when it is the full array (mb == 0, me == m) and okp accepts both
 * [mb, me-1) and [me-1, me+1).  Finally okp must also accept the data
 * pointers advanced by ivs.  When the function returns 0, *extra_iter
 * may hold either value.
 */
static int applicable0(const S *ego,
INT r, INT irs, INT ors,
INT m, INT ms,
INT v, INT ivs, INT ovs,
INT mb, INT me,
R *rio, R *iio,
const planner *plnr, INT *extra_iter)
{
const ct_desc *e = ego->desc;
UNUSED(v);
return (
1
&& r == e->radix
&& irs == ors /* in-place along R */
&& ivs == ovs /* in-place along V */
/* check for alignment/vector length restrictions */
&& ((*extra_iter = 0,
e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr))
||
(*extra_iter = 1,
(1
/* FIXME: require full array, otherwise some threads
may be extra_iter and other threads won't be.
Generating the proper twiddle factors is a pain in
this case */
&& mb == 0 && me == m
&& e->genus->okp(e, rio, iio, irs, ivs,
m, mb, me - 1, ms, plnr)
&& e->genus->okp(e, rio, iio, irs, ivs,
m, me - 1, me + 1, ms, plnr))))
/* same check for the second vector (pointers advanced by ivs) */
&& (e->genus->okp(e, rio + ivs, iio + ivs, irs, ivs,
m, mb, me - *extra_iter, ms, plnr))
);
}
/*
 * Applicability test for the buffered path.
 * Requires the codelet's radix and in-place strides, then asks the
 * codelet's okp predicate about the buffer layout (null real pointer,
 * imaginary pointer one element later, stride 2) both for a full batch
 * [mb, mb + batchsz) and for the range [mb, me).
 */
static int applicable0_buf(const S *ego,
                           INT r, INT irs, INT ors,
                           INT m, INT ms,
                           INT v, INT ivs, INT ovs,
                           INT mb, INT me,
                           R *rio, R *iio,
                           const planner *plnr)
{
     const ct_desc *e = ego->desc;
     INT batchsz;
     UNUSED(v); UNUSED(ms); UNUSED(rio); UNUSED(iio);

     if (r != e->radix)
          return 0;
     if (irs != ors)  /* in-place along R */
          return 0;
     if (ivs != ovs)  /* in-place along V */
          return 0;

     /* check for alignment/vector length restrictions, both for
        batchsize and for the remainder */
     batchsz = compute_batchsize(r);

     if (!e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
                        m, mb, mb + batchsz, 2, plnr))
          return 0;
     if (!e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
                        m, mb, me, 2, plnr))
          return 0;

     return 1;
}
/*
 * Full applicability test: the buffered or unbuffered structural test,
 * followed by the planner-flag heuristics (NO_UGLY with X(ct_uglyp),
 * and the large-n limit under NO_FIXED_RADIX_LARGE_N).
 * *extra_iter is forced to 0 on the buffered path and otherwise set by
 * applicable0().
 */
static int applicable(const S *ego,
                      INT r, INT irs, INT ors,
                      INT m, INT ms,
                      INT v, INT ivs, INT ovs,
                      INT mb, INT me,
                      R *rio, R *iio,
                      const planner *plnr, INT *extra_iter)
{
     int ok;

     if (ego->bufferedp) {
          *extra_iter = 0;
          ok = applicable0_buf(ego,
                               r, irs, ors, m, ms, v, ivs, ovs, mb, me,
                               rio, iio, plnr);
     } else {
          ok = applicable0(ego,
                           r, irs, ors, m, ms, v, ivs, ovs, mb, me,
                           rio, iio, plnr, extra_iter);
     }
     if (!ok)
          return 0;

     if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
                                       v, m * r, r))
          return 0;

     if (m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr))
          return 0;

     return 1;
}
/*
 * ct_mkinferior callback: build the twiddle plan for the index range
 * [mstart, mstart + mcount), or return a null plan if the codelet is
 * not applicable.  The apply function is chosen among apply_buf,
 * apply_extra_iter and apply.
 */
static plan *mkcldw(const ct_solver *ego_,
                    INT r, INT irs, INT ors,
                    INT m, INT ms,
                    INT v, INT ivs, INT ovs,
                    INT mstart, INT mcount,
                    R *rio, R *iio,
                    planner *plnr)
{
     static const plan_adt padt = {
          0, awake, print, destroy
     };
     const S *slv = (const S *) ego_;
     const ct_desc *desc = slv->desc;
     dftwapply run;
     INT extra_iter;
     P *pln;

     A(mstart >= 0 && mstart + mcount <= m);
     if (!applicable(slv,
                     r, irs, ors, m, ms, v, ivs, ovs,
                     mstart, mstart + mcount,
                     rio, iio, plnr, &extra_iter))
          return (plan *)0;

     /* pick the execution routine */
     if (slv->bufferedp)
          run = apply_buf;
     else if (extra_iter)
          run = apply_extra_iter;
     else
          run = apply;
     pln = MKPLAN_DFTW(P, &padt, run);

     /* what to run */
     pln->slv = slv;
     pln->k = slv->k;
     pln->td = 0;

     /* problem geometry */
     pln->r = r;
     pln->rs = X(mkstride)(r, irs);
     pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
     pln->m = m;
     pln->ms = ms;
     pln->mb = mstart;
     pln->me = mstart + mcount;
     pln->v = v;
     pln->vs = ivs;
     pln->extra_iter = extra_iter;

     /* operation counts */
     X(ops_zero)(&pln->super.super.ops);
     X(ops_madd2)(v * (mcount/desc->genus->vl), &desc->ops,
                  &pln->super.super.ops);
     if (slv->bufferedp) {
          /* 8 load/stores * N * V */
          pln->super.super.ops.other += 8 * r * mcount * v;
     }

     pln->super.super.could_prune_now_p =
          (!slv->bufferedp && r >= 5 && r < 64 && m >= r);
     return &(pln->super.super);
}
static void regone(planner *plnr, kdftw codelet,
const ct_desc *desc, int dec, int bufferedp)
{
S *slv = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0);
slv->k = codelet;
slv->desc = desc;
slv->bufferedp = bufferedp;
REGISTER_SOLVER(plnr, &(slv->super.super));
if (X(mksolver_ct_hook)) {
slv = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix,
dec, mkcldw, 0);
slv->k = codelet;
slv->desc = desc;
slv->bufferedp = bufferedp;
REGISTER_SOLVER(plnr, &(slv->super.super));
}
}
/*
 * Public registration entry point: registers the unbuffered variant of
 * the codelet first, then the buffered one.
 */
void X(regsolver_ct_directw)(planner *plnr, kdftw codelet,
                             const ct_desc *desc, int dec)
{
     int bufferedp;

     for (bufferedp = 0; bufferedp <= 1; ++bufferedp)
          regone(plnr, codelet, desc, dec, bufferedp);
}
+162
View File
@@ -0,0 +1,162 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/ct.h"
/* Solver record for the "directsq" twiddle solver: the generic
   Cooley-Tukey solver header followed by the codelet it wraps. */
typedef struct {
ct_solver super;
/* codelet descriptor; this file reads its radix, tw, ops, genus and nam */
const ct_desc *desc;
/* codelet entry point, copied into each plan by mkcldw */
kdftwsq k;
} S;
/* Plan record for the "directsq" twiddle solver. */
typedef struct {
plan_dftw super;
/* codelet to call from apply() */
kdftwsq k;
/* radix */
INT r;
/* stride objects built by mkcldw with X(mkstride) from (r, irs) and
   (v, ivs); released in destroy() */
stride rs, vs;
/* m and its stride ms, vector length v, and the [mb, me) range of m
   handled by this plan */
INT m, ms, v, mb, me;
/* twiddle table; null until awake() calls X(twiddle_awake) */
twid *td;
/* solver that created this plan */
const S *slv;
} P;
/*
 * Execute the plan: call the codelet once on the in-place arrays,
 * starting at the first m index owned by this plan.
 */
static void apply(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     const INT first = pln->mb;
     const INT step = pln->ms;

     pln->k(rio + first*step, iio + first*step, pln->td->W,
            pln->rs, pln->vs, first, pln->me, step);
}
/*
 * Wake-up/sleep hook: hands the plan's twiddle slot to X(twiddle_awake)
 * together with the codelet's twiddle program and the sizes
 * (r * m, r, m).
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;
     const ct_desc *desc = pln->slv->desc;

     X(twiddle_awake)(wakefulness, &pln->td, desc->tw,
                      pln->r * pln->m, pln->r, pln->m);
}
/* Release the two stride objects that mkcldw created for this plan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(stride_destroy)(pln->rs);
     X(stride_destroy)(pln->vs);
}
/*
 * Print the plan: radix, twiddle length for that radix, vector length
 * and the codelet name.
 */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const ct_desc *desc = pln->slv->desc;

     p->print(p, "(dftw-directsq-%D/%D%v \"%s\")",
              pln->r, X(twiddle_length)(pln->r, desc->tw),
              pln->v, desc->nam);
}
/*
 * Decide whether the codelet wrapped by EGO can handle this twiddle
 * step.  Returns 1 if so, 0 otherwise.
 *
 * Requirements: the radix equals the codelet's radix, the radix equals
 * the vector length with input/output strides swapped between the two
 * dimensions (irs == ovs and ivs == ors), and the codelet genus accepts
 * the pointers, strides and m range.
 *
 * The original marked `v' as UNUSED even though it takes part in the
 * `r == v' test; that stale annotation has been removed.
 */
static int applicable(const S *ego,
                      INT r, INT irs, INT ors,
                      INT m, INT ms,
                      INT v, INT ivs, INT ovs,
                      INT mb, INT me,
                      R *rio, R *iio,
                      const planner *plnr)
{
     const ct_desc *e = ego->desc;

     return (
          1
          && r == e->radix

          /* transpose r, v */
          && r == v
          && irs == ovs
          && ivs == ors

          /* check for alignment/vector length restrictions */
          && e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr)
          );
}
/*
 * Build the "directsq" plan for one twiddle step, or return a null plan
 * when applicable() rejects the request.  The twiddle table is left
 * null here and filled in by awake().
 */
static plan *mkcldw(const ct_solver *ego_,
                    INT r, INT irs, INT ors,
                    INT m, INT ms,
                    INT v, INT ivs, INT ovs,
                    INT mstart, INT mcount,
                    R *rio, R *iio,
                    planner *plnr)
{
     static const plan_adt padt = {
          0, awake, print, destroy
     };
     const S *slv = (const S *) ego_;
     const ct_desc *desc = slv->desc;
     const INT mend = mstart + mcount;
     P *pln;

     A(mstart >= 0 && mstart + mcount <= m);

     if (!applicable(slv, r, irs, ors, m, ms, v, ivs, ovs,
                     mstart, mend, rio, iio, plnr))
          return (plan *)0;

     pln = MKPLAN_DFTW(P, &padt, apply);

     /* codelet, strides, solver back-pointer */
     pln->k = slv->k;
     pln->rs = X(mkstride)(r, irs);
     pln->vs = X(mkstride)(v, ivs);
     pln->td = 0;
     pln->slv = slv;

     /* problem geometry */
     pln->r = r;
     pln->m = m;
     pln->ms = ms;
     pln->v = v;
     pln->mb = mstart;
     pln->me = mend;

     /* cost: one codelet call per genus->vl values of m */
     X(ops_zero)(&pln->super.super.ops);
     X(ops_madd2)(mcount/desc->genus->vl, &desc->ops,
                  &pln->super.super.ops);

     return &(pln->super.super);
}
/*
 * Register one solver for CODELET; when the X(mksolver_ct_hook) pointer
 * is set, also register a second solver built through the hook.
 */
static void regone(planner *plnr, kdftwsq codelet,
                   const ct_desc *desc, int dec)
{
     S *s;

     s = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0);
     s->desc = desc;
     s->k = codelet;
     REGISTER_SOLVER(plnr, &(s->super.super));

     if (!X(mksolver_ct_hook))
          return;

     s = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix, dec,
                                  mkcldw, 0);
     s->desc = desc;
     s->k = codelet;
     REGISTER_SOLVER(plnr, &(s->super.super));
}
/*
 * Public registration entry point: registers CODELET with TRANSPOSE
 * added to the caller's decimation value.
 */
void X(regsolver_ct_directwsq)(planner *plnr, kdftwsq codelet,
                               const ct_desc *desc, int dec)
{
     const int transposed_dec = dec+TRANSPOSE;

     regone(plnr, codelet, desc, transposed_dec);
}
+204
View File
@@ -0,0 +1,204 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* express a twiddle problem in terms of dft + multiplication by
twiddle factors */
#include "dft/ct.h"
/* this solver carries no state beyond the generic ct_solver record */
typedef ct_solver S;
/* Plan record for the generic twiddle step (child DFT plus explicit
   twiddle multiplication). */
typedef struct {
plan_dftw super;
/* radix r and its stride, m with its [mb, me) sub-range and stride,
   vector length v and its stride */
INT r, rs, m, mb, me, ms, v, vs;
/* child plan computing the size-r DFTs in place */
plan *cld;
/* twiddle table; null until mktwiddle() runs from awake() */
twid *td;
/* solver that created this plan */
const S *slv;
/* decimation, copied from the solver; print() and mkcldw compare it
   with DECDIT */
int dec;
} P;
/*
 * Create or release the plan's twiddle table according to WAKEFULNESS,
 * using a TW_FULL twiddle program.
 */
static void mktwiddle(P *ego, enum wakefulness wakefulness)
{
     static const tw_instr full_tw[] = {
          { TW_FULL, 0, 0 },
          { TW_NEXT, 1, 0 }
     };
     const INT n = ego->r * ego->m;

     /* note that R and M are swapped, to allow for sequential
        access both to data and twiddles */
     X(twiddle_awake)(wakefulness, &ego->td, full_tw, n, ego->m, ego->r);
}
/*
 * Multiply the data in place by the twiddle factors.  For every vector,
 * every radix index ir >= 1 and every m index in [mb, me) other than 0,
 * the complex element is replaced by x * conj(w), where w = (wr, wi) is
 * read from the twiddle table.
 */
static void bytwiddle(const P *ego, R *rio, R *iio)
{
     const INT r = ego->r, rs = ego->rs;
     const INT m = ego->m, me = ego->me, ms = ego->ms;
     const INT v = ego->v, vs = ego->vs;
     const R *W = ego->td->W;
     INT mfirst = ego->mb;
     INT iv, ir, im;

     /* skip m=0 iteration */
     if (mfirst == 0)
          mfirst = 1;

     for (iv = 0; iv < v; ++iv, rio += vs, iio += vs) {
          for (ir = 1; ir < r; ++ir) {
               for (im = mfirst; im < me; ++im) {
                    const INT off = ms * im + rs * ir;
                    /* twiddle for (im, ir): rows of m-1 complex values */
                    const R *w = W + (2 * im + (2 * (m-1)) * ir - 2);
                    R *pr = rio + off;
                    R *pi = iio + off;
                    E xr = *pr;
                    E xi = *pi;
                    E wr = w[0];
                    E wi = w[1];

                    *pr = xr * wr + xi * wi;
                    *pi = xi * wr - xr * wi;
               }
          }
     }
}
/*
 * The generic twiddle step works in place only (matching input/output
 * strides) and is rejected when the planner's NO_SLOWP flag is set.
 * Returns 1 if applicable, 0 otherwise.
 */
static int applicable(INT irs, INT ors, INT ivs, INT ovs,
                      const planner *plnr)
{
     if (irs != ors)
          return 0;
     if (ivs != ovs)
          return 0;
     return !NO_SLOWP(plnr);
}
/*
 * DIT executor: multiply by the twiddle factors first, then run the
 * child DFT in place on the sub-range starting at index mb.
 */
static void apply_dit(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     plan_dft *child = (plan_dft *) pln->cld;
     const INT off = pln->ms * pln->mb;

     bytwiddle(pln, rio, iio);
     child->apply(pln->cld, rio + off, iio + off, rio + off, iio + off);
}
/*
 * DIF executor: run the child DFT in place on the sub-range starting at
 * index mb, then multiply by the twiddle factors.
 */
static void apply_dif(const plan *ego_, R *rio, R *iio)
{
     const P *pln = (const P *) ego_;
     plan_dft *child = (plan_dft *) pln->cld;
     const INT off = pln->ms * pln->mb;

     child->apply(pln->cld, rio + off, iio + off, rio + off, iio + off);
     bytwiddle(pln, rio, iio);
}
/*
 * Wake-up/sleep hook: forward the request to the child plan, then
 * update this plan's own twiddle table.
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cld, wakefulness);
     mktwiddle(pln, wakefulness);
}
/* Destroy the child plan owned by this plan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld);
}
/* Print the plan: decimation kind, r, m, vector length and child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const char *kind = (pln->dec == DECDIT) ? "dit" : "dif";

     p->print(p, "(dftw-generic-%s-%D-%D%v%(%p%))",
              kind, pln->r, pln->m, pln->v, pln->cld);
}
/*
 * Plan a generic twiddle step: an in-place child DFT of size r (stride
 * irs), vectorized over mcount values of m (stride ms) and v vectors
 * (stride ivs), combined with an explicit twiddle multiplication.
 * The executor is apply_dit or apply_dif depending on the solver's
 * decimation.
 *
 * Returns a null plan when applicable() rejects the request or when the
 * child problem cannot be planned.
 *
 * The twiddle-multiplication count n0 is formed in double precision;
 * the original multiplied the three INT factors first, which can
 * overflow for large problems before the conversion to double.
 */
static plan *mkcldw(const ct_solver *ego_,
                    INT r, INT irs, INT ors,
                    INT m, INT ms,
                    INT v, INT ivs, INT ovs,
                    INT mstart, INT mcount,
                    R *rio, R *iio,
                    planner *plnr)
{
     const S *ego = (const S *)ego_;
     P *pln;
     plan *cld = 0;
     INT dm = ms * mstart;   /* offset of the first m handled here */

     static const plan_adt padt = {
          0, awake, print, destroy
     };

     A(mstart >= 0 && mstart + mcount <= m);
     if (!applicable(irs, ors, ivs, ovs, plnr))
          return (plan *)0;

     cld = X(mkplan_d)(plnr,
                       X(mkproblem_dft_d)(
                            X(mktensor_1d)(r, irs, irs),
                            X(mktensor_2d)(mcount, ms, ms, v, ivs, ivs),
                            rio + dm, iio + dm, rio + dm, iio + dm)
                       );
     if (!cld) goto nada;

     pln = MKPLAN_DFTW(P, &padt, ego->dec == DECDIT ? apply_dit : apply_dif);
     pln->slv = ego;
     pln->cld = cld;
     pln->r = r;
     pln->rs = irs;
     pln->m = m;
     pln->ms = ms;
     pln->v = v;
     pln->vs = ivs;
     pln->mb = mstart;
     pln->me = mstart + mcount;
     pln->dec = ego->dec;
     pln->td = 0;   /* created later by awake() */

     {
          /* number of twiddle multiplications, computed in double to
             avoid integer overflow of the three-way product */
          double n0 = (double)(r - 1) * (double)(mcount - 1) * (double)v;
          pln->super.super.ops = cld->ops;
          pln->super.super.ops.mul += 8 * n0;
          pln->super.super.ops.add += 4 * n0;
          pln->super.super.ops.other += 8 * n0;
     }
     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
/*
 * Register a generic twiddle solver for radix R and decimation DEC;
 * when the X(mksolver_ct_hook) pointer is set, also register one built
 * through the hook.
 */
static void regsolver(planner *plnr, INT r, int dec)
{
     S *s = (S *)X(mksolver_ct)(sizeof(S), r, dec, mkcldw, 0);

     REGISTER_SOLVER(plnr, &(s->super));

     if (!X(mksolver_ct_hook))
          return;

     s = (S *)X(mksolver_ct_hook)(sizeof(S), r, dec, mkcldw, 0);
     REGISTER_SOLVER(plnr, &(s->super));
}
/*
 * Register the generic twiddle solvers with radix argument 0: the DIT
 * variant first, then the DIF variant.
 */
void X(ct_generic_register)(planner *p)
{
     static const int decs[] = { DECDIT, DECDIF };
     unsigned i;

     for (i = 0; i < sizeof(decs) / sizeof(decs[0]); ++i)
          regsolver(p, 0, decs[i]);
}
+231
View File
@@ -0,0 +1,231 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* express a twiddle problem in terms of dft + multiplication by
twiddle factors */
#include "dft/ct.h"
/* Solver record for the buffered generic twiddle solver. */
typedef struct {
ct_solver super;
/* number of m values processed per batch; mcount must be a multiple of
   it (see applicable0) */
INT batchsz;
} S;
/* Plan record for the buffered generic twiddle solver. */
typedef struct {
plan_dftw super;
/* radix r and its stride, m and its stride, vector length/stride, and
   the [mb, me) range of m handled by this plan; mkcldw does not set v
   and vs (applicable0 requires v == 1) */
INT r, rs, m, ms, v, vs, mb, me;
/* batch size copied from the solver */
INT batchsz;
/* child plan: batchsz DFTs of size r on the interleaved buffer */
plan *cld;
/* twiddle generator; created and destroyed in awake() */
triggen *t;
/* solver that created this plan */
const S *slv;
} P;
/* Distance, in complex elements, between consecutive batch entries in
   the scratch buffer: the radix plus 16 extra elements.  The buffer is
   indexed in reals, hence the factor 2 wherever this macro is used. */
#define BATCHDIST(r) ((r) + 16)
/**************************************************************/
/*
 * Copy the elements for m indices [mb, me) into BUF while rotating each
 * one through the plan's triggen with index j * k.  Element (j, k) is
 * stored as an interleaved pair at buf[2*j + 2*BATCHDIST(r)*(k - mb)].
 */
static void bytwiddle(const P *ego, INT mb, INT me, R *buf, R *rio, R *iio)
{
     const INT r = ego->r, rs = ego->rs, ms = ego->ms;
     triggen *gen = ego->t;
     INT j, k;

     for (j = 0; j < r; ++j) {
          for (k = mb; k < me; ++k) {
               const INT src = j * rs + k * ms;
               R *dst = &buf[j * 2 + 2 * BATCHDIST(r) * (k - mb) + 0];

               gen->rotate(gen, j * k, rio[src], iio[src], dst);
          }
     }
}
/*
 * Hard requirements of the buffered generic solver: a single vector,
 * in-place radix stride, an m count that is a positive multiple of the
 * batch size, radix of at least 64, and m >= r.
 * Returns 1 if all hold, 0 otherwise.
 */
static int applicable0(const S *ego,
                       INT r, INT irs, INT ors,
                       INT m, INT v,
                       INT mcount)
{
     if (v != 1)
          return 0;
     if (irs != ors)
          return 0;
     if (mcount < ego->batchsz)
          return 0;
     if (mcount % ego->batchsz != 0)
          return 0;
     if (r < 64)
          return 0;
     return m >= r;
}
/*
 * Full applicability test: the hard requirements of applicable0, plus a
 * heuristic that rejects total sizes m * r below 65536 when the
 * planner's NO_UGLYP flag is set.
 */
static int applicable(const S *ego,
                      INT r, INT irs, INT ors,
                      INT m, INT v,
                      INT mcount,
                      const planner *plnr)
{
     return applicable0(ego, r, irs, ors, m, v, mcount)
          && !(NO_UGLYP(plnr) && m * r < 65536);
}
/*
 * Process the m indices [mb, me): twiddle into the buffer, run the
 * child DFT in place on the buffer, then copy the result back to the
 * data arrays at offset ms * mb.
 */
static void dobatch(const P *ego, INT mb, INT me, R *buf, R *rio, R *iio)
{
     plan_dft *child = (plan_dft *) ego->cld;
     const INT off = ego->ms * mb;

     bytwiddle(ego, mb, me, buf, rio, iio);

     child->apply(ego->cld, buf, buf + 1, buf, buf + 1);

     X(cpy2d_pair_co)(buf, buf + 1,
                      rio + off, iio + off,
                      me-mb, 2 * BATCHDIST(ego->r), ego->ms,
                      ego->r, 2, ego->rs);
}
/*
 * Execute the plan: allocate a scratch buffer for one batch, walk the
 * range [mb, me) in steps of batchsz, and free the buffer.
 */
static void apply(const plan *ego_, R *rio, R *iio)
{
     const P *ego = (const P *) ego_;
     const INT step = ego->batchsz;
     R *buf = (R *) MALLOC(sizeof(R) * 2 * BATCHDIST(ego->r) * ego->batchsz,
                           BUFFERS);
     INT m = ego->mb;

     while (m < ego->me) {
          dobatch(ego, m, m + step, buf, rio, iio);
          m += step;
     }

     /* the range must be an exact multiple of the batch size */
     A(m == ego->me);

     X(ifree)(buf);
}
/*
 * Wake-up/sleep hook: forward the request to the child plan, then
 * either destroy the twiddle generator (SLEEPY) or create one of size
 * r * m (any other state).
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cld, wakefulness);

     if (wakefulness == SLEEPY) {
          X(triggen_destroy)(pln->t);
          pln->t = 0;
     } else {
          pln->t = X(mktriggen)(AWAKE_SQRTN_TABLE, pln->r * pln->m);
     }
}
/* Destroy the child plan owned by this plan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld);
}
/* Print the plan: batch size, r, m and the child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;

     p->print(p, "(dftw-genericbuf/%D-%D-%D%(%p%))",
              pln->batchsz, pln->r, pln->m, pln->cld);
}
/*
 * Plan a buffered generic twiddle step.  The child plan computes
 * batchsz in-place DFTs of size r on an interleaved complex buffer
 * (element stride 2, batch stride 2 * BATCHDIST(r)).  A scratch buffer
 * is allocated only for the duration of planning; apply() allocates its
 * own buffer of the same size.
 *
 * Returns a null plan when applicable() rejects the request or when the
 * child problem cannot be planned.
 *
 * Changes relative to the original:
 *  - the twiddle-multiplication count n0 is formed in double precision
 *    instead of multiplying INTs first (which can overflow);
 *  - pln->t is initialized to null, so the twiddle generator field never
 *    holds an indeterminate pointer before the first awake().
 */
static plan *mkcldw(const ct_solver *ego_,
                    INT r, INT irs, INT ors,
                    INT m, INT ms,
                    INT v, INT ivs, INT ovs,
                    INT mstart, INT mcount,
                    R *rio, R *iio,
                    planner *plnr)
{
     const S *ego = (const S *)ego_;
     P *pln;
     plan *cld = 0;
     R *buf;

     static const plan_adt padt = {
          0, awake, print, destroy
     };

     UNUSED(ivs); UNUSED(ovs); UNUSED(rio); UNUSED(iio);

     A(mstart >= 0 && mstart + mcount <= m);
     if (!applicable(ego, r, irs, ors, m, v, mcount, plnr))
          return (plan *)0;

     /* temporary buffer used only to describe the child problem */
     buf = (R *) MALLOC(sizeof(R) * 2 * BATCHDIST(r) * ego->batchsz, BUFFERS);
     cld = X(mkplan_d)(plnr,
                       X(mkproblem_dft_d)(
                            X(mktensor_1d)(r, 2, 2),
                            X(mktensor_1d)(ego->batchsz,
                                           2 * BATCHDIST(r),
                                           2 * BATCHDIST(r)),
                            buf, buf + 1, buf, buf + 1
                            )
                       );
     X(ifree)(buf);
     if (!cld) goto nada;

     pln = MKPLAN_DFTW(P, &padt, apply);
     pln->slv = ego;
     pln->cld = cld;
     pln->r = r;
     pln->m = m;
     pln->ms = ms;
     pln->rs = irs;
     pln->batchsz = ego->batchsz;
     pln->mb = mstart;
     pln->me = mstart + mcount;
     pln->t = 0;   /* created by awake() */

     {
          /* number of twiddle multiplications, computed in double to
             avoid integer overflow of the product */
          double n0 = (double)(r - 1) * (double)(mcount - 1);
          pln->super.super.ops = cld->ops;
          pln->super.super.ops.mul += 8 * n0;
          pln->super.super.ops.add += 4 * n0;
          pln->super.super.ops.other += 8 * n0;
     }
     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
/*
 * Register a DIT buffered-generic solver for radix argument R and the
 * given batch size; when the X(mksolver_ct_hook) pointer is set, also
 * register one built through the hook.
 */
static void regsolver(planner *plnr, INT r, INT batchsz)
{
     S *s;

     s = (S *)X(mksolver_ct)(sizeof(S), r, DECDIT, mkcldw, 0);
     s->batchsz = batchsz;
     REGISTER_SOLVER(plnr, &(s->super.super));

     if (!X(mksolver_ct_hook))
          return;

     s = (S *)X(mksolver_ct_hook)(sizeof(S), r, DECDIT, mkcldw, 0);
     s->batchsz = batchsz;
     REGISTER_SOLVER(plnr, &(s->super.super));
}
/*
 * Register one buffered-generic solver for every combination of the
 * radix arguments and batch sizes listed below, radix-major.
 */
void X(ct_genericbuf_register)(planner *p)
{
     static const INT radices[] = { -1, -2, -4, -8, -16, -32, -64 };
     static const INT batchsizes[] = { 4, 8, 16, 32, 64 };
     const unsigned nradices = sizeof(radices) / sizeof(radices[0]);
     const unsigned nbatches = sizeof(batchsizes) / sizeof(batchsizes[0]);
     unsigned ir, ib;

     for (ir = 0; ir < nradices; ++ir) {
          for (ib = 0; ib < nbatches; ++ib)
               regsolver(p, radices[ir], batchsizes[ib]);
     }
}
+293
View File
@@ -0,0 +1,293 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* direct DFT solver, if we have a codelet */
#include "dft/dft.h"
/* Solver record for the direct (codelet-based) DFT solver. */
typedef struct {
solver super;
/* codelet descriptor; this file reads its sz, nam, ops and genus */
const kdft_desc *desc;
/* codelet entry point */
kdft k;
/* nonzero selects the buffered variant (applicable_buf / apply_buf) */
int bufferedp;
} S;
/* Plan record for the direct DFT solver. */
typedef struct {
plan_dft super;
/* stride objects built in mkplan with X(mkstride): input, output, and
   the buffer layout used by the buffered variant; released in destroy() */
stride is, os, bufstride;
/* transform size, vector length, and input/output vector strides */
INT n, vl, ivs, ovs;
/* codelet entry point, copied from the solver */
kdft k;
/* solver that created this plan */
const S *slv;
} P;
/*
 * Transform BATCHSZ vectors through the buffer: copy the input into BUF
 * (interleaved real/imaginary), then either run the codelet straight
 * from the buffer to the output, or run it in place on the buffer and
 * copy the result out.  The direct route is taken when the magnitude of
 * the output element stride is smaller than that of the output vector
 * stride.
 */
static void dobatch(const P *ego, R *ri, R *ii, R *ro, R *io,
                    R *buf, INT batchsz)
{
     R *bufr = buf;
     R *bufi = buf + 1;

     X(cpy2d_pair_ci)(ri, ii, bufr, bufi,
                      ego->n, WS(ego->is, 1), WS(ego->bufstride, 1),
                      batchsz, ego->ivs, 2);

     if (!(IABS(WS(ego->os, 1)) < IABS(ego->ovs))) {
          /* transform to buffer and copy back */
          ego->k(bufr, bufi, bufr, bufi,
                 ego->bufstride, ego->bufstride, batchsz, 2, 2);
          X(cpy2d_pair_co)(bufr, bufi, ro, io,
                           ego->n, WS(ego->bufstride, 1), WS(ego->os, 1),
                           batchsz, 2, ego->ovs);
          return;
     }

     /* transform directly to output */
     ego->k(bufr, bufi, ro, io,
            ego->bufstride, ego->os, batchsz, 2, ego->ovs);
}
/*
 * Batch size used by the buffered variant for a transform of size N:
 * N rounded up to a multiple of 4, plus 2.
 */
static INT compute_batchsize(INT n)
{
     /* round up to multiple of 4 */
     INT rounded = (n + 3) & -4;

     return (rounded + 2);
}
/*
 * Buffered executor: process the vl transforms in full batches of
 * compute_batchsize(n), then one final batch holding whatever is left
 * (between 1 and batchsz transforms when vl >= 1).
 */
static void apply_buf(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *ego = (const P *) ego_;
     R *buf;
     INT n = ego->n, batchsz = compute_batchsize(n);
     INT left = ego->vl;
     size_t bufsz = n * batchsz * 2 * sizeof(R);

     BUF_ALLOC(R *, buf, bufsz);

     /* full batches, as long as more than one batch remains */
     while (left > batchsz) {
          dobatch(ego, ri, ii, ro, io, buf, batchsz);
          ri += batchsz * ego->ivs; ii += batchsz * ego->ivs;
          ro += batchsz * ego->ovs; io += batchsz * ego->ovs;
          left -= batchsz;
     }

     /* final, possibly partial, batch */
     dobatch(ego, ri, ii, ro, io, buf, left);

     BUF_FREE(buf, bufsz);
}
/* Unbuffered executor: a single codelet call covering all vl vectors. */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *pln = (const P *) ego_;
     ASSERT_ALIGNED_DOUBLE;

     pln->k(ri, ii, ro, io,
            pln->is, pln->os,
            pln->vl, pln->ivs, pln->ovs);
}
/*
 * Unbuffered executor for the "extra iteration" case: one codelet call
 * for the first vl - 1 vectors, then a separate call for the last
 * vector with both vector strides set to 0.
 */
static void apply_extra_iter(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *pln = (const P *) ego_;
     const INT last = pln->vl - 1;
     const INT ioff = last * pln->ivs;
     const INT ooff = last * pln->ovs;
     ASSERT_ALIGNED_DOUBLE;

     /* for 4-way SIMD when VL is odd: iterate over an
        even vector length VL, and then execute the last
        iteration as a 2-vector with vector stride 0. */
     pln->k(ri, ii, ro, io, pln->is, pln->os, last, pln->ivs, pln->ovs);

     pln->k(ri + ioff, ii + ioff,
            ro + ooff, io + ooff,
            pln->is, pln->os, 1, 0, 0);
}
/* Release the three stride objects that mkplan created for this plan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(stride_destroy)(pln->is);
     X(stride_destroy)(pln->os);
     X(stride_destroy)(pln->bufstride);
}
/*
 * Print the plan.  The buffered variant also reports its batch size;
 * both report the codelet size, the vector length and the codelet name.
 */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const S *slv = pln->slv;
     const kdft_desc *desc = slv->desc;

     if (!slv->bufferedp) {
          p->print(p, "(dft-direct-%D%v \"%s\")",
                   desc->sz, pln->vl, desc->nam);
          return;
     }

     p->print(p, "(dft-directbuf/%D-%D%v \"%s\")",
              compute_batchsize(desc->sz), desc->sz, pln->vl, desc->nam);
}
/*
 * Applicability test for the buffered variant.  Returns nonzero when
 * the problem is a rank-1 transform of the codelet's size with a rank-1
 * vector, the codelet genus accepts the buffer-to-output call for both
 * a full batch and the remainder batch, and the in-place/out-of-place
 * condition at the end holds.
 *
 * The whole test is one short-circuit expression; batchsz is assigned
 * inside it through a comma expression before its first use.
 */
static int applicable_buf(const solver *ego_, const problem *p_,
const planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_dft *p = (const problem_dft *) p_;
const kdft_desc *d = ego->desc;
INT vl;
INT ivs, ovs;
INT batchsz;
return (
1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 1
&& p->sz->dims[0].n == d->sz
/* check strides etc */
&& X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
/* UGLY if IS <= IVS */
&& !(NO_UGLYP(plnr) &&
X(iabs)(p->sz->dims[0].is) <= X(iabs)(ivs))
/* always true; only computes batchsz for the checks below */
&& (batchsz = compute_batchsize(d->sz), 1)
/* codelet check for a full batch: the input pointers are 0 and 0 + 1,
   i.e. an interleaved buffer whose address is not yet known
   (presumably so that only the relative layout is checked -- confirm
   against the genus okp implementations) */
&& (d->genus->okp(d, 0, ((const R *)0) + 1, p->ro, p->io,
2 * batchsz, p->sz->dims[0].os,
batchsz, 2, ovs, plnr))
/* same check for the final batch of vl % batchsz transforms */
&& (d->genus->okp(d, 0, ((const R *)0) + 1, p->ro, p->io,
2 * batchsz, p->sz->dims[0].os,
vl % batchsz, 2, ovs, plnr))
&& (0
/* can operate out-of-place */
|| p->ri != p->ro
/* can operate in-place as long as strides are the same */
|| X(tensor_inplace_strides2)(p->sz, p->vecsz)
/* can do it if the problem fits in the buffer, no matter
what the strides are */
|| vl <= batchsz
)
);
}
/*
 * Applicability test for the unbuffered variant.  Returns nonzero when
 * the problem is a rank-1 transform of the codelet's size with a vector
 * of rank at most 1, the codelet genus accepts it, and the
 * in-place/out-of-place condition at the end holds.
 *
 * *extra_iterp is an output: it is set to 0 when the genus accepts the
 * whole vector in one call, and to 1 when it instead accepts the split
 * "vl - 1 vectors, then a 2-vector with stride 0" that apply_extra_iter
 * relies on.  It is assigned inside the expression through comma
 * expressions, so its value is meaningful only when the function
 * returns nonzero.
 */
static int applicable(const solver *ego_, const problem *p_,
const planner *plnr, int *extra_iterp)
{
const S *ego = (const S *) ego_;
const problem_dft *p = (const problem_dft *) p_;
const kdft_desc *d = ego->desc;
INT vl;
INT ivs, ovs;
return (
1
&& p->sz->rnk == 1
&& p->vecsz->rnk <= 1
&& p->sz->dims[0].n == d->sz
/* check strides etc */
&& X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
/* first try: the codelet handles all vl vectors at once */
&& ((*extra_iterp = 0,
(d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
p->sz->dims[0].is, p->sz->dims[0].os,
vl, ivs, ovs, plnr)))
||
/* fallback: vl - 1 vectors plus one extra iteration */
(*extra_iterp = 1,
((d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
p->sz->dims[0].is, p->sz->dims[0].os,
vl - 1, ivs, ovs, plnr))
&&
(d->genus->okp(d, p->ri, p->ii, p->ro, p->io,
p->sz->dims[0].is, p->sz->dims[0].os,
2, 0, 0, plnr)))))
&& (0
/* can operate out-of-place */
|| p->ri != p->ro
/* can always compute one transform */
|| vl == 1
/* can operate in-place as long as strides are the same */
|| X(tensor_inplace_strides2)(p->sz, p->vecsz)
)
);
}
/*
 * Create a direct-codelet plan for problem P_, or return a null plan if
 * the solver does not apply.  The buffered solver uses applicable_buf /
 * apply_buf; the unbuffered one uses applicable and selects apply or
 * apply_extra_iter from the flag that applicable reports.
 *
 * The original marked `plnr' as UNUSED although it is passed to both
 * applicability checks; that stale annotation has been removed.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const S *ego = (const S *) ego_;
     P *pln;
     const problem_dft *p;
     iodim *d;
     const kdft_desc *e = ego->desc;

     static const plan_adt padt = {
          X(dft_solve), X(null_awake), print, destroy
     };

     if (ego->bufferedp) {
          if (!applicable_buf(ego_, p_, plnr))
               return (plan *)0;
          pln = MKPLAN_DFT(P, &padt, apply_buf);
     } else {
          int extra_iterp = 0;
          if (!applicable(ego_, p_, plnr, &extra_iterp))
               return (plan *)0;
          pln = MKPLAN_DFT(P, &padt, extra_iterp ? apply_extra_iter : apply);
     }

     p = (const problem_dft *) p_;
     d = p->sz->dims;

     /* codelet, transform size and strides */
     pln->k = ego->k;
     pln->n = d[0].n;
     pln->is = X(mkstride)(pln->n, d[0].is);
     pln->os = X(mkstride)(pln->n, d[0].os);
     pln->bufstride = X(mkstride)(pln->n, 2 * compute_batchsize(pln->n));

     /* vector loop; both applicability tests have already verified that
        this conversion succeeds */
     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
     pln->slv = ego;

     /* cost: one codelet call per genus->vl vectors, plus the buffer
        traffic for the buffered variant */
     X(ops_zero)(&pln->super.super.ops);
     X(ops_madd2)(pln->vl / e->genus->vl, &e->ops, &pln->super.super.ops);

     if (ego->bufferedp)
          pln->super.super.ops.other += 4 * pln->n * pln->vl;

     pln->super.super.could_prune_now_p = !ego->bufferedp;
     return &(pln->super.super);
}
/*
 * Allocate a direct-DFT solver for codelet K with descriptor DESC;
 * BUFFEREDP selects the buffered variant.
 */
static solver *mksolver(kdft k, const kdft_desc *desc, int bufferedp)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *s = MKSOLVER(S, &sadt);

     s->bufferedp = bufferedp;
     s->desc = desc;
     s->k = k;

     return &(s->super);
}
/* Public constructor for the unbuffered direct solver. */
solver *X(mksolver_dft_direct)(kdft k, const kdft_desc *desc)
{
     solver *slv = mksolver(k, desc, /* bufferedp */ 0);
     return slv;
}
/* Public constructor for the buffered direct solver. */
solver *X(mksolver_dft_directbuf)(kdft k, const kdft_desc *desc)
{
     solver *slv = mksolver(k, desc, /* bufferedp */ 1);
     return slv;
}
+169
View File
@@ -0,0 +1,169 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
/* Solver record for the generic DFT solver; it has no state beyond the
   base solver. */
typedef struct {
solver super;
} S;
/* Plan record for the generic DFT solver. */
typedef struct {
plan_dft super;
/* twiddle table; null until awake() calls X(twiddle_awake) */
twid *td;
/* transform size and input/output strides */
INT n, is, os;
} P;
/*
 * Compute one pair of outputs from the buffer X produced by hartley()
 * and a row of coefficients W.
 *
 * X starts with the (re, im) pair of sums and then holds, for each
 * i with 1 <= i < n/2 rounded up, four values: x[0], x[1] are multiplied
 * by w[0] and x[2], x[3] by w[1].  W therefore supplies two reals per
 * iteration.
 *
 * The accumulators are combined into two complex outputs:
 *   (*or0, *oi0) = (rr + ii, ir - ri)
 *   (*or1, *oi1) = (rr - ii, ir + ri)
 * The caller (apply) stores them at output indices i and n - i.
 */
static void cdot(INT n, const E *x, const R *w,
R *or0, R *oi0, R *or1, R *oi1)
{
INT i;
/* rr/ir start from the first buffer pair; ri/ii start at zero */
E rr = x[0], ri = 0, ir = x[1], ii = 0;
x += 2;
for (i = 1; i + i < n; ++i) {
rr += x[0] * w[0];
ir += x[1] * w[0];
ri += x[2] * w[1];
ii += x[3] * w[1];
x += 4; w += 2;
}
*or0 = rr + ii;
*oi0 = ir - ri;
*or1 = rr - ii;
*oi1 = ir + ri;
}
/*
 * Preprocess the strided input (xr, xi) of length n into buffer O and
 * compute the sum of all inputs.
 *
 * O receives the first input element (re, im), followed, for each i
 * with 1 <= i < n/2 rounded up, by four values: the sums
 * x[i] + x[n-i] (re, im) and the differences x[i] - x[n-i] (re, im).
 * This is the layout that cdot() consumes.
 *
 * *pr and *pi receive the sum of the first element and all the pairwise
 * sums; with n odd (as applicable() requires) that is the sum of all
 * inputs.  apply() passes the output pointers ro/io here, so this value
 * is written to output index 0.
 */
static void hartley(INT n, const R *xr, const R *xi, INT xs, E *o,
R *pr, R *pi)
{
INT i;
E sr, si;
o[0] = sr = xr[0]; o[1] = si = xi[0]; o += 2;
for (i = 1; i + i < n; ++i) {
sr += (o[0] = xr[i * xs] + xr[(n - i) * xs]);
si += (o[1] = xi[i * xs] + xi[(n - i) * xs]);
o[2] = xr[i * xs] - xr[(n - i) * xs];
o[3] = xi[i * xs] - xi[(n - i) * xs];
o += 4;
}
*pr = sr;
*pi = si;
}
/*
 * Execute the generic DFT: hartley() fills a scratch buffer and writes
 * output 0; then each cdot() call produces the output pair (k, n - k),
 * consuming n - 1 reals of the twiddle table per call.
 */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *pln = (const P *) ego_;
     INT n = pln->n, is = pln->is, os = pln->os;
     const R *row = pln->td->W;
     E *buf;
     size_t bufsz = n * 2 * sizeof(E);
     INT k;

     BUF_ALLOC(E *, buf, bufsz);

     hartley(n, ri, ii, is, buf, ro, io);

     for (k = 1; k + k < n; ++k, row += n - 1) {
          const INT lo = k * os;
          const INT hi = (n - k) * os;

          cdot(n, buf, row, ro + lo, io + lo, ro + hi, io + hi);
     }

     BUF_FREE(buf, bufsz);
}
/*
 * Wake-up/sleep hook: hands the plan's twiddle slot to X(twiddle_awake)
 * with a TW_HALF twiddle program and the sizes (n, n, (n - 1) / 2).
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     static const tw_instr half_tw[] = {
          { TW_HALF, 1, 0 },
          { TW_NEXT, 1, 0 }
     };
     P *pln = (P *) ego_;
     const INT n = pln->n;

     X(twiddle_awake)(wakefulness, &pln->td, half_tw, n, n, (n - 1) / 2);
}
/* Print the plan with its transform size. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;

     p->print(p, "(dft-generic-%D)", pln->n);
}
static int applicable(const solver *ego, const problem *p_,
const planner *plnr)
{
const problem_dft *p = (const problem_dft *) p_;
UNUSED(ego);
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
&& (p->sz->dims[0].n % 2) == 1
&& CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
&& X(is_prime)(p->sz->dims[0].n)
);
}
/*
 * Create a generic-DFT plan for problem P_, or return a null plan if
 * the solver does not apply.  The twiddle table is left null here and
 * created by awake().
 *
 * The quadratic fma estimate is formed in double precision; the
 * original computed (n-1) * (n-1) in INT arithmetic, which can overflow
 * for large n before being stored in the op count.
 */
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
     const problem_dft *p;
     P *pln;
     INT n;

     static const plan_adt padt = {
          X(dft_solve), awake, print, X(plan_null_destroy)
     };

     if (!applicable(ego, p_, plnr))
          return (plan *)0;

     pln = MKPLAN_DFT(P, &padt, apply);

     p = (const problem_dft *) p_;
     pln->n = n = p->sz->dims[0].n;
     pln->is = p->sz->dims[0].is;
     pln->os = p->sz->dims[0].os;
     pln->td = 0;

     pln->super.super.ops.add = (n-1) * 5;
     pln->super.super.ops.mul = 0;
     /* quadratic term: multiply as doubles to avoid integer overflow */
     pln->super.super.ops.fma = (double)(n-1) * (double)(n-1);
#if 0 /* these are nice pipelined sequential loads and should cost nothing */
     pln->super.super.ops.other = (n-1)*(4 + 1 + 2 * (n-1));  /* approximate */
#endif

     return &(pln->super.super);
}
/* Allocate the generic DFT solver. */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *s = MKSOLVER(S, &sadt);

     return &(s->super);
}
/* Register the generic DFT solver with planner P. */
void X(dft_generic_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
+234
View File
@@ -0,0 +1,234 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* solvers/plans for vectors of DFTs corresponding to the columns
of a matrix: first transpose the matrix so that the DFTs are
contiguous, then do DFTs with transposed output. In particular,
we restrict ourselves to the case of a square transpose (or a
sequence thereof). */
#include "dft/dft.h"
/* this solver carries no state beyond the base solver record */
typedef solver S;
/* Plan record for the indirect-transpose solver. */
typedef struct {
plan_dft super;
/* number of transpose+DFT blocks and the input/output pointer advance
   between consecutive blocks */
INT vl, ivs, ovs;
/* child plans: the rank-0 transposing copy, the DFT applied in place on
   the output, and the plan for whatever part of the vector dimension
   the blocks do not cover */
plan *cldtrans, *cld, *cldrest;
} P;
/*
 * Execute the plan.  For each of the vl blocks: transpose out-of-place
 * from input to output, then run the DFT in place on the output, and
 * advance the pointers by ivs/ovs.  Finally run the "rest" plan at the
 * advanced pointers.
 */
static void apply_op(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *pln = (const P *) ego_;
     plan_dft *trans = (plan_dft *) pln->cldtrans;
     plan_dft *dft = (plan_dft *) pln->cld;
     plan_dft *rest = (plan_dft *) pln->cldrest;
     const INT ivs = pln->ivs, ovs = pln->ovs;
     INT left;

     for (left = pln->vl; left > 0; --left) {
          trans->apply(pln->cldtrans, ri, ii, ro, io);
          dft->apply(pln->cld, ro, io, ro, io);

          ri += ivs; ii += ivs;
          ro += ovs; io += ovs;
     }

     rest->apply(pln->cldrest, ri, ii, ro, io);
}
/* Destroy the three child plans: rest, DFT, then transpose. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cldrest);
     X(plan_destroy_internal)(pln->cld);
     X(plan_destroy_internal)(pln->cldtrans);
}
/* Forward the wake-up/sleep request to the three child plans. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cldtrans, wakefulness);
     X(plan_awake)(pln->cld, wakefulness);
     X(plan_awake)(pln->cldrest, wakefulness);
}
/* Print the plan: block count followed by the three child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;

     p->print(p, "(indirect-transpose%v%(%p%)%(%p%)%(%p%))",
              pln->vl, pln->cldtrans, pln->cld, pln->cldrest);
}
/*
 * Choose a vector dimension (index into VS, returned in *pdim0) and a
 * transform dimension (index into S, returned in *pdim1) to transpose.
 *
 * A pair qualifies when the vector dimension's extent, n * |is|, does
 * not exceed the transform dimension's |is|, and the vector dimension
 * has at least as many elements as the transform dimension.  The first
 * qualifying pair is taken; a later pair replaces it only if its vector
 * stride is no larger and its transform stride is no smaller (in
 * magnitude) than those of the current choice.
 *
 * Returns nonzero if a pair was found; otherwise both outputs are -1.
 */
static int pickdim(const tensor *vs, const tensor *s, int *pdim0, int *pdim1)
{
int dim0, dim1;
*pdim0 = *pdim1 = -1;
for (dim0 = 0; dim0 < vs->rnk; ++dim0)
for (dim1 = 0; dim1 < s->rnk; ++dim1)
if (vs->dims[dim0].n * X(iabs)(vs->dims[dim0].is) <= X(iabs)(s->dims[dim1].is)
&& vs->dims[dim0].n >= s->dims[dim1].n
/* *pdim0 and *pdim1 are only dereferenced as indices once a
   previous candidate has been stored */
&& (*pdim0 == -1
|| (X(iabs)(vs->dims[dim0].is) <= X(iabs)(vs->dims[*pdim0].is)
&& X(iabs)(s->dims[dim1].is) >= X(iabs)(s->dims[*pdim1].is)))) {
*pdim0 = dim0;
*pdim1 = dim1;
}
return (*pdim0 != -1 && *pdim1 != -1);
}
/*
 * Hard requirements of the indirect-transpose solver: both tensors have
 * finite rank, X(tensor_inplace_strides2) accepts them, pickdim() finds
 * a dimension pair (stored in *pdim0 / *pdim1), and the chosen
 * transform dimension's output stride differs from the chosen vector
 * dimension's input stride.  Returns nonzero if all hold.
 */
static int applicable0(const solver *ego_, const problem *p_,
const planner *plnr,
int *pdim0, int *pdim1)
{
const problem_dft *p = (const problem_dft *) p_;
UNUSED(ego_); UNUSED(plnr);
return (1
&& FINITE_RNK(p->vecsz->rnk) && FINITE_RNK(p->sz->rnk)
/* FIXME: can/should we relax this constraint? */
&& X(tensor_inplace_strides2)(p->vecsz, p->sz)
&& pickdim(p->vecsz, p->sz, pdim0, pdim1)
/* output should not *already* include the transpose
(in which case we duplicate the regular indirect.c) */
&& (p->sz->dims[*pdim1].os != p->vecsz->dims[*pdim0].is)
);
}
/*
 * Full applicability test: the hard requirements of applicable0 plus
 * two planner-flag heuristics (NO_UGLYP and NO_INDIRECT_OP_P).
 * On success *pdim0 / *pdim1 hold the dimensions chosen by pickdim().
 */
static int applicable(const solver *ego_, const problem *p_,
const planner *plnr,
int *pdim0, int *pdim1)
{
if (!applicable0(ego_, p_, plnr, pdim0, pdim1)) return 0;
{
const problem_dft *p = (const problem_dft *) p_;
/* u is 2 when the real and imaginary input pointers are adjacent
   (interleaved complex data), otherwise 1 */
INT u = p->ri == p->ii + 1 || p->ii == p->ri + 1 ? (INT)2 : (INT)1;
/* UGLY if does not result in contiguous transforms or
transforms of contiguous vectors (since the latter at
least have efficient transpositions) */
if (NO_UGLYP(plnr)
&& p->vecsz->dims[*pdim0].is != u
&& !(p->vecsz->rnk == 2
&& p->vecsz->dims[1-*pdim0].is == u
&& p->vecsz->dims[*pdim0].is
== u * p->vecsz->dims[1-*pdim0].n))
return 0;
/* with NO_INDIRECT_OP_P set, out-of-place problems are rejected */
if (NO_INDIRECT_OP_P(plnr) && p->ri != p->ro) return 0;
}
return 1;
}
/*
 * Create an indirect-transpose plan, or return a null plan if the
 * solver does not apply or any of the three child problems cannot be
 * planned.
 *
 * The chosen vector dimension (pdim0) is cut into vl blocks, each as
 * long as the chosen transform dimension (pdim1).  Three children are
 * planned:
 *   cldtrans - rank-0 copy from input to output that exchanges the two
 *              chosen dimensions;
 *   cld      - the DFT applied in place on the output, with the two
 *              dimensions' input strides exchanged;
 *   cldrest  - the original problem restricted to the part of the
 *              vector dimension not covered by the vl blocks.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const problem_dft *p = (const problem_dft *) p_;
P *pln;
plan *cld = 0, *cldtrans = 0, *cldrest = 0;
int pdim0, pdim1;
tensor *ts, *tv;
INT vl, ivs, ovs;
R *rit, *iit, *rot, *iot;
static const plan_adt padt = {
X(dft_solve), awake, print, destroy
};
if (!applicable(ego_, p_, plnr, &pdim0, &pdim1))
return (plan *) 0;
/* number of whole blocks; pickdim() guarantees the vector dimension is
   at least as long as the transform dimension */
vl = p->vecsz->dims[pdim0].n / p->sz->dims[pdim1].n;
A(vl >= 1);
/* pointer advance from one block to the next */
ivs = p->sz->dims[pdim1].n * p->vecsz->dims[pdim0].is;
ovs = p->sz->dims[pdim1].n * p->vecsz->dims[pdim0].os;
/* the block plans are applied at vl different addresses, so their
   pointers are passed through TAINT with the block stride (0 when
   there is only one block) */
rit = TAINT(p->ri, vl == 1 ? 0 : ivs);
iit = TAINT(p->ii, vl == 1 ? 0 : ivs);
rot = TAINT(p->ro, vl == 1 ? 0 : ovs);
iot = TAINT(p->io, vl == 1 ? 0 : ovs);
/* transposing copy: the two chosen dimensions exchange output strides,
   and the vector dimension is shortened to one block */
ts = X(tensor_copy_inplace)(p->sz, INPLACE_IS);
ts->dims[pdim1].os = p->vecsz->dims[pdim0].is;
tv = X(tensor_copy_inplace)(p->vecsz, INPLACE_IS);
tv->dims[pdim0].os = p->sz->dims[pdim1].is;
tv->dims[pdim0].n = p->sz->dims[pdim1].n;
cldtrans = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(X(mktensor_0d)(),
X(tensor_append)(tv, ts),
rit, iit,
rot, iot));
/* ts and tv were only inputs to tensor_append and are freed here */
X(tensor_destroy2)(ts, tv);
if (!cldtrans) goto nada;
/* DFT on the transposed data in the output array; here ts and tv are
   handed to the problem constructor and not destroyed in this function */
ts = X(tensor_copy)(p->sz);
ts->dims[pdim1].is = p->vecsz->dims[pdim0].is;
tv = X(tensor_copy)(p->vecsz);
tv->dims[pdim0].is = p->sz->dims[pdim1].is;
tv->dims[pdim0].n = p->sz->dims[pdim1].n;
cld = X(mkplan_d)(plnr, X(mkproblem_dft_d)(ts, tv,
rot, iot,
rot, iot));
if (!cld) goto nada;
/* remainder: the original problem over the leftover part of the
   vector dimension, starting after the vl blocks */
tv = X(tensor_copy)(p->vecsz);
tv->dims[pdim0].n -= vl * p->sz->dims[pdim1].n;
cldrest = X(mkplan_d)(plnr, X(mkproblem_dft_d)(X(tensor_copy)(p->sz), tv,
p->ri + ivs * vl,
p->ii + ivs * vl,
p->ro + ovs * vl,
p->io + ovs * vl));
if (!cldrest) goto nada;
pln = MKPLAN_DFT(P, &padt, apply_op);
pln->cldtrans = cldtrans;
pln->cld = cld;
pln->cldrest = cldrest;
pln->vl = vl;
pln->ivs = ivs;
pln->ovs = ovs;
/* cost: rest plan once, plus vl times (DFT + transpose) */
X(ops_cpy)(&cldrest->ops, &pln->super.super.ops);
X(ops_madd2)(vl, &cld->ops, &pln->super.super.ops);
X(ops_madd2)(vl, &cldtrans->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
/* children that were never created are still null at this point */
X(plan_destroy_internal)(cldrest);
X(plan_destroy_internal)(cld);
X(plan_destroy_internal)(cldtrans);
return (plan *)0;
}
/* Allocate the indirect-transpose solver (S is the plain solver type). */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };

     return MKSOLVER(S, &sadt);
}
/* Register the indirect-transpose solver with planner P. */
void X(dft_indirect_transpose_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
+240
View File
@@ -0,0 +1,240 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* solvers/plans for vectors of small DFT's that cannot be done
in-place directly. Use a rank-0 plan to rearrange the data
before or after the transform. Can also change an out-of-place
plan into a copy + in-place (where the in-place transform
is e.g. unit stride). */
/* FIXME: merge with rank-geq2.c(?), since this is just a special case
of a rank split where the first/second transform has rank 0. */
#include "dft/dft.h"
/* Signature of a routine that derives the child transform problem from
   the original problem. */
typedef problem *(*mkcld_t) (const problem_dft *p);
/* Per-variant table ("before" vs. "after"); initialized positionally by
   adt_before / adt_after, so the member order matters. */
typedef struct {
dftapply apply; /* executor that mkplan() installs in the plan */
problem *(*mkcld)(const problem_dft *p); /* builds the child transform problem */
const char *nam; /* variant name emitted by print() */
} ndrct_adt;
/* Solver: generic solver header plus the variant it implements. */
typedef struct {
solver super;
const ndrct_adt *adt;
} S;
/* Plan: cldcpy is the child whose transform tensor is empty (pure data
   movement from input to output arrays); cld is the child that performs
   the actual transform. */
typedef struct {
plan_dft super;
plan *cldcpy, *cld;
const S *slv; /* creating solver; print() reads its variant name */
} P;
/*-----------------------------------------------------------------------*/
/* "before" variant: first rearrange (input arrays -> output arrays),
   then transform in place on the output arrays. */
static void apply_before(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     const plan *stage;

     /* stage 1: the copy child moves (ri, ii) into (ro, io) */
     stage = self->cldcpy;
     ((const plan_dft *) stage)->apply(stage, ri, ii, ro, io);

     /* stage 2: the transform child works entirely within (ro, io) */
     stage = self->cld;
     ((const plan_dft *) stage)->apply(stage, ro, io, ro, io);
}
static problem *mkcld_before(const problem_dft *p)
{
return X(mkproblem_dft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_OS),
X(tensor_copy_inplace)(p->vecsz, INPLACE_OS),
p->ro, p->io, p->ro, p->io);
}
/* Variant table for "rearrange first": pairs apply_before with
   mkcld_before; the string is the name print() emits for such plans. */
static const ndrct_adt adt_before =
{
apply_before, mkcld_before, "dft-indirect-before"
};
/*-----------------------------------------------------------------------*/
/* "after" variant: first transform in place on the input arrays, then
   rearrange (input arrays -> output arrays). */
static void apply_after(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     const plan *stage;

     /* stage 1: the transform child works entirely within (ri, ii) */
     stage = self->cld;
     ((const plan_dft *) stage)->apply(stage, ri, ii, ri, ii);

     /* stage 2: the copy child moves the result into (ro, io) */
     stage = self->cldcpy;
     ((const plan_dft *) stage)->apply(stage, ri, ii, ro, io);
}
static problem *mkcld_after(const problem_dft *p)
{
return X(mkproblem_dft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_IS),
X(tensor_copy_inplace)(p->vecsz, INPLACE_IS),
p->ri, p->ii, p->ri, p->ii);
}
/* Variant table for "transform first": pairs apply_after with
   mkcld_after; the string is the name print() emits for such plans. */
static const ndrct_adt adt_after =
{
apply_after, mkcld_after, "dft-indirect-after"
};
/*-----------------------------------------------------------------------*/
/* Plan destructor: release the two child plans, transform child first. */
static void destroy(plan *ego_)
{
     const P *self = (const P *) ego_;

     X(plan_destroy_internal)(self->cld);
     X(plan_destroy_internal)(self->cldcpy);
}
/* Forward the wakefulness change to both children, copy child first. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[2];
     int i;

     kids[0] = self->cldcpy;
     kids[1] = self->cld;
     for (i = 0; i < 2; ++i)
          X(plan_awake)(kids[i], wakefulness);
}
/* Print the plan as "(<variant-name><transform child><copy child>)". */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const char *variant = self->slv->adt->nam;

     p->print(p, "(%s%(%p%)%(%p%))", variant, self->cld, self->cldcpy);
}
/* Structural applicability test (planner flags other than DESTROY_INPUT
   are handled by applicable()).  Returns 1 if this variant may be used
   for the problem, 0 otherwise. */
static int applicable0(const solver *ego_, const problem *p_,
                       const planner *plnr)
{
     const S *ego = (const S *) ego_;
     const problem_dft *p = (const problem_dft *) p_;

     if (!FINITE_RNK(p->vecsz->rnk))
          return 0;

     /* problem must be a nontrivial transform, not just a copy */
     if (!(p->sz->rnk > 0))
          return 0;

     if (p->ri == p->ro) {
          /* in-place problems must require some rearrangement of the
             data; to prevent infinite loops with indirect-transpose,
             at least some transform strides must also decrease */
          return (!X(tensor_inplace_strides2)(p->sz, p->vecsz)
                  && X(tensor_strides_decrease)(
                       p->sz, p->vecsz,
                       ego->adt->apply == apply_after ?
                       INPLACE_IS : INPLACE_OS));
     }

     /* out of place, from stride 1/2 to a bigger stride: apply_after
        only, and only if the input may be destroyed */
     if (ego->adt->apply == apply_after
         && !NO_DESTROY_INPUTP(plnr)
         && X(tensor_min_istride)(p->sz) <= 2
         && X(tensor_min_ostride)(p->sz) > 2)
          return 1;

     /* out of place, to stride 1/2 from a bigger stride: apply_before
        only */
     if (ego->adt->apply == apply_before
         && X(tensor_min_ostride)(p->sz) <= 2
         && X(tensor_min_istride)(p->sz) > 2)
          return 1;

     return 0;
}
/* Full applicability test: applicable0() must accept the problem, and
   out-of-place problems are rejected when the planner sets
   NO_INDIRECT_OP. */
static int applicable(const solver *ego_, const problem *p_,
                      const planner *plnr)
{
     const problem_dft *p = (const problem_dft *) p_;

     return applicable0(ego_, p_, plnr)
          && !(NO_INDIRECT_OP_P(plnr) && p->ri != p->ro);
}
/* Build an indirect plan: one child that only moves the data from the
   input arrays to the output arrays, plus the variant's child
   transform.  Returns 0 if the solver is not applicable or if either
   child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const S *self = (const S *) ego_;
     const problem_dft *prb = (const problem_dft *) p_;
     plan *xform = 0, *copy = 0;
     P *pln;

     if (!applicable(ego_, p_, plnr))
          return (plan *) 0;

     /* the copy is a DFT problem with an empty transform tensor whose
        vector tensor is vecsz followed by sz */
     copy = X(mkplan_d)(plnr,
                        X(mkproblem_dft_d)(
                             X(mktensor_0d)(),
                             X(tensor_append)(prb->vecsz, prb->sz),
                             prb->ri, prb->ii, prb->ro, prb->io));

     /* the transform is planned only once the copy is known to exist */
     if (copy)
          xform = X(mkplan_f_d)(plnr, self->adt->mkcld(prb),
                                NO_BUFFERING, 0, 0);

     if (!copy || !xform) {
          X(plan_destroy_internal)(xform);
          X(plan_destroy_internal)(copy);
          return (plan *) 0;
     }

     pln = MKPLAN_DFT(P, &padt, self->adt->apply);
     pln->cld = xform;
     pln->cldcpy = copy;
     pln->slv = self;
     X(ops_add)(&xform->ops, &copy->ops, &pln->super.super.ops);

     return &(pln->super.super);
}
/* Create a solver bound to the given variant table. */
static solver *mksolver(const ndrct_adt *adt)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->adt = adt;
     return &(self->super);
}
/* Register one solver per variant, "before" first and "after" second. */
void X(dft_indirect_register)(planner *p)
{
     REGISTER_SOLVER(p, mksolver(&adt_before));
     REGISTER_SOLVER(p, mksolver(&adt_after));
}
+27
View File
@@ -0,0 +1,27 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/ct.h"
/* Register twiddle codelet `codelet` (described by desc) with planner p
   by forwarding to X(regsolver_ct_directw) with the DECDIF tag. */
void X(kdft_dif_register)(planner *p, kdftw codelet, const ct_desc *desc)
{
X(regsolver_ct_directw)(p, codelet, desc, DECDIF);
}
+27
View File
@@ -0,0 +1,27 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/ct.h"
/* Register the kdftwsq codelet k (described by desc) with planner p by
   forwarding to X(regsolver_ct_directwsq) with the DECDIF tag. */
void X(kdft_difsq_register)(planner *p, kdftwsq k, const ct_desc *desc)
{
X(regsolver_ct_directwsq)(p, k, desc, DECDIF);
}
+27
View File
@@ -0,0 +1,27 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/ct.h"
/* Register twiddle codelet `codelet` (described by desc) with planner p
   by forwarding to X(regsolver_ct_directw) with the DECDIT tag. */
void X(kdft_dit_register)(planner *p, kdftw codelet, const ct_desc *desc)
{
X(regsolver_ct_directw)(p, codelet, desc, DECDIT);
}
+28
View File
@@ -0,0 +1,28 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
/* Register a kdft codelet with planner p twice: once wrapped by
   X(mksolver_dft_direct) and once by X(mksolver_dft_directbuf). */
void X(kdft_register)(planner *p, kdft codelet, const kdft_desc *desc)
{
REGISTER_SOLVER(p, X(mksolver_dft_direct)(codelet, desc));
REGISTER_SOLVER(p, X(mksolver_dft_directbuf)(codelet, desc));
}
+86
View File
@@ -0,0 +1,86 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for vrank -infty DFTs (nothing to do) */
#include "dft/dft.h"
/* Executor of the no-op plan: deliberately does nothing.  Each argument
   is only passed to UNUSED(). */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
UNUSED(ego_);
UNUSED(ri);
UNUSED(ii);
UNUSED(ro);
UNUSED(io);
}
/* A problem needs no work in two cases; returns 1 for either, else 0. */
static int applicable(const solver *ego_, const problem *p_)
{
     const problem_dft *p = (const problem_dft *) p_;
     UNUSED(ego_);

     /* case 1 : -infty vector rank */
     if (!FINITE_RNK(p->vecsz->rnk))
          return 1;

     /* case 2 : rank-0 in-place dft (the vector rank is finite here) */
     return (p->sz->rnk == 0
             && p->ro == p->ri
             && X(tensor_inplace_strides)(p->vecsz));
}
/* Print the fixed tag "(dft-nop)"; the plan itself carries no data to
   show, so ego is unused. */
static void print(const plan *ego, printer *p)
{
UNUSED(ego);
p->print(p, "(dft-nop)");
}
/* Create a no-op plan with a zeroed operation count when applicable()
   accepts the problem; otherwise return 0.  The planner is not used. */
static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), X(null_awake), print, X(plan_null_destroy)
     };
     UNUSED(plnr);

     if (applicable(ego, p)) {
          plan_dft *nop = MKPLAN_DFT(plan_dft, &padt, apply);

          X(ops_zero)(&nop->super.ops);
          return &(nop->super);
     }
     return (plan *) 0;
}
/* Create the no-op solver.  It needs no private state, so a plain
   `solver` is allocated and returned directly. */
static solver *mksolver(void)
{
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
return MKSOLVER(solver, &sadt);
}
/* Public registration hook: register the no-op solver with planner p. */
void X(dft_nop_register)(planner *p)
{
REGISTER_SOLVER(p, mksolver());
}
+32
View File
@@ -0,0 +1,32 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
/* Allocate a plan of `size` bytes through X(mkplan), store the DFT
   executor in it, and return it as a generic plan. */
plan *X(mkplan_dft)(size_t size, const plan_adt *adt, dftapply apply)
{
     plan_dft *pln = (plan_dft *) X(mkplan)(size, adt);

     pln->apply = apply;
     return &(pln->super);
}
+121
View File
@@ -0,0 +1,121 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
#include <stddef.h>
/* Problem destructor: release both tensors, then the problem itself. */
static void destroy(problem *ego_)
{
     problem_dft *self = (problem_dft *) ego_;

     X(tensor_destroy2)(self->vecsz, self->sz);
     X(ifree)(self);
}
/* Feed the problem's identifying data into the md5 state: a "dft" tag,
   the in-place flag, the imaginary-minus-real pointer offsets, the
   alignment of each of the four pointers, and both tensors. */
static void hash(const problem *p_, md5 *m)
{
     const problem_dft *prb = (const problem_dft *) p_;
     R *ptrs[4];
     int i;

     /* hashing order of the alignments: ri, ii, ro, io */
     ptrs[0] = prb->ri;
     ptrs[1] = prb->ii;
     ptrs[2] = prb->ro;
     ptrs[3] = prb->io;

     X(md5puts)(m, "dft");
     X(md5int)(m, prb->ri == prb->ro);
     X(md5INT)(m, prb->ii - prb->ri);
     X(md5INT)(m, prb->io - prb->ro);
     for (i = 0; i < 4; ++i)
          X(md5int)(m, X(ialignment_of)(ptrs[i]));
     X(tensor_md5)(m, prb->sz);
     X(tensor_md5)(m, prb->vecsz);
}
/* Print the problem as "(dft <inplace> <align ri> <align ro> <ii-ri>
   <io-ro> <sz> <vecsz>)". */
static void print(const problem *ego_, printer *p)
{
     const problem_dft *prb = (const problem_dft *) ego_;
     const INT in_gap = (INT)(prb->ii - prb->ri);
     const INT out_gap = (INT)(prb->io - prb->ro);

     p->print(p, "(dft %d %d %d %D %D %T %T)",
              prb->ri == prb->ro,
              X(ialignment_of)(prb->ri),
              X(ialignment_of)(prb->ro),
              in_gap,
              out_gap,
              prb->sz,
              prb->vecsz);
}
/* Zero the problem's input arrays over the combined (vecsz, sz) index
   space, using the untainted input pointers. */
static void zero(const problem *ego_)
{
     const problem_dft *prb = (const problem_dft *) ego_;
     tensor *whole = X(tensor_append)(prb->vecsz, prb->sz);

     X(dft_zerotens)(whole, UNTAINT(prb->ri), UNTAINT(prb->ii));
     X(tensor_destroy)(whole);
}
/* Method table for DFT problems, initialized positionally: problem kind
   followed by the hash, zero, print and destroy routines of this file. */
static const problem_adt padt =
{
PROBLEM_DFT,
hash,
zero,
print,
destroy
};
/* Create a DFT problem from transform tensor sz, vector tensor vecsz and
   the four data pointers.  The tensors are not consumed; compressed
   copies are stored.  A partially in-place problem, or an in-place one
   rejected by X(tensor_inplace_locations), yields the unsolvable
   problem instead. */
problem *X(mkproblem_dft)(const tensor *sz, const tensor *vecsz,
                          R *ri, R *ii, R *ro, R *io)
{
     problem_dft *prb;
     int re_shared, im_shared;

     /* enforce pointer equality if untainted pointers are equal */
     if (UNTAINT(ri) == UNTAINT(ro))
          ri = ro = JOIN_TAINT(ri, ro);
     if (UNTAINT(ii) == UNTAINT(io))
          ii = io = JOIN_TAINT(ii, io);

     /* more correctness conditions: */
     A(TAINTOF(ri) == TAINTOF(ii));
     A(TAINTOF(ro) == TAINTOF(io));
     A(X(tensor_kosherp)(sz));
     A(X(tensor_kosherp)(vecsz));

     re_shared = (ri == ro);
     im_shared = (ii == io);
     if (re_shared || im_shared) {
          /* If either real or imag pointers are in place, both must be,
             and the tensors must describe matching locations. */
          if (!(re_shared && im_shared
                && X(tensor_inplace_locations)(sz, vecsz)))
               return X(mkproblem_unsolvable)();
     }

     prb = (problem_dft *) X(mkproblem)(sizeof(problem_dft), &padt);
     prb->sz = X(tensor_compress)(sz);
     prb->vecsz = X(tensor_compress_contiguous)(vecsz);
     prb->ri = ri;
     prb->ii = ii;
     prb->ro = ro;
     prb->io = io;

     A(FINITE_RNK(prb->sz->rnk));
     return &(prb->super);
}
/* Variant of X(mkproblem_dft) that takes ownership of sz and vecsz:
   both tensors are destroyed before returning. */
problem *X(mkproblem_dft_d)(tensor *sz, tensor *vecsz,
                            R *ri, R *ii, R *ro, R *io)
{
     problem *made = X(mkproblem_dft)(sz, vecsz, ri, ii, ro, io);

     X(tensor_destroy2)(vecsz, sz);
     return made;
}
+327
View File
@@ -0,0 +1,327 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
/*
* Compute transforms of prime sizes using Rader's trick: turn them
* into convolutions of size n - 1, which you then perform via a pair
* of FFTs.
*/
/* Rader solver: nothing beyond the generic solver header. */
typedef struct {
solver super;
} S;
/* Rader plan. */
typedef struct {
plan_dft super;
/* cld1: DFT from the scratch buffer into the output; cld2: DFT from the
   output back into the scratch buffer (see mkP) */
plan *cld1, *cld2;
/* table built by mkomega(); 0 until awake() creates it and again after
   a SLEEPY awake() */
R *omega;
/* n: transform size; g: value of X(find_generator)(n); ginv: g^(n-2)
   mod n, asserted in awake() to be the inverse of g mod n */
INT n, g, ginv;
/* input / output strides of the transform dimension */
INT is, os;
/* in-place plan that mkomega() applies to the omega table */
plan *cld_omega;
} P;
/* list of omega tables, searched and updated via the X(rader_tl_*) calls */
static rader_tl *omegas = 0;
/* Return the omega table for (n, ginv).  A table already present in the
   `omegas` list is reused; otherwise a new one of n-1 interleaved
   (re, im) pairs is filled, transformed in place with plan p_, inserted
   into the list and returned. */
static R *mkomega(enum wakefulness wakefulness, plan *p_, INT n, INT ginv)
{
     plan_dft *xform = (plan_dft *) p_;
     R *tbl;
     INT k, gpower;
     trigreal scale;
     triggen *gen;

     tbl = X(rader_tl_find)(n, n, ginv, omegas);
     if (tbl)
          return tbl;

     tbl = (R *) MALLOC(sizeof(R) * (n - 1) * 2, TWIDDLES);

     scale = n - 1.0; /* normalization for convolution */

     gen = X(mktriggen)(wakefulness, n);
     gpower = 1;
     for (k = 0; k < n - 1; ++k) {
          trigreal w[2];

          /* entry k uses the exponent ginv^k mod n */
          gen->cexpl(gen, gpower, w);
          tbl[2*k] = w[0] / scale;
          tbl[2*k+1] = FFT_SIGN * w[1] / scale;

          gpower = MULMOD(gpower, ginv, n);
     }
     X(triggen_destroy)(gen);
     A(gpower == 1);

     /* transform the table in place */
     xform->apply(p_, tbl, tbl + 1, tbl, tbl + 1);

     X(rader_tl_insert)(n, n, ginv, tbl, &omegas);
     return tbl;
}
/* Give an omega table back by handing it to X(rader_tl_delete) together
   with the `omegas` list. */
static void free_omega(R *omega)
{
X(rader_tl_delete)(omega, &omegas);
}
/***************************************************************************/
/* Below, we extensively use the identity that fft(x*)* = ifft(x) in
order to share data between forward and backward transforms and to
obviate the necessity of having separate forward and backward
plans. (Although we often compute separate plans these days anyway
due to the differing strides, etcetera.)
Of course, since the new FFTW gives us separate pointers to
the real and imaginary parts, we could have instead used the
fft(r,i) = ifft(i,r) form of this identity, but it was easier to
reuse the code from our old version. */
/* Execute one DFT of size ego->n: gather inputs 1..n-1 in powers-of-g
   order into a scratch buffer, transform the buffer into the output with
   cld1, multiply pointwise by omega, transform back into the buffer with
   cld2, and scatter the buffer to the output in powers-of-ginv order.
   The imaginary parts are negated around the second transform, in line
   with the fft(x*)* = ifft(x) identity described in the comment above. */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
const P *ego = (const P *) ego_;
INT is, os;
INT k, gpower, g, r;
R *buf;
/* input element 0 is read up front, before any output is written
   (presumably because the output may overlap the input -- TODO confirm) */
R r0 = ri[0], i0 = ii[0];
r = ego->n; is = ego->is; os = ego->os; g = ego->g;
/* scratch for r-1 interleaved (re, im) pairs; freed at the end */
buf = (R *) MALLOC(sizeof(R) * (r - 1) * 2, BUFFERS);
/* First, permute the input, storing in buf: */
for (gpower = 1, k = 0; k < r - 1; ++k, gpower = MULMOD(gpower, g, r)) {
R rA, iA;
rA = ri[gpower * is];
iA = ii[gpower * is];
buf[2*k] = rA; buf[2*k + 1] = iA;
}
/* gpower == g^(r-1) mod r == 1 */;
/* compute DFT of buf, storing in output (except DC): */
{
plan_dft *cld = (plan_dft *) ego->cld1;
cld->apply(ego->cld1, buf, buf+1, ro+os, io+os);
}
/* set output DC component: */
{
ro[0] = r0 + ro[os];
io[0] = i0 + io[os];
}
/* now, multiply by omega: */
{
const R *omega = ego->omega;
for (k = 0; k < r - 1; ++k) {
E rB, iB, rW, iW;
rW = omega[2*k];
iW = omega[2*k+1];
rB = ro[(k+1)*os];
iB = io[(k+1)*os];
ro[(k+1)*os] = rW * rB - iW * iB;
/* the imaginary part of the product is stored negated */
io[(k+1)*os] = -(rW * iB + iW * rB);
}
}
/* this will add input[0] to all of the outputs after the ifft */
ro[os] += r0;
io[os] -= i0;
/* inverse FFT: */
{
plan_dft *cld = (plan_dft *) ego->cld2;
cld->apply(ego->cld2, ro+os, io+os, buf, buf+1);
}
/* finally, do inverse permutation to unshuffle the output: */
{
INT ginv = ego->ginv;
gpower = 1;
for (k = 0; k < r - 1; ++k, gpower = MULMOD(gpower, ginv, r)) {
ro[gpower * os] = buf[2*k];
/* undo the negation applied before the second transform */
io[gpower * os] = -buf[2*k+1];
}
A(gpower == 1);
}
X(ifree)(buf);
}
/***************************************************************************/
/* Forward the wakefulness change to the three children, then either drop
   the omega table (SLEEPY) or recompute g, ginv and the omega table (any
   other state). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld1, wakefulness);
     X(plan_awake)(self->cld2, wakefulness);
     X(plan_awake)(self->cld_omega, wakefulness);

     if (wakefulness == SLEEPY) {
          free_omega(self->omega);
          self->omega = 0;
          return;
     }

     self->g = X(find_generator)(self->n);
     self->ginv = X(power_mod)(self->g, self->n - 2, self->n);
     A(MULMOD(self->g, self->ginv, self->n) == 1);

     self->omega = mkomega(wakefulness,
                           self->cld_omega, self->n, self->ginv);
}
/* Plan destructor: release the children in the order omega plan, cld2,
   cld1. */
static void destroy(plan *ego_)
{
     const P *self = (const P *) ego_;

     X(plan_destroy_internal)(self->cld_omega);
     X(plan_destroy_internal)(self->cld2);
     X(plan_destroy_internal)(self->cld1);
}
/* Print size, strides and cld1, followed by cld2 and the omega plan only
   when they are not the same plan object as one already printed. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     plan *first = self->cld1;
     plan *second = self->cld2;
     plan *om = self->cld_omega;

     p->print(p, "(dft-rader-%D%ois=%oos=%(%p%)",
              self->n, self->is, self->os, first);
     if (second != first)
          p->print(p, "%(%p%)", second);
     if (om != first && om != second)
          p->print(p, "%(%p%)", om);
     p->putchr(p, ')');
}
static int applicable(const solver *ego_, const problem *p_,
const planner *plnr)
{
const problem_dft *p = (const problem_dft *) p_;
UNUSED(ego_);
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
&& X(is_prime)(p->sz->dims[0].n)
/* proclaim the solver SLOW if p-1 is not easily factorizable.
Bluestein should take care of this case. */
&& CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
);
}
/* Plan the three children of a Rader plan and fill in pln.  A scratch
   buffer is allocated only for the duration of planning.  Returns 1 on
   success; on failure everything created so far is released and 0 is
   returned with pln left unmodified. */
static int mkP(P *pln, INT n, INT is, INT os, R *ro, R *io,
               planner *plnr)
{
     plan *fwd = (plan *) 0;
     plan *back = (plan *) 0;
     plan *om = (plan *) 0;
     R *scratch;
     const INT m = n - 1;

     /* initial allocation for the purpose of planning */
     scratch = (R *) MALLOC(sizeof(R) * (n - 1) * 2, BUFFERS);

     /* scratch (stride 2) -> output (stride os), skipping output 0 */
     fwd = X(mkplan_f_d)(plnr,
                         X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, 2, os),
                                            X(mktensor_1d)(1, 0, 0),
                                            scratch, scratch + 1,
                                            ro + os, io + os),
                         NO_SLOW, 0, 0);

     /* output (stride os) -> scratch (stride 2) */
     if (fwd)
          back = X(mkplan_f_d)(plnr,
                               X(mkproblem_dft_d)(
                                    X(mktensor_1d)(n - 1, os, 2),
                                    X(mktensor_1d)(1, 0, 0),
                                    ro + os, io + os,
                                    scratch, scratch + 1),
                               NO_SLOW, 0, 0);

     /* plan for omega array: in place, stride 2 */
     if (back)
          om = X(mkplan_f_d)(plnr,
                             X(mkproblem_dft_d)(
                                  X(mktensor_1d)(n - 1, 2, 2),
                                  X(mktensor_1d)(1, 0, 0),
                                  scratch, scratch + 1,
                                  scratch, scratch + 1),
                             NO_SLOW, ESTIMATE, 0);

     if (!om) {
          /* some child could not be planned: undo everything */
          X(ifree0)(scratch);
          X(plan_destroy_internal)(om);
          X(plan_destroy_internal)(back);
          X(plan_destroy_internal)(fwd);
          return 0;
     }

     /* deallocate buffers; let awake() or apply() allocate them for real */
     X(ifree)(scratch);

     pln->cld1 = fwd;
     pln->cld2 = back;
     pln->cld_omega = om;
     pln->omega = 0;
     pln->n = n;
     pln->is = is;
     pln->os = os;

     /* children's cost plus the work done directly in apply() */
     X(ops_add)(&fwd->ops, &back->ops, &pln->super.super.ops);
     pln->super.super.ops.other += m * (4 * 2 + 6) + 6;
     pln->super.super.ops.add += m * 2 + 4;
     pln->super.super.ops.mul += m * 4;

     return 1;
}
/* Create a Rader plan for an applicable problem; returns 0 when the
   problem is not applicable or when mkP() cannot plan the children. */
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const problem_dft *prb = (const problem_dft *) p_;
     P *pln;

     if (!applicable(ego, p_, plnr))
          return (plan *) 0;

     pln = MKPLAN_DFT(P, &padt, apply);
     if (mkP(pln,
             prb->sz->dims[0].n, prb->sz->dims[0].is, prb->sz->dims[0].os,
             prb->ro, prb->io, plnr))
          return &(pln->super.super);

     /* planning the children failed: discard the bare plan object */
     X(ifree)(pln);
     return (plan *) 0;
}
/* Create the Rader solver and return it through its embedded generic
   `solver` header. */
static solver *mksolver(void)
{
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
return &(slv->super);
}
/* Public registration hook: register the Rader solver with planner p. */
void X(dft_rader_register)(planner *p)
{
REGISTER_SOLVER(p, mksolver());
}
+202
View File
@@ -0,0 +1,202 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for DFT of rank >= 2 (multidimensional) */
#include "dft/dft.h"
/* Rank-splitting solver. */
typedef struct {
solver super;
int spltrnk; /* split preference handed to X(pickdim) by picksplit() */
const int *buddies; /* the spltrnk values of all solvers registered together */
size_t nbuddies; /* number of entries in buddies */
} S;
/* Rank-splitting plan. */
typedef struct {
plan_dft super;
/* cld1: first pass, input arrays -> output arrays; cld2: second pass,
   in place on the output arrays (see apply) */
plan *cld1, *cld2;
const S *solver; /* creating solver; print() reads its spltrnk */
} P;
/* Compute the multi-dimensional DFT in two passes of lower rank: cld1
   goes from the input arrays to the output arrays, then cld2 runs in
   place on the output arrays. */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     const plan *pass;

     pass = self->cld1;
     ((const plan_dft *) pass)->apply(pass, ri, ii, ro, io);

     pass = self->cld2;
     ((const plan_dft *) pass)->apply(pass, ro, io, ro, io);
}
/* Forward the wakefulness change to both passes, cld1 first. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[2];
     int i;

     kids[0] = self->cld1;
     kids[1] = self->cld2;
     for (i = 0; i < 2; ++i)
          X(plan_awake)(kids[i], wakefulness);
}
/* Plan destructor: release the second pass, then the first. */
static void destroy(plan *ego_)
{
     const P *self = (const P *) ego_;

     X(plan_destroy_internal)(self->cld2);
     X(plan_destroy_internal)(self->cld1);
}
/* Print the plan as "(dft-rank>=2/<spltrnk><cld1><cld2>)". */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const int how = self->solver->spltrnk;

     p->print(p, "(dft-rank>=2/%d%(%p%)%(%p%))",
              how, self->cld1, self->cld2);
}
/* Choose where to split sz.  On success *rp holds the rank of the first
   part and 1 is returned; 0 is returned if X(pickdim) declines or if
   the split would not reduce the rank. */
static int picksplit(const S *ego, const tensor *sz, int *rp)
{
     A(sz->rnk > 1); /* cannot split rnk <= 1 */

     if (X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies, sz, 1, rp)) {
          *rp += 1; /* convert from dim. index to rank */
          return (*rp < sz->rnk); /* split must reduce rank */
     }
     return 0;
}
/* Structural test: both ranks finite, transform rank at least 2, and a
   usable split point (stored in *rp by picksplit). */
static int applicable0(const solver *ego_, const problem *p_, int *rp)
{
     const problem_dft *p = (const problem_dft *) p_;
     const S *ego = (const S *) ego_;

     if (!FINITE_RNK(p->sz->rnk) || !FINITE_RNK(p->vecsz->rnk))
          return 0;
     if (p->sz->rnk < 2)
          return 0;
     return picksplit(ego, p->sz, rp) ? 1 : 0;
}
/* TODO: revise this. */
/* Full applicability test: applicable0() plus the planner-flag
   restrictions below.  *rp receives the split rank. */
static int applicable(const solver *ego_, const problem *p_,
                      const planner *plnr, int *rp)
{
     const S *ego = (const S *) ego_;
     const problem_dft *p = (const problem_dft *) p_;

     if (!applicable0(ego_, p_, rp))
          return 0;

     /* with NO_RANK_SPLITS only the solver whose spltrnk equals the
        first buddy is allowed */
     if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0]))
          return 0;

     /* Heuristic: if the vector stride is greater than the transform
        sz, don't use (prefer to do the vector loop first with a
        vrank-geq1 plan). */
     if (NO_UGLYP(plnr)
         && p->vecsz->rnk > 0
         && X(tensor_min_stride)(p->vecsz) > X(tensor_max_index)(p->sz))
          return 0;

     return 1;
}
/* Build a rank-splitting plan.  The transform tensor is split into two
   parts at the rank chosen by applicable().  cld1 transforms the second
   part from the input to the output arrays, with the first part appended
   to the vector loop.  cld2 then transforms the first part in place on
   the output arrays, with the in-place copy of the second part appended
   to the in-place vector loop.  Returns 0 if the solver is not
   applicable or a child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const S *self = (const S *) ego_;
     const problem_dft *prb;
     tensor *part1, *part2, *vec_ip, *part2_ip;
     plan *first = 0, *second = 0;
     plan *result = (plan *) 0;
     int split;

     if (!applicable(ego_, p_, plnr, &split))
          return (plan *) 0;

     prb = (const problem_dft *) p_;
     X(tensor_split)(prb->sz, &part1, split, &part2);
     vec_ip = X(tensor_copy_inplace)(prb->vecsz, INPLACE_OS);
     part2_ip = X(tensor_copy_inplace)(part2, INPLACE_OS);

     first = X(mkplan_d)(plnr,
                         X(mkproblem_dft_d)(
                              X(tensor_copy)(part2),
                              X(tensor_append)(prb->vecsz, part1),
                              prb->ri, prb->ii, prb->ro, prb->io));

     /* the second pass is planned only if the first one exists */
     if (first)
          second = X(mkplan_d)(plnr,
                               X(mkproblem_dft_d)(
                                    X(tensor_copy_inplace)(part1, INPLACE_OS),
                                    X(tensor_append)(vec_ip, part2_ip),
                                    prb->ro, prb->io, prb->ro, prb->io));

     if (first && second) {
          P *pln = MKPLAN_DFT(P, &padt, apply);

          pln->cld1 = first;
          pln->cld2 = second;
          pln->solver = self;
          X(ops_add)(&first->ops, &second->ops, &pln->super.super.ops);
          result = &(pln->super.super);
     } else {
          X(plan_destroy_internal)(second);
          X(plan_destroy_internal)(first);
     }

     /* the four temporaries are released on every path */
     X(tensor_destroy4)(part1, part2, vec_ip, part2_ip);
     return result;
}
/* Create a rank-splitting solver with the given split preference and
   buddy list.  The buddies array is stored by pointer, not copied. */
static solver *mksolver(int spltrnk, const int *buddies, size_t nbuddies)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->buddies = buddies;
     self->nbuddies = nbuddies;
     self->spltrnk = spltrnk;
     return &(self->super);
}
/* Register one solver per entry of the buddies table; every solver gets
   the whole table as its buddy list. */
void X(dft_rank_geq2_register)(planner *p)
{
     static const int buddies[] = { 1, 0, -2 };
     const size_t count = NELEM(buddies);
     size_t k;

     for (k = 0; k < count; ++k)
          REGISTER_SOLVER(p, mksolver(buddies[k], buddies, count));

     /* FIXME:

        Should we try more buddies?

        Another possible variant is to swap cld1 and cld2 (or rather,
        to swap their problems; they are not interchangeable because
        cld2 must be in-place).  In past versions of FFTW, however, I
        seem to recall that such rearrangements have made little or no
        difference.
     */
}
+6
View File
@@ -0,0 +1,6 @@
# put the top of the source tree on the preprocessor include path
AM_CPPFLAGS = -I $(top_srcdir)
# also descend into the codelets subdirectory
SUBDIRS=codelets
# libtool library declared under noinst_, i.e. built but not installed
noinst_LTLIBRARIES = libdft_scalar.la
# sources and headers that make up libdft_scalar.la
libdft_scalar_la_SOURCES = n.c t.c f.h n.h q.h t.h
+757
View File
@@ -0,0 +1,757 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
# The next line is a placeholder filled in when this template is
# instantiated; VPATH then points make's file search at the source directory.
@SET_MAKE@
VPATH = @srcdir@
# am__is_gnu_make: shell command group used as a truth test.  It fails when
# MAKELEVEL is empty; otherwise it succeeds if MAKE_HOST is set, or if both
# MAKE_VERSION and CURDIR are set, and fails in every remaining case.
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
# am__make_running_with_option: shell fragment that succeeds when make was
# invoked with a given single-letter option.  The caller sets the shell
# variable target_option to exactly one character first; any other value
# prints an internal error to stderr and exits with status 1.
#
# The flags examined are MFLAGS when am__is_gnu_make succeeds, otherwise
# MAKEFLAGS with backslash-escaped whitespace removed by sed.  Words that
# contain '=' or start with '--' are ignored.  For the I, O and l options
# the helper strip_trailopt cuts the option letter and everything after it
# from the word; when such an option (or one of d, E, D, m, J, T) ends the
# word, the following word is skipped as its argument.  The fragment stops at
# the first remaining word that contains target_option.
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
# Subshell tests built on the fragment above: option n and option k.
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
# Package-specific subdirectories of the standard installation directories.
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
# cd prefixed with an explicit CDPATH assignment (presumably so that a
# CDPATH inherited from the user's environment cannot redirect the cd).
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
# install-sh invocations for data files, programs and scripts.
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
# Install/uninstall phase markers; all of them are the shell no-op ':'.
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
# Path of this directory relative to the top of the tree.
subdir = dft/scalar
# Files whose modification triggers the regeneration rules further down:
# aclocal.m4, the m4 macro files it depends on, and configure.ac.
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
# Build-system files that belong in a distribution of this directory.
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
# No configure-generated files need cleaning in this directory.
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
# Libraries built in this directory and the objects that make them up.
LTLIBRARIES = $(noinst_LTLIBRARIES)
libdft_scalar_la_LIBADD =
am_libdft_scalar_la_OBJECTS = n.lo t.lo
libdft_scalar_la_OBJECTS = $(am_libdft_scalar_la_OBJECTS)
# Verbosity helpers.  Each AM_V_* variable resolves through a name built from
# the configured verbosity value; the variant with an empty suffix falls back
# to the configured default.  The 0 variants give the quiet form and the 1
# variants are empty (AM_V_P instead yields false for 0 and ':' for 1).
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
# Include path: the current directory, an optional configure-supplied
# addition, and the top build directory.
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
# Dependency tracking: the depcomp wrapper script and the per-object
# dependency files of this directory.
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__maybe_remake_depfiles = depfiles
am__depfiles_remade = ./$(DEPDIR)/n.Plo ./$(DEPDIR)/t.Plo
am__mv = mv -f
# Plain and libtool-wrapped C compile commands.
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
# Libtool link command; the link driver is the C compiler.
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
# Source lists used for building and for distribution.
SOURCES = $(libdft_scalar_la_SOURCES)
DIST_SOURCES = $(libdft_scalar_la_SOURCES)
# Targets that are forwarded to every subdirectory by the
# $(am__recursive_targets) rule.
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
ctags-recursive dvi-recursive html-recursive info-recursive \
install-data-recursive install-dvi-recursive \
install-exec-recursive install-html-recursive \
install-info-recursive install-pdf-recursive \
install-ps-recursive install-recursive installcheck-recursive \
installdirs-recursive pdf-recursive ps-recursive \
tags-recursive uninstall-recursive
# Shell test: fails when AM_UPDATE_INFO_DIR is n, no or NO; otherwise it
# succeeds only if `install-info --version` runs successfully.
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
# Cleaning targets that are forwarded to subdirectories as well.
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
distclean-recursive maintainer-clean-recursive
am__recursive_targets = \
$(RECURSIVE_TARGETS) \
$(RECURSIVE_CLEAN_TARGETS) \
$(am__extra_recursive_targets)
# The same target names with the -recursive suffix removed, plus the tag and
# distdir targets.
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
distdir distdir-am
# Files scanned by the tag-generating targets.
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
# (Implemented as an awk program that stores each input line as an array key
# and prints the keys at the end.)
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
# The fragment sets the shell variable `unique`; a name that is not a file in
# the current directory is emitted with the $(srcdir)/ prefix.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
# Default tag programs.
ETAGS = etags
CTAGS = ctags
# Distribution bookkeeping: subdirectories and files that go into a
# distribution of this directory.
DIST_SUBDIRS = $(SUBDIRS)
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
# am__relativize: shell fragment that reads the shell variables dir1 and dir2
# and leaves its result in reldir.  It walks dir1 one path component at a
# time, tracking the directory reached so far in dir0 (starting at `pwd`):
#   - a "." component is ignored;
#   - a ".." component prepends the last component of dir0 to dir2 and then
#     drops that component from dir0;
#   - any other component is stripped from the front of dir2 if dir2 starts
#     with it, otherwise "../" is prepended to dir2; dir0 then descends into
#     the component.
# NOTE(review): the net effect appears to be dir2 re-expressed relative to
# dir1 -- confirm against the Automake sources.
am__relativize = \
dir0=`pwd`; \
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
sed_rest='s,^[^/]*/*,,'; \
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
sed_butlast='s,/*[^/]*$$,,'; \
while test -n "$$dir1"; do \
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
if test "$$first" != "."; then \
if test "$$first" = ".."; then \
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
else \
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
if test "$$first2" = "$$first"; then \
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
else \
dir2="../$$dir2"; \
fi; \
dir0="$$dir0"/"$$first"; \
fi; \
fi; \
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
done; \
reldir="$$dir2"
# Configure substitutions: every assignment below sets a make variable to the
# like-named placeholder, which is filled in when this template is turned
# into a Makefile.  The list covers tool names, compiler and linker flags
# (including the per-instruction-set *_CFLAGS variables), package identity,
# and the installation directory layout.
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
# Settings carried over from this directory's Makefile.am: the include path,
# the subdirectory to recurse into, and the non-installed libtool library
# with its sources and headers.
AM_CPPFLAGS = -I $(top_srcdir)
SUBDIRS = codelets
noinst_LTLIBRARIES = libdft_scalar.la
libdft_scalar_la_SOURCES = n.c t.c f.h n.h q.h t.h
# Default goal: forwarded to the recursive rule, which also builds SUBDIRS.
all: all-recursive
# Clear the suffix list, then declare only the suffixes used by the compile
# rules of this file.
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
# Regenerate Makefile.in when Makefile.am or a configure input is newer.  If
# a changed prerequisite is one of the configure dependencies, the top-level
# am--refresh target is run (and the recipe ends there); otherwise automake
# is rerun for this directory only.
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/scalar/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu dft/scalar/Makefile
# Rebuild this Makefile: through am--refresh when config.status itself
# changed, otherwise by rerunning config.status for this file only.
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
esac;
# The remaining configure products are all refreshed from the top build
# directory.
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
# Rule with neither prerequisites nor recipe for the m4 inputs.
$(am__aclocal_m4_deps):
# Remove the non-installed libtool libraries, then any so_locations file in
# the directory of each library ("." for a library name without a slash).
clean-noinstLTLIBRARIES:
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
@list='$(noinst_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
# Link the library from its objects with the libtool LINK command.
libdft_scalar.la: $(libdft_scalar_la_OBJECTS) $(libdft_scalar_la_DEPENDENCIES) $(EXTRA_libdft_scalar_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) $(libdft_scalar_la_OBJECTS) $(libdft_scalar_la_LIBADD) $(LIBS)
# Remove object files.
mostlyclean-compile:
-rm -f *.$(OBJEXT)
# Remove *.tab.c files.
distclean-compile:
-rm -f *.tab.c
# Include lines for the per-object dependency files.  The leading tokens are
# configure placeholders (presumably they enable or disable the line
# depending on whether dependency tracking is configured).
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t.Plo@am__quote@ # am--include-marker
# Create a missing dependency file as a one-line dummy, written to a
# temporary name first and then moved into place.
$(am__depfiles_remade):
@$(MKDIR_P) $(@D)
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
am--depfiles: $(am__depfiles_remade)
# Suffix rules compiling C sources to .o, .obj and libtool .lo objects.  Each
# rule carries three recipe variants selected by the configure placeholders
# at the start of the lines:
#   - fast dependency tracking: the compiler writes a .Tpo file through
#     -MT/-MD/-MP/-MF, which is then renamed to .Po (.Plo for .lo);
#   - dependency tracking through the depcomp script;
#   - a plain compile.
# The .obj rule passes the source name through $(CYGPATH_W); the .lo rule
# uses $(LTCOMPILE) where the others use $(COMPILE).
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
# Remove libtool object files.
mostlyclean-libtool:
-rm -f *.lo
# Remove the .libs and _libs directories.
clean-libtool:
-rm -rf .libs _libs
# This directory's subdirectories are mostly independent; you can cd
# into them and run 'make' without going through this Makefile.
# To change the values of 'make' variables: instead of editing Makefiles,
# (1) if the variable is set in 'config.status', edit 'config.status'
# (which will cause the Makefiles to be regenerated when you run 'make');
# (2) otherwise, pass the desired values on the 'make' command line.
#
# The recipe strips "-recursive" from the target name and runs the resulting
# target in each subdirectory: DIST_SUBDIRS for the distclean and
# maintainer-clean targets, SUBDIRS otherwise.  A "." entry runs the local
# "<target>-am" instead; if no "." entry was seen, "<target>-am" is run
# after the loop.  A failing subdirectory aborts at once, unless make is
# running with the k option, in which case the failure is recorded and
# reported through the final exit status.
$(am__recursive_targets):
@fail=; \
if $(am__make_keepgoing); then \
failcom='fail=yes'; \
else \
failcom='exit 1'; \
fi; \
dot_seen=no; \
target=`echo $@ | sed s/-recursive//`; \
case "$@" in \
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
*) list='$(SUBDIRS)' ;; \
esac; \
for subdir in $$list; do \
echo "Making $$target in $$subdir"; \
if test "$$subdir" = "."; then \
dot_seen=yes; \
local_target="$$target-am"; \
else \
local_target="$$target"; \
fi; \
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|| eval $$failcom; \
done; \
if test "$$dot_seen" = "no"; then \
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
fi; test -z "$$fail"
# Build an ID database with mkid from the de-duplicated tagged files.
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-recursive
TAGS: tags
# Run etags over the de-duplicated tagged files.  Any TAGS file already
# present in a subdirectory is pulled in through an include option, spelled
# --etags-include or --include depending on which one the etags program
# accepts.  Nothing is run when there are no arguments and no files.
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
include_option=--etags-include; \
empty_fix=.; \
else \
include_option=--include; \
empty_fix=; \
fi; \
list='$(SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
test ! -f $$subdir/TAGS || \
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
fi; \
done; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-recursive
CTAGS: ctags
# Run ctags over the de-duplicated tagged files, unless there is nothing to
# pass to it.
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
# Run gtags from the top source directory, passing it the absolute path of
# the top build directory.
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-recursive
# Append the tagged files of this directory to cscope.files in the top build
# directory.  A file that exists in the build directory is listed under
# $(subdir); any other file is listed under the source directory, which is
# used as-is when absolute and prefixed with $(subdir) otherwise.
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
# Remove the files produced by the tag targets.
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
# distdir makes sure the built sources exist, then re-invokes make for
# distdir-am.
distdir: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) distdir-am
# Copy this directory's DISTFILES into $(distdir), then recurse into each
# entry of DIST_SUBDIRS.
#
# First recipe command: leading $(srcdir)/ and $(top_srcdir)/ prefixes are
# rewritten on the file names, needed subdirectories are created under
# $(distdir), and each entry is copied -- directories recursively with
# cp -fpR, plain files with cp -p unless already present in $(distdir).
# Entries are taken from the build directory when they exist there and from
# $(srcdir) otherwise.
#
# Second recipe command: for every subdirectory other than ".", the matching
# directory under $(distdir) is created (skipped under a dry run when it
# already exists or make is in dry-run mode), the distdir and top_distdir
# paths are recomputed with am__relativize, and `make distdir` is run there
# with those values.
distdir-am: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
$(am__make_dryrun) \
|| test -d "$(distdir)/$$subdir" \
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|| exit 1; \
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
$(am__relativize); \
new_distdir=$$reldir; \
dir1=$$subdir; dir2="$(top_distdir)"; \
$(am__relativize); \
new_top_distdir=$$reldir; \
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
($(am__cd) $$subdir && \
$(MAKE) $(AM_MAKEFLAGS) \
top_distdir="$$new_top_distdir" \
distdir="$$new_distdir" \
am__remove_distdir=: \
am__skip_length_check=: \
am__skip_mode_fix=: \
distdir) \
|| exit 1; \
fi; \
done
# Top-level check/install entry points.  The targets without an -am suffix
# forward to the recursive rule; the -am targets do the local work.
check-am: all-am
check: check-recursive
# Local build: this Makefile and the libtool libraries.
all-am: Makefile $(LTLIBRARIES)
installdirs: installdirs-recursive
installdirs-am:
install: install-recursive
install-exec: install-exec-recursive
install-data: install-data-recursive
uninstall: uninstall-recursive
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-recursive
# Re-invoke `make install` with INSTALL_PROGRAM and install_sh_PROGRAM
# replaced by INSTALL_STRIP_PROGRAM and INSTALL_STRIP_FLAG=-s.  When STRIP is
# non-empty it is additionally handed over as STRIPPROG through
# INSTALL_PROGRAM_ENV.
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
# Generic cleaning hooks; the first two have nothing to do here.
mostlyclean-generic:
clean-generic:
# Remove configure-generated files, if any are listed.  The VPATH variant is
# skipped when building in the source directory.
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
clean: clean-recursive
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
mostlyclean-am
# After the recursive distclean, also remove the dependency files and the
# generated Makefile itself.
distclean: distclean-recursive
-rm -f ./$(DEPDIR)/n.Plo
-rm -f ./$(DEPDIR)/t.Plo
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
# Documentation and installation hooks.  The targets without an -am suffix
# forward to the recursive rule; the -am targets have neither prerequisites
# nor a recipe in this directory.
dvi: dvi-recursive
dvi-am:
html: html-recursive
html-am:
info: info-recursive
info-am:
install-data-am:
install-dvi: install-dvi-recursive
install-dvi-am:
install-exec-am:
install-html: install-html-recursive
install-html-am:
install-info: install-info-recursive
install-info-am:
install-man:
install-pdf: install-pdf-recursive
install-pdf-am:
install-ps: install-ps-recursive
install-ps-am:
installcheck-am:
# After the recursive maintainer-clean, also remove the dependency files and
# the generated Makefile itself.
maintainer-clean: maintainer-clean-recursive
-rm -f ./$(DEPDIR)/n.Plo
-rm -f ./$(DEPDIR)/t.Plo
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-recursive
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-recursive
pdf-am:
ps: ps-recursive
ps-am:
uninstall-am:
# Special targets: the rules that re-invoke make, the names that are
# commands rather than files, and the Makefile, which is listed as precious.
.MAKE: $(am__recursive_targets) install-am install-strip
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
am--depfiles check check-am clean clean-generic clean-libtool \
clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
distclean-compile distclean-generic distclean-libtool \
distclean-tags distdir dvi dvi-am html html-am info info-am \
install install-am install-data install-data-am install-dvi \
install-dvi-am install-exec install-exec-am install-html \
install-html-am install-info install-info-am install-man \
install-pdf install-pdf-am install-ps install-ps-am \
install-strip installcheck installcheck-am installdirs \
installdirs-am maintainer-clean maintainer-clean-generic \
mostlyclean mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
uninstall-am
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:
@@ -0,0 +1,96 @@
# This Makefile.am specifies a set of codelets, efficient transforms
# of small sizes, that are used as building blocks (kernels) by FFTW
# to build up large transforms, as well as the options for generating
# and compiling them.
# You can customize FFTW for special needs, e.g. to handle certain
# sizes more efficiently, by adding new codelets to the lists of those
# included by default. If you change the list of codelets, any new
# ones you added will be automatically generated when you run the
# bootstrap script (see "Generating your own code" in the FFTW
# manual).
###########################################################################
# Headers are looked up relative to the top of the source tree.
AM_CPPFLAGS = -I $(top_srcdir)
# The codelets are collected into one libtool library, listed under the
# noinst_ prefix (built but not installed).
noinst_LTLIBRARIES = libdft_scalar_codelets.la
###########################################################################
# n1_<n>: hard-coded FFT of size <n>, i.e. a base case of the FFT recursion.
# The sizes named after the trailing '#' are commented out of the list.
N1 = n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c \
     n1_9.c n1_10.c n1_11.c n1_12.c n1_13.c n1_14.c \
     n1_15.c n1_16.c n1_32.c n1_64.c \
     n1_20.c n1_25.c # n1_30.c n1_40.c n1_50.c
###########################################################################
# t1_<r>: "twiddle" FFT of size <r>; each one implements a radix-r DIT step.
# The sizes named after the trailing '#' are commented out of the list.
T1 = t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c \
     t1_7.c t1_8.c t1_9.c t1_10.c t1_12.c \
     t1_15.c t1_16.c t1_32.c t1_64.c \
     t1_20.c t1_25.c # t1_30.c t1_40.c t1_50.c
# t2_<r>: twiddle FFT as well, but rather than reading every trig. value from
# a complete lookup table it generates part of them on the fly, which is
# faster for large sizes.
T2 = t2_4.c t2_8.c t2_16.c \
     t2_32.c t2_64.c \
     t2_5.c t2_10.c \
     t2_20.c t2_25.c
###########################################################################
# The F (DIF) codelets are used for a kind of in-place transform algorithm,
# but the planner seems to never (or hardly ever) use them on the machines
# we have access to, preferring the Q codelets and the use of buffers
# for sub-transforms. So, we comment them out, at least for now.
# As a result F1 and F2 are both empty: everything after the '#' on their
# assignment lines is the disabled list, kept for reference.
# f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step
F1 = # f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
# like f1, but partially generates its trig. table on the fly
F2 = # f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
###########################################################################
# q1_<r>: <r> twiddle FFTs of size <r> (DIF step) with transposed output,
# used for in-place transposes in sizes divisible by <r>^2.  The size of such
# a codelet grows like <r>^2, so <r> should probably stay at 8 or below.
Q1 = q1_2.c q1_4.c q1_8.c \
     q1_3.c q1_5.c q1_6.c
###########################################################################
# Every codelet source from the family lists above, in family order.
ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
# The codelets plus $(CODLIST) are declared as built sources and are also the
# sources of the library.  CODLIST is not defined in this file; presumably it
# comes from the Makefile.codelets fragment included below -- confirm.
BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
# NOTE(review): these two names are not used in this file; presumably they
# are consumed by the included fragment -- confirm.
SOLVTAB_NAME = X(solvtab_dft_standard)
XRENAME=X
# special rules for regenerating codelets.
include $(top_srcdir)/support/Makefile.codelets
# Everything up to the endif is active only when the MAINTAINER_MODE
# conditional is true: generator flags per codelet family, then one pattern
# rule per family.
if MAINTAINER_MODE
# The T2, F2 and Q2 families add -twiddle-log3 -precompute-twiddles to the
# common flags, Q1 adds -reload-twiddle, and the others use the common flags
# unchanged.
FLAGS_N1=$(DFT_FLAGS_COMMON)
FLAGS_T1=$(DFT_FLAGS_COMMON)
FLAGS_T2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_F1=$(DFT_FLAGS_COMMON)
FLAGS_F2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_Q1=$(DFT_FLAGS_COMMON) -reload-twiddle
FLAGS_Q2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
# Each rule below runs a generator and pipes its output through $(ADD_DATE)
# and $(INDENT) into the target.  The pattern stem ($*) supplies the -n
# argument and the numeric part of the -name argument.  The n1 rule uses
# $(GEN_NOTW), the t and f rules $(GEN_TWIDDLE), the q rules $(GEN_TWIDSQ);
# the f and q rules also pass -dif.  The header named by -include follows the
# family letter (n.h, t.h, f.h, q.h).
# NOTE(review): the generators and helper commands are not defined in this
# file; presumably they come from the included Makefile.codelets -- confirm.
n1_%.c: $(CODELET_DEPS) $(GEN_NOTW)
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "dft/scalar/n.h") | $(ADD_DATE) | $(INDENT) >$@
t1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
t2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
f1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
f2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
q1_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
q2_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
endif # MAINTAINER_MODE
+994
View File
@@ -0,0 +1,994 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
# This Makefile.am specifies a set of codelets, efficient transforms
# of small sizes, that are used as building blocks (kernels) by FFTW
# to build up large transforms, as well as the options for generating
# and compiling them.
# You can customize FFTW for special needs, e.g. to handle certain
# sizes more efficiently, by adding new codelets to the lists of those
# included by default. If you change the list of codelets, any new
# ones you added will be automatically generated when you run the
# bootstrap script (see "Generating your own code" in the FFTW
# manual).
# -*- makefile -*-
# This file contains special make rules to generate codelets.
# Most of this file requires GNU make.
VPATH = @srcdir@
# Shell command group used as the "is this GNU make?" probe.  It fails when
# MAKELEVEL expands to nothing; otherwise it succeeds if MAKE_HOST is set, or
# if both MAKE_VERSION and CURDIR are set, and fails in every other case.
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
# Shell fragment whose exit status tells whether make was started with the
# single-letter option held in the shell variable target_option.
#  - target_option must be exactly one character; anything else prints an
#    internal-error message and exits 1.
#  - Under GNU make (per am__is_gnu_make) the flags are read from MFLAGS;
#    otherwise they are read from MAKEFLAGS after a sed pass that removes
#    backslash-escaped whitespace.
#  - Each word is examined in turn: variable assignments (*=*) and long
#    options (--*) are ignored; for -I, -O and -l the attached argument is
#    stripped, or the following word is skipped when the argument is separate;
#    the word after -d, -E, -D, -m, -J or -T is skipped as well.
#  - The first remaining word containing target_option sets has_opt=yes.
# am__make_dryrun and am__make_keepgoing run the fragment in a subshell with
# target_option set to n and k respectively.
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
# Package-specific directories: each standard directory with the package
# name appended.
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
# "cd" prefixed by a CDPATH assignment, so the caller's CDPATH is not used.
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
# Install helpers built on $(install_sh); data files are installed mode 644.
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
# Install/uninstall phase markers; every one expands to the shell no-op ":".
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
# Directory of this Makefile; it is passed to config.status as $(subdir)/Makefile
# by the "Makefile" rule further down.
subdir = dft/scalar/codelets
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
# m4 macro files plus configure.ac; these are the prerequisites of the
# $(ACLOCAL_M4) rule below.
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
# Prerequisites of the $(top_srcdir)/configure and Makefile.in rules.
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
# Both lists are empty here, so distclean-generic removes nothing for them.
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
# The only libtool library of this directory is the non-installed one.
LTLIBRARIES = $(noinst_LTLIBRARIES)
libdft_scalar_codelets_la_LIBADD =
# Libtool objects of libdft_scalar_codelets.la, one .lo per codelet source.
# am__objects_1/2/3/5 list the n1_*, t1_*, t2_* and q1_* objects.
# am__objects_4 is empty and is referenced twice in am__objects_6
# (presumably once each for the empty F1 and F2 lists -- confirm against
# ALL_CODELETS).  am__objects_7 adds the codlist object.
am__objects_1 = n1_2.lo n1_3.lo n1_4.lo n1_5.lo n1_6.lo n1_7.lo \
n1_8.lo n1_9.lo n1_10.lo n1_11.lo n1_12.lo n1_13.lo n1_14.lo \
n1_15.lo n1_16.lo n1_32.lo n1_64.lo n1_20.lo n1_25.lo
am__objects_2 = t1_2.lo t1_3.lo t1_4.lo t1_5.lo t1_6.lo t1_7.lo \
t1_8.lo t1_9.lo t1_10.lo t1_12.lo t1_15.lo t1_16.lo t1_32.lo \
t1_64.lo t1_20.lo t1_25.lo
am__objects_3 = t2_4.lo t2_8.lo t2_16.lo t2_32.lo t2_64.lo t2_5.lo \
t2_10.lo t2_20.lo t2_25.lo
am__objects_4 =
am__objects_5 = q1_2.lo q1_4.lo q1_8.lo q1_3.lo q1_5.lo q1_6.lo
am__objects_6 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
$(am__objects_4) $(am__objects_4) $(am__objects_5)
am__objects_7 = codlist.lo
am__objects_8 = $(am__objects_6) $(am__objects_7)
am_libdft_scalar_codelets_la_OBJECTS = $(am__objects_8)
libdft_scalar_codelets_la_OBJECTS = \
$(am_libdft_scalar_codelets_la_OBJECTS)
# Verbosity selectors.  Each AM_V_<x> picks one of am__v_<x>_0 / am__v_<x>_1
# through a configure-substituted suffix, with am__v_<x>_ as the fallback that
# uses the default-verbosity substitution.
#   lt  : level 0 passes --silent to libtool, level 1 passes nothing
#   P   : level 0 is "false", level 1 is ":"
#   GEN : level 0 echoes a short GEN line naming the target
#   at  : level 0 is the recipe-silencing "@" prefix
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
# Include path used by every compile command, and the depcomp wrapper script.
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
# Extra argument handed to config.status by the "Makefile" rule.
am__maybe_remake_depfiles = depfiles
# One dependency file per object; the $(am__depfiles_remade) rule creates a
# placeholder for each of them.
am__depfiles_remade = ./$(DEPDIR)/codlist.Plo ./$(DEPDIR)/n1_10.Plo \
./$(DEPDIR)/n1_11.Plo ./$(DEPDIR)/n1_12.Plo \
./$(DEPDIR)/n1_13.Plo ./$(DEPDIR)/n1_14.Plo \
./$(DEPDIR)/n1_15.Plo ./$(DEPDIR)/n1_16.Plo \
./$(DEPDIR)/n1_2.Plo ./$(DEPDIR)/n1_20.Plo \
./$(DEPDIR)/n1_25.Plo ./$(DEPDIR)/n1_3.Plo \
./$(DEPDIR)/n1_32.Plo ./$(DEPDIR)/n1_4.Plo \
./$(DEPDIR)/n1_5.Plo ./$(DEPDIR)/n1_6.Plo \
./$(DEPDIR)/n1_64.Plo ./$(DEPDIR)/n1_7.Plo \
./$(DEPDIR)/n1_8.Plo ./$(DEPDIR)/n1_9.Plo ./$(DEPDIR)/q1_2.Plo \
./$(DEPDIR)/q1_3.Plo ./$(DEPDIR)/q1_4.Plo ./$(DEPDIR)/q1_5.Plo \
./$(DEPDIR)/q1_6.Plo ./$(DEPDIR)/q1_8.Plo \
./$(DEPDIR)/t1_10.Plo ./$(DEPDIR)/t1_12.Plo \
./$(DEPDIR)/t1_15.Plo ./$(DEPDIR)/t1_16.Plo \
./$(DEPDIR)/t1_2.Plo ./$(DEPDIR)/t1_20.Plo \
./$(DEPDIR)/t1_25.Plo ./$(DEPDIR)/t1_3.Plo \
./$(DEPDIR)/t1_32.Plo ./$(DEPDIR)/t1_4.Plo \
./$(DEPDIR)/t1_5.Plo ./$(DEPDIR)/t1_6.Plo \
./$(DEPDIR)/t1_64.Plo ./$(DEPDIR)/t1_7.Plo \
./$(DEPDIR)/t1_8.Plo ./$(DEPDIR)/t1_9.Plo \
./$(DEPDIR)/t2_10.Plo ./$(DEPDIR)/t2_16.Plo \
./$(DEPDIR)/t2_20.Plo ./$(DEPDIR)/t2_25.Plo \
./$(DEPDIR)/t2_32.Plo ./$(DEPDIR)/t2_4.Plo \
./$(DEPDIR)/t2_5.Plo ./$(DEPDIR)/t2_64.Plo \
./$(DEPDIR)/t2_8.Plo
am__mv = mv -f
# Command lines used by the .c.o/.c.obj (COMPILE), .c.lo (LTCOMPILE) and
# library link (LINK) rules.  LTCOMPILE is the same compiler invocation as
# COMPILE, run through libtool in compile mode; LINK runs libtool in link mode
# and names the rule target as the output.
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
# Level 0 echoes a short CC line naming the target; level 1 echoes nothing extra.
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
# Same scheme as AM_V_CC, for the link step.
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
# Both lists are just the library's source list.
SOURCES = $(libdft_scalar_codelets_la_SOURCES)
DIST_SOURCES = $(libdft_scalar_codelets_la_SOURCES)
# Shell test: false when AM_UPDATE_INFO_DIR is n/no/NO, otherwise true only if
# "install-info --version" runs successfully.
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
# Files fed to the ID/tags/ctags/cscopelist rules.
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
# The fragment sets the shell variable "unique": each tagged file is taken
# as-is when it exists in the current directory, otherwise it is prefixed
# with $(srcdir)/, and the result is de-duplicated.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
# Files copied into the distribution by distdir-am in addition to the sources.
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp \
$(top_srcdir)/support/Makefile.codelets
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
###########################################################################
# Add the top of the source tree to the include path, and declare the single
# non-installed libtool library built in this directory.
AM_CPPFLAGS = -I $(top_srcdir)
noinst_LTLIBRARIES = libdft_scalar_codelets.la
###########################################################################
# N1: the n1_<n> codelets.  Each is a hard-coded FFT of size <n>; these are
# the base cases of the FFT recursion.
N1 = \
 n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c n1_9.c \
 n1_10.c n1_11.c n1_12.c n1_13.c n1_14.c n1_15.c n1_16.c \
 n1_32.c n1_64.c \
 n1_20.c n1_25.c # n1_30.c n1_40.c n1_50.c
###########################################################################
# T1: the t1_<r> codelets.  Each is a "twiddle" FFT of size <r> that
# implements one radix-r DIT step.
T1 = \
 t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c t1_7.c t1_8.c t1_9.c \
 t1_10.c t1_12.c t1_15.c t1_16.c \
 t1_32.c t1_64.c \
 t1_20.c t1_25.c # t1_30.c t1_40.c t1_50.c
# T2: the t2_<r> codelets.  These are twiddle FFTs as well, but rather than
# reading a complete lookup table of trig. functions they partially generate
# the trig. values on the fly (this is faster for large sizes).
T2 = \
 t2_4.c t2_8.c t2_16.c t2_32.c t2_64.c \
 t2_5.c t2_10.c t2_20.c t2_25.c
###########################################################################
# The F (DIF) codelets serve a kind of in-place transform algorithm.  The
# planner seems to never (or hardly ever) use them on the machines we have
# access to; it prefers the Q codelets and the use of buffers for
# sub-transforms.  They are therefore disabled, at least for now: both lists
# are empty and the would-be members are kept in the comments below.
# F1: f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step.
F1 =
# disabled F1 members: f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
# F2: like f1, but partially generates its trig. table on the fly.
F2 =
# disabled F2 members: f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
###########################################################################
# Q1: q1_<r> is <r> twiddle FFTs of size <r> (DIF step) with transposed
# output.  It is used for in-place transposes in sizes divisible by <r>^2.
# The size of these codelets grows like <r>^2, so <r> should probably not
# exceed 8 or so.
Q1 = \
 q1_2.c q1_4.c q1_8.c \
 q1_3.c q1_5.c q1_6.c
###########################################################################
# Every codelet source, in the order used by the $(CODLIST) rule when it
# writes the extern declarations and the solver table.
ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
# The codelets and the codelet list are prerequisites of all/check/install/
# distdir below, and they are also the library's complete source list.
BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
# Names substituted into the text that the $(CODLIST) rule emits: the solver
# table identifier, the renaming macro wrapped around each codelet symbol,
# the output file, and the prefix put in front of each codelet's base name.
SOLVTAB_NAME = X(solvtab_dft_standard)
XRENAME = X
CODLIST = codlist.c
CODELET_NAME = codelet_
# Disabled hard-coded indent command; INDENT is set elsewhere in this file
# from a configure substitution.
#INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
# Maintainer-mode tool locations.  TWOVERS is a wrapper script that the
# codelet rules put in front of the generator command; the GEN_* variables
# name the generator executables under the genfft build directory.  Only
# GEN_NOTW, GEN_TWIDDLE and GEN_TWIDSQ are referenced by the codelet rules
# visible in this part of the file.
@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
# Prelude files that are concatenated in front of every generated codelet.
@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
# Pipeline filter that replaces the date placeholder with the output of `date`.
@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
# Prerequisites shared by all codelet rules.  This used to reference
# $(PRELUDE), which is not defined anywhere in this file and therefore
# expanded to nothing, so an edited prelude never triggered regeneration.
# Every codelet rule in this directory cats $(PRELUDE_DFT) (through
# PRELUDE_COMMANDS_DFT), so that is the file to depend on.
# NOTE(review): this Makefile.in is generated; the same change should be made
# where CODELET_DEPS is originally defined (presumably
# support/Makefile.codelets -- confirm), or it is lost on regeneration.
@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE_DFT)
# Commands that emit the copyright notice followed by the matching prelude.
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
# Generator options.  FLAGS_COMMON is the base set; the DFT and RDFT variants
# both add the same -pipeline-latency setting.
@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
# special rules for regenerating codelets.
# One flag set per codelet family; FLAGS_<kind> is passed to the generator by
# the <kind>_%.c rule.  T2, F2 and Q2 share the same extra options, and Q1
# adds -reload-twiddle.
@MAINTAINER_MODE_TRUE@FLAGS_N1 = $(DFT_FLAGS_COMMON)
@MAINTAINER_MODE_TRUE@FLAGS_T1 = $(DFT_FLAGS_COMMON)
@MAINTAINER_MODE_TRUE@FLAGS_T2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
@MAINTAINER_MODE_TRUE@FLAGS_F1 = $(DFT_FLAGS_COMMON)
@MAINTAINER_MODE_TRUE@FLAGS_F2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
@MAINTAINER_MODE_TRUE@FLAGS_Q1 = $(DFT_FLAGS_COMMON) -reload-twiddle
@MAINTAINER_MODE_TRUE@FLAGS_Q2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
# Default target: bring the built sources up to date first, then build
# all-am in a sub-make.
all: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) all-am
# Clear the suffix list, then register only the suffixes used by the
# .c.o / .c.obj / .c.lo rules below.
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
# Regenerate Makefile.in.  If any out-of-date prerequisite is one of the
# configure dependencies, run "am--refresh" in the top build directory
# instead (exit 0 if Makefile.in exists afterwards, exit 1 if the refresh
# fails); otherwise rerun automake for this directory from the top source
# directory.
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/scalar/codelets/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu dft/scalar/codelets/Makefile
# Regenerate this Makefile: through "am--refresh" when config.status itself is
# among the newer prerequisites, otherwise by running config.status for this
# file only.
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
esac;
# Rule with no prerequisites and no recipe for the included fragment.
$(top_srcdir)/support/Makefile.codelets $(am__empty):
# config.status, configure and aclocal.m4 are all rebuilt through
# "am--refresh" in the top build directory.
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
# Rule with no prerequisites and no recipe for the m4 inputs.
$(am__aclocal_m4_deps):
# Remove the non-installed libtool libraries, then remove a "so_locations"
# file from each directory that held one of them ("." for libraries named
# without a directory part).
clean-noinstLTLIBRARIES:
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
@list='$(noinst_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
# Link the codelet objects into the convenience library through $(LINK).
libdft_scalar_codelets.la: $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_DEPENDENCIES) $(EXTRA_libdft_scalar_codelets_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_LIBADD) $(LIBS)
# Remove object files in the current directory; errors are ignored.
mostlyclean-compile:
-rm -f *.$(OBJEXT)
# Remove *.tab.c files in the current directory; errors are ignored.
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_10.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_11.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_12.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_13.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_14.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_15.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_16.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_20.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_25.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_3.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_32.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_4.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_5.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_6.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_64.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_7.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_8.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_9.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_3.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_4.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_5.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_6.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_8.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_10.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_12.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_15.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_16.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_20.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_25.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_3.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_32.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_4.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_5.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_6.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_64.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_7.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_8.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_9.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_10.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_16.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_20.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_25.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_32.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_4.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_5.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_64.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_8.Plo@am__quote@ # am--include-marker
# Create each listed dependency file as a one-line "# dummy" placeholder.
# The directory is created first, and the file is written under a temporary
# "-t" name and then moved into place.
$(am__depfiles_remade):
@$(MKDIR_P) $(@D)
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
# Target that depends on every placeholder dependency file.
am--depfiles: $(am__depfiles_remade)
# Suffix rules that compile one C source.  Each rule has three
# configure-selected variants:
#   fast dependency mode   - compile with -MT/-MD/-MP/-MF writing a .Tpo file
#                            in $(DEPDIR), then rename it to the final
#                            dependency file (.Po for objects, .Plo for
#                            libtool objects);
#   depcomp mode           - run the compiler through $(depcomp);
#   no dependency tracking - plain compile.
# .c.obj differs from .c.o only in passing the source through $(CYGPATH_W);
# .c.lo uses $(LTCOMPILE) instead of $(COMPILE).
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
# Remove libtool objects / libtool working directories; errors are ignored.
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
# Build an ID database from the de-duplicated tagged files.
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-am
TAGS: tags
# Run $(ETAGS) over the de-duplicated tagged files.  Nothing is run when
# ETAGS_ARGS, the positional arguments and the file list are all empty.
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-am
CTAGS: ctags
# Run $(CTAGS) over the de-duplicated tagged files, unless both CTAGS_ARGS
# and the file list are empty.
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
# Run gtags from the top source directory, passing it the absolute path of
# the top build directory.
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-am
# Append one path per tagged file to cscope.files in the top build directory:
# files present in the current directory are listed under $(subdir), the
# others under the source directory.
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
# Remove the tag databases produced by the rules above; errors are ignored.
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
# Bring the built sources up to date, then populate the distribution
# directory in a sub-make.
distdir: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) distdir-am
# Copy every file in $(DISTFILES) into $(distdir).
#  - The $(srcdir)/ prefix is stripped from the names, and a $(top_srcdir)/
#    prefix is rewritten to $(top_builddir)/.
#  - Needed subdirectories of $(distdir) are created first.
#  - Each entry is taken from the current directory when it exists there,
#    otherwise from $(srcdir).
#  - Directories are copied recursively (the $(srcdir) copy first when both
#    exist), with directories lacking owner rwx permission made u+rwx.
#  - Regular files are copied only when not already present in $(distdir).
# Any failed copy aborts the recipe with exit status 1.
distdir-am: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
# check, install and install-exec first bring the built sources up to date
# and then run the matching -am target in a sub-make.
check-am: all-am
check: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) check-am
# Building this directory means an up-to-date Makefile plus the library.
all-am: Makefile $(LTLIBRARIES)
installdirs:
install: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) install-am
install-exec: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) install-exec-am
install-data: install-data-am
uninstall: uninstall-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-am
# Re-run "install" with the stripping install program substituted for
# INSTALL_PROGRAM; when STRIP is non-empty it is also passed on through
# INSTALL_PROGRAM_ENV as STRIPPROG.
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
# Generic clean hooks.  mostlyclean-generic and clean-generic have no recipe
# in this directory.
mostlyclean-generic:
clean-generic:
# Remove the configure-generated files named in the two CONFIG_CLEAN lists;
# the VPATH list is skipped when building in the source directory.
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
# Print a warning, then delete the built sources (the codelets and the
# codelet list).
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
clean: clean-am
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
mostlyclean-am
# After distclean-am has run, remove every per-object dependency file and
# finally the generated Makefile itself.  All removals ignore errors.
distclean: distclean-am
-rm -f ./$(DEPDIR)/codlist.Plo
-rm -f ./$(DEPDIR)/n1_10.Plo
-rm -f ./$(DEPDIR)/n1_11.Plo
-rm -f ./$(DEPDIR)/n1_12.Plo
-rm -f ./$(DEPDIR)/n1_13.Plo
-rm -f ./$(DEPDIR)/n1_14.Plo
-rm -f ./$(DEPDIR)/n1_15.Plo
-rm -f ./$(DEPDIR)/n1_16.Plo
-rm -f ./$(DEPDIR)/n1_2.Plo
-rm -f ./$(DEPDIR)/n1_20.Plo
-rm -f ./$(DEPDIR)/n1_25.Plo
-rm -f ./$(DEPDIR)/n1_3.Plo
-rm -f ./$(DEPDIR)/n1_32.Plo
-rm -f ./$(DEPDIR)/n1_4.Plo
-rm -f ./$(DEPDIR)/n1_5.Plo
-rm -f ./$(DEPDIR)/n1_6.Plo
-rm -f ./$(DEPDIR)/n1_64.Plo
-rm -f ./$(DEPDIR)/n1_7.Plo
-rm -f ./$(DEPDIR)/n1_8.Plo
-rm -f ./$(DEPDIR)/n1_9.Plo
-rm -f ./$(DEPDIR)/q1_2.Plo
-rm -f ./$(DEPDIR)/q1_3.Plo
-rm -f ./$(DEPDIR)/q1_4.Plo
-rm -f ./$(DEPDIR)/q1_5.Plo
-rm -f ./$(DEPDIR)/q1_6.Plo
-rm -f ./$(DEPDIR)/q1_8.Plo
-rm -f ./$(DEPDIR)/t1_10.Plo
-rm -f ./$(DEPDIR)/t1_12.Plo
-rm -f ./$(DEPDIR)/t1_15.Plo
-rm -f ./$(DEPDIR)/t1_16.Plo
-rm -f ./$(DEPDIR)/t1_2.Plo
-rm -f ./$(DEPDIR)/t1_20.Plo
-rm -f ./$(DEPDIR)/t1_25.Plo
-rm -f ./$(DEPDIR)/t1_3.Plo
-rm -f ./$(DEPDIR)/t1_32.Plo
-rm -f ./$(DEPDIR)/t1_4.Plo
-rm -f ./$(DEPDIR)/t1_5.Plo
-rm -f ./$(DEPDIR)/t1_6.Plo
-rm -f ./$(DEPDIR)/t1_64.Plo
-rm -f ./$(DEPDIR)/t1_7.Plo
-rm -f ./$(DEPDIR)/t1_8.Plo
-rm -f ./$(DEPDIR)/t1_9.Plo
-rm -f ./$(DEPDIR)/t2_10.Plo
-rm -f ./$(DEPDIR)/t2_16.Plo
-rm -f ./$(DEPDIR)/t2_20.Plo
-rm -f ./$(DEPDIR)/t2_25.Plo
-rm -f ./$(DEPDIR)/t2_32.Plo
-rm -f ./$(DEPDIR)/t2_4.Plo
-rm -f ./$(DEPDIR)/t2_5.Plo
-rm -f ./$(DEPDIR)/t2_64.Plo
-rm -f ./$(DEPDIR)/t2_8.Plo
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
# Documentation and per-format install hooks.  This directory installs
# nothing and builds no documentation, so every *-am target here has neither
# prerequisites nor a recipe.
dvi: dvi-am
dvi-am:
html: html-am
html-am:
info: info-am
info-am:
install-data-am:
install-dvi: install-dvi-am
install-dvi-am:
install-exec-am:
install-html: install-html-am
install-html-am:
install-info: install-info-am
install-info-am:
install-man:
install-pdf: install-pdf-am
install-pdf-am:
install-ps: install-ps-am
install-ps-am:
installcheck-am:
# After maintainer-clean-am has run, remove every per-object dependency file
# and the generated Makefile, exactly as distclean does.  All removals ignore
# errors.
maintainer-clean: maintainer-clean-am
-rm -f ./$(DEPDIR)/codlist.Plo
-rm -f ./$(DEPDIR)/n1_10.Plo
-rm -f ./$(DEPDIR)/n1_11.Plo
-rm -f ./$(DEPDIR)/n1_12.Plo
-rm -f ./$(DEPDIR)/n1_13.Plo
-rm -f ./$(DEPDIR)/n1_14.Plo
-rm -f ./$(DEPDIR)/n1_15.Plo
-rm -f ./$(DEPDIR)/n1_16.Plo
-rm -f ./$(DEPDIR)/n1_2.Plo
-rm -f ./$(DEPDIR)/n1_20.Plo
-rm -f ./$(DEPDIR)/n1_25.Plo
-rm -f ./$(DEPDIR)/n1_3.Plo
-rm -f ./$(DEPDIR)/n1_32.Plo
-rm -f ./$(DEPDIR)/n1_4.Plo
-rm -f ./$(DEPDIR)/n1_5.Plo
-rm -f ./$(DEPDIR)/n1_6.Plo
-rm -f ./$(DEPDIR)/n1_64.Plo
-rm -f ./$(DEPDIR)/n1_7.Plo
-rm -f ./$(DEPDIR)/n1_8.Plo
-rm -f ./$(DEPDIR)/n1_9.Plo
-rm -f ./$(DEPDIR)/q1_2.Plo
-rm -f ./$(DEPDIR)/q1_3.Plo
-rm -f ./$(DEPDIR)/q1_4.Plo
-rm -f ./$(DEPDIR)/q1_5.Plo
-rm -f ./$(DEPDIR)/q1_6.Plo
-rm -f ./$(DEPDIR)/q1_8.Plo
-rm -f ./$(DEPDIR)/t1_10.Plo
-rm -f ./$(DEPDIR)/t1_12.Plo
-rm -f ./$(DEPDIR)/t1_15.Plo
-rm -f ./$(DEPDIR)/t1_16.Plo
-rm -f ./$(DEPDIR)/t1_2.Plo
-rm -f ./$(DEPDIR)/t1_20.Plo
-rm -f ./$(DEPDIR)/t1_25.Plo
-rm -f ./$(DEPDIR)/t1_3.Plo
-rm -f ./$(DEPDIR)/t1_32.Plo
-rm -f ./$(DEPDIR)/t1_4.Plo
-rm -f ./$(DEPDIR)/t1_5.Plo
-rm -f ./$(DEPDIR)/t1_6.Plo
-rm -f ./$(DEPDIR)/t1_64.Plo
-rm -f ./$(DEPDIR)/t1_7.Plo
-rm -f ./$(DEPDIR)/t1_8.Plo
-rm -f ./$(DEPDIR)/t1_9.Plo
-rm -f ./$(DEPDIR)/t2_10.Plo
-rm -f ./$(DEPDIR)/t2_16.Plo
-rm -f ./$(DEPDIR)/t2_20.Plo
-rm -f ./$(DEPDIR)/t2_25.Plo
-rm -f ./$(DEPDIR)/t2_32.Plo
-rm -f ./$(DEPDIR)/t2_4.Plo
-rm -f ./$(DEPDIR)/t2_5.Plo
-rm -f ./$(DEPDIR)/t2_64.Plo
-rm -f ./$(DEPDIR)/t2_8.Plo
-rm -f Makefile
# maintainer-clean-local is the directory-specific hook defined near the end
# of this file.
maintainer-clean-am: distclean-am maintainer-clean-generic \
maintainer-clean-local
mostlyclean: mostlyclean-am
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-am
pdf-am:
ps: ps-am
ps-am:
uninstall-am:
# Targets whose recipes invoke $(MAKE).
.MAKE: all check install install-am install-exec install-strip
# Targets that do not name files.
.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
clean-generic clean-libtool clean-noinstLTLIBRARIES \
cscopelist-am ctags ctags-am distclean distclean-compile \
distclean-generic distclean-libtool distclean-tags distdir dvi \
dvi-am html html-am info info-am install install-am \
install-data install-data-am install-dvi install-dvi-am \
install-exec install-exec-am install-html install-html-am \
install-info install-info-am install-man install-pdf \
install-pdf-am install-ps install-ps-am install-strip \
installcheck installcheck-am installdirs maintainer-clean \
maintainer-clean-generic maintainer-clean-local mostlyclean \
mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am
# Keep the Makefile if the recipe that rebuilds it is interrupted.
.PRECIOUS: Makefile
# Directory-specific maintainer-clean hook (a prerequisite of
# maintainer-clean-am).  codlist.c is deleted only here, and not by the
# ordinary clean targets, because it is included in the distribution.
# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
maintainer-clean-local:
rm -f $(CODLIST)
# rule to build codlist
# The recipe writes a C file to the target containing, in order:
#   1. an include of "kernel/ifftw.h", then a line echoing
#      $(INCLUDE_SIMD_HEADER) (not assigned in the visible part of this file)
#      and a blank line;
#   2. one "extern void" declaration per entry of $(ALL_CODELETS), the symbol
#      being $(XRENAME)($(CODELET_NAME)<base>), where <base> is the file's
#      base name with a trailing .c or .S removed;
#   3. the declaration and definition of the $(SOLVTAB_NAME) table, with one
#      SOLVTAB(...) entry per codelet and a closing SOLVTAB_END.
# Both loops iterate over the codelets plus a trailing NIL word, which is
# skipped inside the loop body.
# The rule depends on Makefile, so editing the codelet lists regenerates it.
@MAINTAINER_MODE_TRUE@$(CODLIST): Makefile
@MAINTAINER_MODE_TRUE@ ( \
@MAINTAINER_MODE_TRUE@ echo "#include \"kernel/ifftw.h\""; \
@MAINTAINER_MODE_TRUE@ echo $(INCLUDE_SIMD_HEADER); \
@MAINTAINER_MODE_TRUE@ echo; \
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
@MAINTAINER_MODE_TRUE@ echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);"; \
@MAINTAINER_MODE_TRUE@ fi \
@MAINTAINER_MODE_TRUE@ done; \
@MAINTAINER_MODE_TRUE@ echo; \
@MAINTAINER_MODE_TRUE@ echo; \
@MAINTAINER_MODE_TRUE@ echo "extern const solvtab $(SOLVTAB_NAME);"; \
@MAINTAINER_MODE_TRUE@ echo "const solvtab $(SOLVTAB_NAME) = {"; \
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),"; \
@MAINTAINER_MODE_TRUE@ fi \
@MAINTAINER_MODE_TRUE@ done; \
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB_END"; \
@MAINTAINER_MODE_TRUE@ echo "};"; \
@MAINTAINER_MODE_TRUE@ ) >$@
# cancel the hideous builtin rules that cause an infinite loop
# Each line below is a pattern rule with no recipe; in GNU make such a rule
# cancels the built-in implicit rule with the same target and prerequisite
# patterns (here: making a file "x" from x.o, x.s, x.c or x.S).
@MAINTAINER_MODE_TRUE@%: %.o
@MAINTAINER_MODE_TRUE@%: %.s
@MAINTAINER_MODE_TRUE@%: %.c
@MAINTAINER_MODE_TRUE@%: %.S
# Pattern rules that regenerate the scalar codelet sources.
# In every rule the stem ($*) is passed to the generator as "-n $*" and is also
# used to form the codelet name given to -name.  $(PRELUDE_COMMANDS_DFT) runs
# first in the same subshell; the combined output is piped through $(ADD_DATE)
# and $(INDENT) and written to $@.
#   n1_%        -> $(GEN_NOTW),               -include "dft/scalar/n.h"
#   t1_%, t2_%  -> $(GEN_TWIDDLE),            -include "dft/scalar/t.h"
#   f1_%, f2_%  -> $(GEN_TWIDDLE) with -dif,  -include "dft/scalar/f.h"
#   q1_%, q2_%  -> $(GEN_TWIDSQ) with -dif,   -include "dft/scalar/q.h"
# Each target is rebuilt when $(CODELET_DEPS) or its generator changes.
@MAINTAINER_MODE_TRUE@n1_%.c: $(CODELET_DEPS) $(GEN_NOTW)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "dft/scalar/n.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@t1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@t2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@f1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@f2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@q1_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@q2_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:
+109
View File
@@ -0,0 +1,109 @@
#include "kernel/ifftw.h"
/*
 * Forward declarations of the codelet registration functions referenced by
 * the solver table below.  Each takes a planner pointer and is defined in the
 * codelet source file of the same name (n1_*, t1_*, t2_*, q1_*).
 */
extern void X(codelet_n1_2)(planner *);
extern void X(codelet_n1_3)(planner *);
extern void X(codelet_n1_4)(planner *);
extern void X(codelet_n1_5)(planner *);
extern void X(codelet_n1_6)(planner *);
extern void X(codelet_n1_7)(planner *);
extern void X(codelet_n1_8)(planner *);
extern void X(codelet_n1_9)(planner *);
extern void X(codelet_n1_10)(planner *);
extern void X(codelet_n1_11)(planner *);
extern void X(codelet_n1_12)(planner *);
extern void X(codelet_n1_13)(planner *);
extern void X(codelet_n1_14)(planner *);
extern void X(codelet_n1_15)(planner *);
extern void X(codelet_n1_16)(planner *);
extern void X(codelet_n1_32)(planner *);
extern void X(codelet_n1_64)(planner *);
extern void X(codelet_n1_20)(planner *);
extern void X(codelet_n1_25)(planner *);
extern void X(codelet_t1_2)(planner *);
extern void X(codelet_t1_3)(planner *);
extern void X(codelet_t1_4)(planner *);
extern void X(codelet_t1_5)(planner *);
extern void X(codelet_t1_6)(planner *);
extern void X(codelet_t1_7)(planner *);
extern void X(codelet_t1_8)(planner *);
extern void X(codelet_t1_9)(planner *);
extern void X(codelet_t1_10)(planner *);
extern void X(codelet_t1_12)(planner *);
extern void X(codelet_t1_15)(planner *);
extern void X(codelet_t1_16)(planner *);
extern void X(codelet_t1_32)(planner *);
extern void X(codelet_t1_64)(planner *);
extern void X(codelet_t1_20)(planner *);
extern void X(codelet_t1_25)(planner *);
extern void X(codelet_t2_4)(planner *);
extern void X(codelet_t2_8)(planner *);
extern void X(codelet_t2_16)(planner *);
extern void X(codelet_t2_32)(planner *);
extern void X(codelet_t2_64)(planner *);
extern void X(codelet_t2_5)(planner *);
extern void X(codelet_t2_10)(planner *);
extern void X(codelet_t2_20)(planner *);
extern void X(codelet_t2_25)(planner *);
extern void X(codelet_q1_2)(planner *);
extern void X(codelet_q1_4)(planner *);
extern void X(codelet_q1_8)(planner *);
extern void X(codelet_q1_3)(planner *);
extern void X(codelet_q1_5)(planner *);
extern void X(codelet_q1_6)(planner *);
/*
 * Solver table listing every standard scalar DFT codelet registrar declared
 * above, in the same order, terminated by SOLVTAB_END.
 * NOTE(review): entry order may matter to whatever consumes the table, so
 * entries should not be re-sorted by hand -- confirm against the solvtab
 * consumer in kernel/.
 */
extern const solvtab X(solvtab_dft_standard);
const solvtab X(solvtab_dft_standard) = {
SOLVTAB(X(codelet_n1_2)),
SOLVTAB(X(codelet_n1_3)),
SOLVTAB(X(codelet_n1_4)),
SOLVTAB(X(codelet_n1_5)),
SOLVTAB(X(codelet_n1_6)),
SOLVTAB(X(codelet_n1_7)),
SOLVTAB(X(codelet_n1_8)),
SOLVTAB(X(codelet_n1_9)),
SOLVTAB(X(codelet_n1_10)),
SOLVTAB(X(codelet_n1_11)),
SOLVTAB(X(codelet_n1_12)),
SOLVTAB(X(codelet_n1_13)),
SOLVTAB(X(codelet_n1_14)),
SOLVTAB(X(codelet_n1_15)),
SOLVTAB(X(codelet_n1_16)),
SOLVTAB(X(codelet_n1_32)),
SOLVTAB(X(codelet_n1_64)),
SOLVTAB(X(codelet_n1_20)),
SOLVTAB(X(codelet_n1_25)),
SOLVTAB(X(codelet_t1_2)),
SOLVTAB(X(codelet_t1_3)),
SOLVTAB(X(codelet_t1_4)),
SOLVTAB(X(codelet_t1_5)),
SOLVTAB(X(codelet_t1_6)),
SOLVTAB(X(codelet_t1_7)),
SOLVTAB(X(codelet_t1_8)),
SOLVTAB(X(codelet_t1_9)),
SOLVTAB(X(codelet_t1_10)),
SOLVTAB(X(codelet_t1_12)),
SOLVTAB(X(codelet_t1_15)),
SOLVTAB(X(codelet_t1_16)),
SOLVTAB(X(codelet_t1_32)),
SOLVTAB(X(codelet_t1_64)),
SOLVTAB(X(codelet_t1_20)),
SOLVTAB(X(codelet_t1_25)),
SOLVTAB(X(codelet_t2_4)),
SOLVTAB(X(codelet_t2_8)),
SOLVTAB(X(codelet_t2_16)),
SOLVTAB(X(codelet_t2_32)),
SOLVTAB(X(codelet_t2_64)),
SOLVTAB(X(codelet_t2_5)),
SOLVTAB(X(codelet_t2_10)),
SOLVTAB(X(codelet_t2_20)),
SOLVTAB(X(codelet_t2_25)),
SOLVTAB(X(codelet_q1_2)),
SOLVTAB(X(codelet_q1_4)),
SOLVTAB(X(codelet_q1_8)),
SOLVTAB(X(codelet_q1_3)),
SOLVTAB(X(codelet_q1_5)),
SOLVTAB(X(codelet_q1_6)),
SOLVTAB_END
};
+362
View File
@@ -0,0 +1,362 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
/*
* This function contains 84 FP additions, 36 FP multiplications,
* (or, 48 additions, 0 multiplications, 36 fused multiply/add),
* 41 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_10, FMA variant: size-10 codelet produced by gen_notw (see the
 * "Generated by" line above).
 *
 * Each of the v loop iterations reads ten values from ri and ten from ii
 * (index 0 and WS(is, 1..9)) and writes ten values to ro and ten to io
 * (index 0 and WS(os, 1..9)).  After every iteration ri/ii advance by ivs
 * and ro/io advance by ovs.
 *
 * NOTE(review): ri/ii and ro/io look like the real and imaginary parts of the
 * input and output -- confirm against dft/codelet-dft.h.  E, R, INT, stride,
 * DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from the included headers.
 */
static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numerically: KP951056516 ~ sin(2*pi/5), KP559016994 ~ sqrt(5)/4,
 * KP618033988 ~ (sqrt(5)-1)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT i;
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
E T3, Tj, TN, T1b, TU, TV, T1j, T1i, Tm, Tp, Tq, Ta, Th, Ti, TA;
E TH, T17, T14, T1c, T1d, T1e, TO, TP, TQ;
/* Inputs 0 and 5: difference and sum, for both ri and ii. */
{
E T1, T2, TL, TM;
T1 = ri[0];
T2 = ri[WS(is, 5)];
T3 = T1 - T2;
Tj = T1 + T2;
TL = ii[0];
TM = ii[WS(is, 5)];
TN = TL - TM;
T1b = TL + TM;
}
/* ri inputs paired as (2,7), (6,1), (8,3), (4,9): differences and sums,
 * then combined into the intermediate terms used by the output stages. */
{
E T6, Tk, Tg, To, T9, Tl, Td, Tn;
{
E T4, T5, Te, Tf;
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 7)];
T6 = T4 - T5;
Tk = T4 + T5;
Te = ri[WS(is, 6)];
Tf = ri[WS(is, 1)];
Tg = Te - Tf;
To = Te + Tf;
}
{
E T7, T8, Tb, Tc;
T7 = ri[WS(is, 8)];
T8 = ri[WS(is, 3)];
T9 = T7 - T8;
Tl = T7 + T8;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 9)];
Td = Tb - Tc;
Tn = Tb + Tc;
}
TU = T6 - T9;
TV = Td - Tg;
T1j = Tk - Tl;
T1i = Tn - To;
Tm = Tk + Tl;
Tp = Tn + To;
Tq = Tm + Tp;
Ta = T6 + T9;
Th = Td + Tg;
Ti = Ta + Th;
}
/* Same pairing and combinations for the ii inputs. */
{
E Tw, T15, TG, T13, Tz, T16, TD, T12;
{
E Tu, Tv, TE, TF;
Tu = ii[WS(is, 2)];
Tv = ii[WS(is, 7)];
Tw = Tu - Tv;
T15 = Tu + Tv;
TE = ii[WS(is, 6)];
TF = ii[WS(is, 1)];
TG = TE - TF;
T13 = TE + TF;
}
{
E Tx, Ty, TB, TC;
Tx = ii[WS(is, 8)];
Ty = ii[WS(is, 3)];
Tz = Tx - Ty;
T16 = Tx + Ty;
TB = ii[WS(is, 4)];
TC = ii[WS(is, 9)];
TD = TB - TC;
T12 = TB + TC;
}
TA = Tw - Tz;
TH = TD - TG;
T17 = T15 - T16;
T14 = T12 - T13;
T1c = T15 + T16;
T1d = T12 + T13;
T1e = T1c + T1d;
TO = Tw + Tz;
TP = TD + TG;
TQ = TO + TP;
}
/* Outputs 5 and 0 need only additions. */
ro[WS(os, 5)] = T3 + Ti;
io[WS(os, 5)] = TN + TQ;
ro[0] = Tj + Tq;
io[0] = T1b + T1e;
/* ro outputs 9, 3, 1, 7. */
{
E TI, TK, Tt, TJ, Tr, Ts;
TI = FMA(KP618033988, TH, TA);
TK = FNMS(KP618033988, TA, TH);
Tr = FNMS(KP250000000, Ti, T3);
Ts = Ta - Th;
Tt = FMA(KP559016994, Ts, Tr);
TJ = FNMS(KP559016994, Ts, Tr);
ro[WS(os, 9)] = FNMS(KP951056516, TI, Tt);
ro[WS(os, 3)] = FMA(KP951056516, TK, TJ);
ro[WS(os, 1)] = FMA(KP951056516, TI, Tt);
ro[WS(os, 7)] = FNMS(KP951056516, TK, TJ);
}
/* io outputs 1, 7, 9, 3. */
{
E TW, TY, TT, TX, TR, TS;
TW = FMA(KP618033988, TV, TU);
TY = FNMS(KP618033988, TU, TV);
TR = FNMS(KP250000000, TQ, TN);
TS = TO - TP;
TT = FMA(KP559016994, TS, TR);
TX = FNMS(KP559016994, TS, TR);
io[WS(os, 1)] = FNMS(KP951056516, TW, TT);
io[WS(os, 7)] = FMA(KP951056516, TY, TX);
io[WS(os, 9)] = FMA(KP951056516, TW, TT);
io[WS(os, 3)] = FNMS(KP951056516, TY, TX);
}
/* ro outputs 2, 6, 8, 4. */
{
E T18, T1a, T11, T19, TZ, T10;
T18 = FNMS(KP618033988, T17, T14);
T1a = FMA(KP618033988, T14, T17);
TZ = FNMS(KP250000000, Tq, Tj);
T10 = Tm - Tp;
T11 = FNMS(KP559016994, T10, TZ);
T19 = FMA(KP559016994, T10, TZ);
ro[WS(os, 2)] = FNMS(KP951056516, T18, T11);
ro[WS(os, 6)] = FMA(KP951056516, T1a, T19);
ro[WS(os, 8)] = FMA(KP951056516, T18, T11);
ro[WS(os, 4)] = FNMS(KP951056516, T1a, T19);
}
/* io outputs 2, 6, 8, 4. */
{
E T1k, T1m, T1h, T1l, T1f, T1g;
T1k = FNMS(KP618033988, T1j, T1i);
T1m = FMA(KP618033988, T1i, T1j);
T1f = FNMS(KP250000000, T1e, T1b);
T1g = T1c - T1d;
T1h = FNMS(KP559016994, T1g, T1f);
T1l = FMA(KP559016994, T1g, T1f);
io[WS(os, 2)] = FMA(KP951056516, T1k, T1h);
io[WS(os, 6)] = FNMS(KP951056516, T1m, T1l);
io[WS(os, 8)] = FNMS(KP951056516, T1k, T1h);
io[WS(os, 4)] = FMA(KP951056516, T1m, T1l);
}
}
}
}
/*
 * Descriptor for the FMA variant: size 10, name "n1_10", then an operation
 * count whose first three entries match the generator comment above
 * (48 additions, 0 multiplications, 36 fused multiply/add).
 * NOTE(review): meaning of the remaining fields is defined by kdft_desc in
 * dft/codelet-dft.h -- not visible here.
 */
static const kdft_desc desc = { 10, "n1_10", { 48, 0, 36, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_10 and its descriptor to the planner. */
void X(codelet_n1_10) (planner *p) { X(kdft_register) (p, n1_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
/*
* This function contains 84 FP additions, 24 FP multiplications,
* (or, 72 additions, 12 multiplications, 12 fused multiply/add),
* 41 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_10, non-FMA variant: size-10 codelet produced by gen_notw without -fma
 * (see the "Generated by" line above).
 *
 * Each of the v loop iterations reads ten values from ri and ten from ii
 * (index 0 and WS(is, 1..9)) and writes ten values to ro and ten to io
 * (index 0 and WS(os, 1..9)).  After every iteration ri/ii advance by ivs
 * and ro/io advance by ovs.
 *
 * NOTE(review): ri/ii and ro/io look like the real and imaginary parts of the
 * input and output -- confirm against dft/codelet-dft.h.  E, R, INT, stride,
 * DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from the included headers.
 */
static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numerically: KP559016994 ~ sqrt(5)/4, KP587785252 ~ sin(pi/5),
 * KP951056516 ~ sin(2*pi/5). */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT i;
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
E T3, Tj, TQ, T1e, TU, TV, T1c, T1b, Tm, Tp, Tq, Ta, Th, Ti, TA;
E TH, T17, T14, T1f, T1g, T1h, TL, TM, TR;
/* Inputs 0 and 5: difference and sum, for both ri and ii. */
{
E T1, T2, TO, TP;
T1 = ri[0];
T2 = ri[WS(is, 5)];
T3 = T1 - T2;
Tj = T1 + T2;
TO = ii[0];
TP = ii[WS(is, 5)];
TQ = TO - TP;
T1e = TO + TP;
}
/* ri inputs paired as (2,7), (6,1), (8,3), (4,9): differences and sums,
 * then combined into the intermediate terms used by the output stages. */
{
E T6, Tk, Tg, To, T9, Tl, Td, Tn;
{
E T4, T5, Te, Tf;
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 7)];
T6 = T4 - T5;
Tk = T4 + T5;
Te = ri[WS(is, 6)];
Tf = ri[WS(is, 1)];
Tg = Te - Tf;
To = Te + Tf;
}
{
E T7, T8, Tb, Tc;
T7 = ri[WS(is, 8)];
T8 = ri[WS(is, 3)];
T9 = T7 - T8;
Tl = T7 + T8;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 9)];
Td = Tb - Tc;
Tn = Tb + Tc;
}
TU = T6 - T9;
TV = Td - Tg;
T1c = Tk - Tl;
T1b = Tn - To;
Tm = Tk + Tl;
Tp = Tn + To;
Tq = Tm + Tp;
Ta = T6 + T9;
Th = Td + Tg;
Ti = Ta + Th;
}
/* Same pairing and combinations for the ii inputs. */
{
E Tw, T15, TG, T13, Tz, T16, TD, T12;
{
E Tu, Tv, TE, TF;
Tu = ii[WS(is, 2)];
Tv = ii[WS(is, 7)];
Tw = Tu - Tv;
T15 = Tu + Tv;
TE = ii[WS(is, 6)];
TF = ii[WS(is, 1)];
TG = TE - TF;
T13 = TE + TF;
}
{
E Tx, Ty, TB, TC;
Tx = ii[WS(is, 8)];
Ty = ii[WS(is, 3)];
Tz = Tx - Ty;
T16 = Tx + Ty;
TB = ii[WS(is, 4)];
TC = ii[WS(is, 9)];
TD = TB - TC;
T12 = TB + TC;
}
TA = Tw - Tz;
TH = TD - TG;
T17 = T15 - T16;
T14 = T12 - T13;
T1f = T15 + T16;
T1g = T12 + T13;
T1h = T1f + T1g;
TL = Tw + Tz;
TM = TD + TG;
TR = TL + TM;
}
/* Outputs 5 and 0 need only additions. */
ro[WS(os, 5)] = T3 + Ti;
io[WS(os, 5)] = TQ + TR;
ro[0] = Tj + Tq;
io[0] = T1e + T1h;
/* ro outputs 9, 3, 1, 7. */
{
E TI, TK, Tt, TJ, Tr, Ts;
TI = FMA(KP951056516, TA, KP587785252 * TH);
TK = FNMS(KP587785252, TA, KP951056516 * TH);
Tr = KP559016994 * (Ta - Th);
Ts = FNMS(KP250000000, Ti, T3);
Tt = Tr + Ts;
TJ = Ts - Tr;
ro[WS(os, 9)] = Tt - TI;
ro[WS(os, 3)] = TJ + TK;
ro[WS(os, 1)] = Tt + TI;
ro[WS(os, 7)] = TJ - TK;
}
/* io outputs 1, 7, 9, 3. */
{
E TW, TY, TT, TX, TN, TS;
TW = FMA(KP951056516, TU, KP587785252 * TV);
TY = FNMS(KP587785252, TU, KP951056516 * TV);
TN = KP559016994 * (TL - TM);
TS = FNMS(KP250000000, TR, TQ);
TT = TN + TS;
TX = TS - TN;
io[WS(os, 1)] = TT - TW;
io[WS(os, 7)] = TY + TX;
io[WS(os, 9)] = TW + TT;
io[WS(os, 3)] = TX - TY;
}
/* ro outputs 2, 6, 8, 4. */
{
E T18, T1a, T11, T19, TZ, T10;
T18 = FNMS(KP587785252, T17, KP951056516 * T14);
T1a = FMA(KP951056516, T17, KP587785252 * T14);
TZ = FNMS(KP250000000, Tq, Tj);
T10 = KP559016994 * (Tm - Tp);
T11 = TZ - T10;
T19 = T10 + TZ;
ro[WS(os, 2)] = T11 - T18;
ro[WS(os, 6)] = T19 + T1a;
ro[WS(os, 8)] = T11 + T18;
ro[WS(os, 4)] = T19 - T1a;
}
/* io outputs 2, 6, 8, 4. */
{
E T1d, T1l, T1k, T1m, T1i, T1j;
T1d = FNMS(KP587785252, T1c, KP951056516 * T1b);
T1l = FMA(KP951056516, T1c, KP587785252 * T1b);
T1i = FNMS(KP250000000, T1h, T1e);
T1j = KP559016994 * (T1f - T1g);
T1k = T1i - T1j;
T1m = T1j + T1i;
io[WS(os, 2)] = T1d + T1k;
io[WS(os, 6)] = T1m - T1l;
io[WS(os, 8)] = T1k - T1d;
io[WS(os, 4)] = T1l + T1m;
}
}
}
}
/*
 * Descriptor for the non-FMA variant: size 10, name "n1_10", then an
 * operation count whose first three entries match the generator comment above
 * (72 additions, 12 multiplications, 12 fused multiply/add).
 * NOTE(review): meaning of the remaining fields is defined by kdft_desc in
 * dft/codelet-dft.h -- not visible here.
 */
static const kdft_desc desc = { 10, "n1_10", { 72, 12, 12, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_10 and its descriptor to the planner. */
void X(codelet_n1_10) (planner *p) { X(kdft_register) (p, n1_10, &desc);
}
#endif
+426
View File
@@ -0,0 +1,426 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include dft/scalar/n.h */
/*
* This function contains 140 FP additions, 110 FP multiplications,
* (or, 30 additions, 0 multiplications, 110 fused multiply/add),
* 62 stack variables, 10 constants, and 44 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_11, FMA variant: size-11 codelet produced by gen_notw (see the
 * "Generated by" line above).
 *
 * Each of the v loop iterations reads eleven values from ri and eleven from
 * ii (index 0 and WS(is, 1..10)) and writes eleven values to ro and eleven to
 * io (index 0 and WS(os, 1..10)).  After every iteration ri/ii advance by ivs
 * and ro/io advance by ovs.
 *
 * Inputs k and 11-k are always combined as a sum and a difference; outputs
 * k and 11-k are then produced together from one shared pair of terms.
 *
 * NOTE(review): ri/ii and ro/io look like the real and imaginary parts of the
 * input and output -- confirm against dft/codelet-dft.h.  E, R, INT, stride,
 * DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from the included headers.
 */
static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
DK(KP918985947, +0.918985947228994779780736114132655398124909697);
DK(KP830830026, +0.830830026003772851058548298459246407048009821);
DK(KP876768831, +0.876768831002589333891339807079336796764054852);
DK(KP778434453, +0.778434453334651800608337670740821884709317477);
DK(KP715370323, +0.715370323453429719112414662767260662417897278);
DK(KP521108558, +0.521108558113202722944698153526659300680427422);
DK(KP634356270, +0.634356270682424498893150776899916060542806975);
DK(KP342584725, +0.342584725681637509502641509861112333758894680);
{
INT i;
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
E T1, T1f, T4, T1u, Tg, T1q, T7, T1t, Ta, T1s, Td, T1r, Ti, TP, T26;
E TG, T1X, T1O, T1w, TY, T1F, T17, To, T1i, TA, T1k, Tr, T1h, Tu, T1j;
E Tx, T1g, TC, TU, T21, TL, T1S, T1J, T1m, T13, T1A, T1c;
T1 = ri[0];
T1f = ii[0];
{
E T5, T6, Tp, Tq;
/* ri pairs (1,10), (5,6), (2,9), (3,8), (4,7): sums and differences. */
{
E T2, T3, Te, Tf;
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 10)];
T4 = T2 + T3;
T1u = T3 - T2;
Te = ri[WS(is, 5)];
Tf = ri[WS(is, 6)];
Tg = Te + Tf;
T1q = Tf - Te;
}
T5 = ri[WS(is, 2)];
T6 = ri[WS(is, 9)];
T7 = T5 + T6;
T1t = T6 - T5;
{
E T8, T9, Tb, Tc;
T8 = ri[WS(is, 3)];
T9 = ri[WS(is, 8)];
Ta = T8 + T9;
T1s = T9 - T8;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 7)];
Td = Tb + Tc;
T1r = Tc - Tb;
}
/* First two steps of the nested FMA chains built from the ri sums and
 * differences; the chains are completed in the output stages below. */
{
E Th, TO, T25, TF, T1W;
Th = FNMS(KP342584725, Ta, T7);
Ti = FNMS(KP634356270, Th, Td);
TO = FNMS(KP342584725, T4, Ta);
TP = FNMS(KP634356270, TO, Tg);
T25 = FMA(KP521108558, T1q, T1u);
T26 = FMA(KP715370323, T25, T1r);
TF = FNMS(KP342584725, Td, T4);
TG = FNMS(KP634356270, TF, T7);
T1W = FMA(KP521108558, T1s, T1q);
T1X = FNMS(KP715370323, T1W, T1t);
}
{
E T1N, T1v, TX, T1E, T16;
T1N = FNMS(KP521108558, T1t, T1r);
T1O = FMA(KP715370323, T1N, T1q);
T1v = FNMS(KP521108558, T1u, T1t);
T1w = FNMS(KP715370323, T1v, T1s);
TX = FNMS(KP342584725, T7, Tg);
TY = FNMS(KP634356270, TX, T4);
T1E = FMA(KP521108558, T1r, T1s);
T1F = FMA(KP715370323, T1E, T1u);
T16 = FNMS(KP342584725, Tg, Td);
T17 = FNMS(KP634356270, T16, Ta);
}
/* ii pairs (3,8), (5,6), (2,9), (4,7), (1,10): differences and sums. */
{
E Tm, Tn, Ty, Tz;
Tm = ii[WS(is, 3)];
Tn = ii[WS(is, 8)];
To = Tm - Tn;
T1i = Tm + Tn;
Ty = ii[WS(is, 5)];
Tz = ii[WS(is, 6)];
TA = Ty - Tz;
T1k = Ty + Tz;
}
Tp = ii[WS(is, 2)];
Tq = ii[WS(is, 9)];
Tr = Tp - Tq;
T1h = Tp + Tq;
{
E Ts, Tt, Tv, Tw;
Ts = ii[WS(is, 4)];
Tt = ii[WS(is, 7)];
Tu = Ts - Tt;
T1j = Ts + Tt;
Tv = ii[WS(is, 1)];
Tw = ii[WS(is, 10)];
Tx = Tv - Tw;
T1g = Tv + Tw;
}
/* First two steps of the nested FMA chains built from the ii sums and
 * differences. */
{
E TB, TT, T20, TK, T1R;
TB = FMA(KP521108558, TA, Tx);
TC = FMA(KP715370323, TB, Tu);
TT = FNMS(KP521108558, Tr, Tu);
TU = FMA(KP715370323, TT, TA);
T20 = FNMS(KP342584725, T1i, T1h);
T21 = FNMS(KP634356270, T20, T1j);
TK = FMA(KP521108558, To, TA);
TL = FNMS(KP715370323, TK, Tr);
T1R = FNMS(KP342584725, T1j, T1g);
T1S = FNMS(KP634356270, T1R, T1h);
}
{
E T1I, T1l, T12, T1z, T1b;
T1I = FNMS(KP342584725, T1g, T1i);
T1J = FNMS(KP634356270, T1I, T1k);
T1l = FNMS(KP342584725, T1k, T1j);
T1m = FNMS(KP634356270, T1l, T1i);
T12 = FMA(KP521108558, Tu, To);
T13 = FMA(KP715370323, T12, Tx);
T1z = FNMS(KP342584725, T1h, T1k);
T1A = FNMS(KP634356270, T1z, T1g);
T1b = FNMS(KP521108558, Tx, Tr);
T1c = FNMS(KP715370323, T1b, To);
}
}
/* Output 0 is the plain sum of all inputs. */
ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
io[0] = T1f + T1g + T1h + T1i + T1j + T1k;
/* ro outputs 10 and 1. */
{
E Tk, TE, Tj, TD, Tl;
Tj = FNMS(KP778434453, Ti, T4);
Tk = FNMS(KP876768831, Tj, Tg);
TD = FMA(KP830830026, TC, Tr);
TE = FMA(KP918985947, TD, To);
Tl = FNMS(KP959492973, Tk, T1);
ro[WS(os, 10)] = FNMS(KP989821441, TE, Tl);
ro[WS(os, 1)] = FMA(KP989821441, TE, Tl);
}
/* io outputs 1 and 10. */
{
E T23, T28, T22, T27, T24;
T22 = FNMS(KP778434453, T21, T1g);
T23 = FNMS(KP876768831, T22, T1k);
T27 = FMA(KP830830026, T26, T1t);
T28 = FMA(KP918985947, T27, T1s);
T24 = FNMS(KP959492973, T23, T1f);
io[WS(os, 1)] = FMA(KP989821441, T28, T24);
io[WS(os, 10)] = FNMS(KP989821441, T28, T24);
}
/* io outputs 2 and 9. */
{
E T1U, T1Z, T1T, T1Y, T1V;
T1T = FNMS(KP778434453, T1S, T1k);
T1U = FNMS(KP876768831, T1T, T1i);
T1Y = FMA(KP830830026, T1X, T1u);
T1Z = FNMS(KP918985947, T1Y, T1r);
T1V = FNMS(KP959492973, T1U, T1f);
io[WS(os, 2)] = FNMS(KP989821441, T1Z, T1V);
io[WS(os, 9)] = FMA(KP989821441, T1Z, T1V);
}
/* ro outputs 2 and 9. */
{
E TI, TN, TH, TM, TJ;
TH = FNMS(KP778434453, TG, Tg);
TI = FNMS(KP876768831, TH, Ta);
TM = FMA(KP830830026, TL, Tx);
TN = FNMS(KP918985947, TM, Tu);
TJ = FNMS(KP959492973, TI, T1);
ro[WS(os, 2)] = FNMS(KP989821441, TN, TJ);
ro[WS(os, 9)] = FMA(KP989821441, TN, TJ);
}
/* ro outputs 8 and 3. */
{
E TR, TW, TQ, TV, TS;
TQ = FNMS(KP778434453, TP, Td);
TR = FNMS(KP876768831, TQ, T7);
TV = FNMS(KP830830026, TU, To);
TW = FNMS(KP918985947, TV, Tx);
TS = FNMS(KP959492973, TR, T1);
ro[WS(os, 8)] = FNMS(KP989821441, TW, TS);
ro[WS(os, 3)] = FMA(KP989821441, TW, TS);
}
/* io outputs 3 and 8. */
{
E T1L, T1Q, T1K, T1P, T1M;
T1K = FNMS(KP778434453, T1J, T1j);
T1L = FNMS(KP876768831, T1K, T1h);
T1P = FNMS(KP830830026, T1O, T1s);
T1Q = FNMS(KP918985947, T1P, T1u);
T1M = FNMS(KP959492973, T1L, T1f);
io[WS(os, 3)] = FMA(KP989821441, T1Q, T1M);
io[WS(os, 8)] = FNMS(KP989821441, T1Q, T1M);
}
/* ro outputs 4 and 7. */
{
E T10, T15, TZ, T14, T11;
TZ = FNMS(KP778434453, TY, Ta);
T10 = FNMS(KP876768831, TZ, Td);
T14 = FNMS(KP830830026, T13, TA);
T15 = FMA(KP918985947, T14, Tr);
T11 = FNMS(KP959492973, T10, T1);
ro[WS(os, 4)] = FNMS(KP989821441, T15, T11);
ro[WS(os, 7)] = FMA(KP989821441, T15, T11);
}
/* io outputs 4 and 7. */
{
E T1C, T1H, T1B, T1G, T1D;
T1B = FNMS(KP778434453, T1A, T1i);
T1C = FNMS(KP876768831, T1B, T1j);
T1G = FNMS(KP830830026, T1F, T1q);
T1H = FMA(KP918985947, T1G, T1t);
T1D = FNMS(KP959492973, T1C, T1f);
io[WS(os, 4)] = FNMS(KP989821441, T1H, T1D);
io[WS(os, 7)] = FMA(KP989821441, T1H, T1D);
}
/* io outputs 5 and 6. */
{
E T1o, T1y, T1n, T1x, T1p;
T1n = FNMS(KP778434453, T1m, T1h);
T1o = FNMS(KP876768831, T1n, T1g);
T1x = FNMS(KP830830026, T1w, T1r);
T1y = FNMS(KP918985947, T1x, T1q);
T1p = FNMS(KP959492973, T1o, T1f);
io[WS(os, 5)] = FMA(KP989821441, T1y, T1p);
io[WS(os, 6)] = FNMS(KP989821441, T1y, T1p);
}
/* ro outputs 6 and 5. */
{
E T19, T1e, T18, T1d, T1a;
T18 = FNMS(KP778434453, T17, T7);
T19 = FNMS(KP876768831, T18, T4);
T1d = FNMS(KP830830026, T1c, Tu);
T1e = FNMS(KP918985947, T1d, TA);
T1a = FNMS(KP959492973, T19, T1);
ro[WS(os, 6)] = FNMS(KP989821441, T1e, T1a);
ro[WS(os, 5)] = FMA(KP989821441, T1e, T1a);
}
}
}
}
/*
 * Descriptor for the FMA variant: size 11, name "n1_11", then an operation
 * count whose first three entries match the generator comment above
 * (30 additions, 0 multiplications, 110 fused multiply/add).
 * NOTE(review): meaning of the remaining fields is defined by kdft_desc in
 * dft/codelet-dft.h -- not visible here.
 */
static const kdft_desc desc = { 11, "n1_11", { 30, 0, 110, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_11 and its descriptor to the planner. */
void X(codelet_n1_11) (planner *p) { X(kdft_register) (p, n1_11, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include dft/scalar/n.h */
/*
* This function contains 140 FP additions, 100 FP multiplications,
* (or, 60 additions, 20 multiplications, 80 fused multiply/add),
* 41 stack variables, 10 constants, and 44 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_11, non-FMA variant: size-11 codelet produced by gen_notw without -fma
 * (see the "Generated by" line above).
 *
 * Each of the v loop iterations reads eleven values from ri and eleven from
 * ii (index 0 and WS(is, 1..10)) and writes eleven values to ro and eleven to
 * io (index 0 and WS(os, 1..10)).  After every iteration ri/ii advance by ivs
 * and ro/io advance by ovs.
 *
 * Inputs k and 11-k are combined as a sum and a difference; each output pair
 * (k, 11-k) is then formed as the sum and difference of two weighted
 * combinations of those terms.
 *
 * NOTE(review): ri/ii and ro/io look like the real and imaginary parts of the
 * input and output -- confirm against dft/codelet-dft.h.  E, R, INT, stride,
 * DK, WS, FMA, FNMS, FNMA and MAKE_VOLATILE_STRIDE come from the included
 * headers.
 */
static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numerically these constants match |cos(2*pi*k/11)| and sin(2*pi*k/11)
 * for k = 1..5. */
DK(KP654860733, +0.654860733945285064056925072466293553183791199);
DK(KP142314838, +0.142314838273285140443792668616369668791051361);
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
DK(KP415415013, +0.415415013001886425529274149229623203524004910);
DK(KP841253532, +0.841253532831181168861811648919367717513292498);
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
DK(KP909631995, +0.909631995354518371411715383079028460060241051);
DK(KP281732556, +0.281732556841429697711417915346616899035777899);
DK(KP540640817, +0.540640817455597582107635954318691695431770608);
DK(KP755749574, +0.755749574354258283774035843972344420179717445);
{
INT i;
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
E T1, TM, T4, TG, Tk, TR, Tw, TN, T7, TK, Ta, TH, Tn, TQ, Td;
E TJ, Tq, TO, Tt, TP, Tg, TI;
/* Load element 0 and the pairs (1,10) and (2,9) of both input arrays. */
{
E T2, T3, Ti, Tj;
T1 = ri[0];
TM = ii[0];
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 10)];
T4 = T2 + T3;
TG = T3 - T2;
Ti = ii[WS(is, 1)];
Tj = ii[WS(is, 10)];
Tk = Ti - Tj;
TR = Ti + Tj;
{
E Tu, Tv, T5, T6;
Tu = ii[WS(is, 2)];
Tv = ii[WS(is, 9)];
Tw = Tu - Tv;
TN = Tu + Tv;
T5 = ri[WS(is, 2)];
T6 = ri[WS(is, 9)];
T7 = T5 + T6;
TK = T6 - T5;
}
}
/* Load the pairs (3,8), (4,7) and (5,6) of both input arrays. */
{
E T8, T9, To, Tp;
T8 = ri[WS(is, 3)];
T9 = ri[WS(is, 8)];
Ta = T8 + T9;
TH = T9 - T8;
{
E Tl, Tm, Tb, Tc;
Tl = ii[WS(is, 3)];
Tm = ii[WS(is, 8)];
Tn = Tl - Tm;
TQ = Tl + Tm;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 7)];
Td = Tb + Tc;
TJ = Tc - Tb;
}
To = ii[WS(is, 4)];
Tp = ii[WS(is, 7)];
Tq = To - Tp;
TO = To + Tp;
{
E Tr, Ts, Te, Tf;
Tr = ii[WS(is, 5)];
Ts = ii[WS(is, 6)];
Tt = Tr - Ts;
TP = Tr + Ts;
Te = ri[WS(is, 5)];
Tf = ri[WS(is, 6)];
Tg = Te + Tf;
TI = Tf - Te;
}
}
/* Output 0 (plain sums), then output pairs (4,7) and (2,9). */
{
E Tx, Th, TZ, T10;
ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
io[0] = TM + TR + TN + TQ + TO + TP;
Tx = FMA(KP755749574, Tk, KP540640817 * Tn) + FNMS(KP909631995, Tt, KP281732556 * Tq) - (KP989821441 * Tw);
Th = FMA(KP841253532, Ta, T1) + FNMS(KP959492973, Td, KP415415013 * Tg) + FNMA(KP142314838, T7, KP654860733 * T4);
ro[WS(os, 7)] = Th - Tx;
ro[WS(os, 4)] = Th + Tx;
TZ = FMA(KP755749574, TG, KP540640817 * TH) + FNMS(KP909631995, TI, KP281732556 * TJ) - (KP989821441 * TK);
T10 = FMA(KP841253532, TQ, TM) + FNMS(KP959492973, TO, KP415415013 * TP) + FNMA(KP142314838, TN, KP654860733 * TR);
io[WS(os, 4)] = TZ + T10;
io[WS(os, 7)] = T10 - TZ;
{
E TX, TY, Tz, Ty;
TX = FMA(KP909631995, TG, KP755749574 * TK) + FNMA(KP540640817, TI, KP989821441 * TJ) - (KP281732556 * TH);
TY = FMA(KP415415013, TR, TM) + FNMS(KP142314838, TO, KP841253532 * TP) + FNMA(KP959492973, TQ, KP654860733 * TN);
io[WS(os, 2)] = TX + TY;
io[WS(os, 9)] = TY - TX;
Tz = FMA(KP909631995, Tk, KP755749574 * Tw) + FNMA(KP540640817, Tt, KP989821441 * Tq) - (KP281732556 * Tn);
Ty = FMA(KP415415013, T4, T1) + FNMS(KP142314838, Td, KP841253532 * Tg) + FNMA(KP959492973, Ta, KP654860733 * T7);
ro[WS(os, 9)] = Ty - Tz;
ro[WS(os, 2)] = Ty + Tz;
}
}
/* Output pairs (1,10), (3,8) and (5,6). */
{
E TB, TA, TT, TU;
TB = FMA(KP540640817, Tk, KP909631995 * Tw) + FMA(KP989821441, Tn, KP755749574 * Tq) + (KP281732556 * Tt);
TA = FMA(KP841253532, T4, T1) + FNMS(KP959492973, Tg, KP415415013 * T7) + FNMA(KP654860733, Td, KP142314838 * Ta);
ro[WS(os, 10)] = TA - TB;
ro[WS(os, 1)] = TA + TB;
{
E TV, TW, TD, TC;
TV = FMA(KP540640817, TG, KP909631995 * TK) + FMA(KP989821441, TH, KP755749574 * TJ) + (KP281732556 * TI);
TW = FMA(KP841253532, TR, TM) + FNMS(KP959492973, TP, KP415415013 * TN) + FNMA(KP654860733, TO, KP142314838 * TQ);
io[WS(os, 1)] = TV + TW;
io[WS(os, 10)] = TW - TV;
TD = FMA(KP989821441, Tk, KP540640817 * Tq) + FNMS(KP909631995, Tn, KP755749574 * Tt) - (KP281732556 * Tw);
TC = FMA(KP415415013, Ta, T1) + FNMS(KP654860733, Tg, KP841253532 * Td) + FNMA(KP959492973, T7, KP142314838 * T4);
ro[WS(os, 8)] = TC - TD;
ro[WS(os, 3)] = TC + TD;
}
TT = FMA(KP989821441, TG, KP540640817 * TJ) + FNMS(KP909631995, TH, KP755749574 * TI) - (KP281732556 * TK);
TU = FMA(KP415415013, TQ, TM) + FNMS(KP654860733, TP, KP841253532 * TO) + FNMA(KP959492973, TN, KP142314838 * TR);
io[WS(os, 3)] = TT + TU;
io[WS(os, 8)] = TU - TT;
{
E TL, TS, TF, TE;
TL = FMA(KP281732556, TG, KP755749574 * TH) + FNMS(KP909631995, TJ, KP989821441 * TI) - (KP540640817 * TK);
TS = FMA(KP841253532, TN, TM) + FNMS(KP142314838, TP, KP415415013 * TO) + FNMA(KP654860733, TQ, KP959492973 * TR);
io[WS(os, 5)] = TL + TS;
io[WS(os, 6)] = TS - TL;
TF = FMA(KP281732556, Tk, KP755749574 * Tn) + FNMS(KP909631995, Tq, KP989821441 * Tt) - (KP540640817 * Tw);
TE = FMA(KP841253532, T7, T1) + FNMS(KP142314838, Tg, KP415415013 * Td) + FNMA(KP654860733, Ta, KP959492973 * T4);
ro[WS(os, 6)] = TE - TF;
ro[WS(os, 5)] = TE + TF;
}
}
}
}
}
/*
 * Descriptor for the non-FMA variant: size 11, name "n1_11", then an
 * operation count whose first three entries match the generator comment above
 * (60 additions, 20 multiplications, 80 fused multiply/add).
 * NOTE(review): meaning of the remaining fields is defined by kdft_desc in
 * dft/codelet-dft.h -- not visible here.
 */
static const kdft_desc desc = { 11, "n1_11", { 60, 20, 80, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_11 and its descriptor to the planner. */
void X(codelet_n1_11) (planner *p) { X(kdft_register) (p, n1_11, &desc);
}
#endif
+420
View File
@@ -0,0 +1,420 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
/*
* This function contains 96 FP additions, 24 FP multiplications,
* (or, 72 additions, 0 multiplications, 24 fused multiply/add),
* 43 stack variables, 2 constants, and 48 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_12, FMA variant: size-12 codelet produced by gen_notw (see the
 * "Generated by" line above).
 *
 * Each of the v loop iterations reads twelve values from ri and twelve from
 * ii (index 0 and WS(is, 1..11)) and writes twelve values to ro and twelve to
 * io (index 0 and WS(os, 1..11)).  After every iteration ri/ii advance by ivs
 * and ro/io advance by ovs.
 *
 * The inputs are processed in four groups of three whose indices are spaced
 * four apart: (0,4,8), (6,10,2), (3,7,11) and (9,1,5).
 *
 * NOTE(review): ri/ii and ro/io look like the real and imaginary parts of the
 * input and output -- confirm against dft/codelet-dft.h.  E, R, INT, stride,
 * DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from the included headers.
 */
static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numerically: KP866025403 ~ sqrt(3)/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1d, TG;
E TJ, T1u, T1c, Tl, T1i, TL, TO, T1v, T1h;
/* Group (0,4,8), ri then ii: total, first minus half the sum of the other
 * two, and the difference of the other two. */
{
E T1, T2, T3, T4;
T1 = ri[0];
T2 = ri[WS(is, 4)];
T3 = ri[WS(is, 8)];
T4 = T2 + T3;
T5 = T1 + T4;
TR = FNMS(KP500000000, T4, T1);
TA = T3 - T2;
}
{
E To, Tp, Tq, Tr;
To = ii[0];
Tp = ii[WS(is, 4)];
Tq = ii[WS(is, 8)];
Tr = Tp + Tq;
Ts = To + Tr;
TS = Tp - Tq;
Tz = FNMS(KP500000000, Tr, To);
}
/* Group (6,10,2), ri then ii. */
{
E T6, T7, T8, T9;
T6 = ri[WS(is, 6)];
T7 = ri[WS(is, 10)];
T8 = ri[WS(is, 2)];
T9 = T7 + T8;
Ta = T6 + T9;
TU = FNMS(KP500000000, T9, T6);
TD = T8 - T7;
}
{
E Tt, Tu, Tv, Tw;
Tt = ii[WS(is, 6)];
Tu = ii[WS(is, 10)];
Tv = ii[WS(is, 2)];
Tw = Tu + Tv;
Tx = Tt + Tw;
TV = Tu - Tv;
TC = FNMS(KP500000000, Tw, Tt);
}
/* Group (3,7,11), ri then ii. */
{
E Tc, Td, Te, Tf;
Tc = ri[WS(is, 3)];
Td = ri[WS(is, 7)];
Te = ri[WS(is, 11)];
Tf = Td + Te;
Tg = Tc + Tf;
T1d = Te - Td;
TG = FNMS(KP500000000, Tf, Tc);
}
{
E T1a, TH, TI, T1b;
T1a = ii[WS(is, 3)];
TH = ii[WS(is, 7)];
TI = ii[WS(is, 11)];
T1b = TH + TI;
TJ = TH - TI;
T1u = T1a + T1b;
T1c = FNMS(KP500000000, T1b, T1a);
}
/* Group (9,1,5), ri then ii. */
{
E Th, Ti, Tj, Tk;
Th = ri[WS(is, 9)];
Ti = ri[WS(is, 1)];
Tj = ri[WS(is, 5)];
Tk = Ti + Tj;
Tl = Th + Tk;
T1i = Tj - Ti;
TL = FNMS(KP500000000, Tk, Th);
}
{
E T1f, TM, TN, T1g;
T1f = ii[WS(is, 9)];
TM = ii[WS(is, 1)];
TN = ii[WS(is, 5)];
T1g = TM + TN;
TO = TM - TN;
T1v = T1f + T1g;
T1h = FNMS(KP500000000, T1g, T1f);
}
/* Outputs 0, 3, 6, 9 are formed from the four group totals only. */
{
E Tb, Tm, T1t, T1w;
Tb = T5 + Ta;
Tm = Tg + Tl;
ro[WS(os, 6)] = Tb - Tm;
ro[0] = Tb + Tm;
{
E T1x, T1y, Tn, Ty;
T1x = Ts + Tx;
T1y = T1u + T1v;
io[WS(os, 6)] = T1x - T1y;
io[0] = T1x + T1y;
Tn = Tg - Tl;
Ty = Ts - Tx;
io[WS(os, 3)] = Tn + Ty;
io[WS(os, 9)] = Ty - Tn;
}
T1t = T5 - Ta;
T1w = T1u - T1v;
ro[WS(os, 3)] = T1t - T1w;
ro[WS(os, 9)] = T1t + T1w;
/* Outputs 1, 7, 10, 4: the KP866025403 terms enter through FMA. */
{
E T11, T1l, T1k, T1m, T14, T18, T17, T19;
{
E TZ, T10, T1e, T1j;
TZ = FMA(KP866025403, TA, Tz);
T10 = FMA(KP866025403, TD, TC);
T11 = TZ - T10;
T1l = TZ + T10;
T1e = FMA(KP866025403, T1d, T1c);
T1j = FMA(KP866025403, T1i, T1h);
T1k = T1e - T1j;
T1m = T1e + T1j;
}
{
E T12, T13, T15, T16;
T12 = FMA(KP866025403, TJ, TG);
T13 = FMA(KP866025403, TO, TL);
T14 = T12 - T13;
T18 = T12 + T13;
T15 = FMA(KP866025403, TS, TR);
T16 = FMA(KP866025403, TV, TU);
T17 = T15 + T16;
T19 = T15 - T16;
}
io[WS(os, 1)] = T11 - T14;
ro[WS(os, 1)] = T19 + T1k;
io[WS(os, 7)] = T11 + T14;
ro[WS(os, 7)] = T19 - T1k;
ro[WS(os, 10)] = T17 - T18;
io[WS(os, 10)] = T1l - T1m;
ro[WS(os, 4)] = T17 + T18;
io[WS(os, 4)] = T1l + T1m;
}
/* Outputs 5, 11, 2, 8: same structure with FNMS in place of FMA. */
{
E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
{
E TB, TE, T1o, T1p;
TB = FNMS(KP866025403, TA, Tz);
TE = FNMS(KP866025403, TD, TC);
TF = TB - TE;
T1r = TB + TE;
T1o = FNMS(KP866025403, T1d, T1c);
T1p = FNMS(KP866025403, T1i, T1h);
T1q = T1o - T1p;
T1s = T1o + T1p;
}
{
E TK, TP, TT, TW;
TK = FNMS(KP866025403, TJ, TG);
TP = FNMS(KP866025403, TO, TL);
TQ = TK - TP;
TY = TK + TP;
TT = FNMS(KP866025403, TS, TR);
TW = FNMS(KP866025403, TV, TU);
TX = TT + TW;
T1n = TT - TW;
}
io[WS(os, 5)] = TF - TQ;
ro[WS(os, 5)] = T1n + T1q;
io[WS(os, 11)] = TF + TQ;
ro[WS(os, 11)] = T1n - T1q;
ro[WS(os, 2)] = TX - TY;
io[WS(os, 2)] = T1r - T1s;
ro[WS(os, 8)] = TX + TY;
io[WS(os, 8)] = T1r + T1s;
}
}
}
}
}
/*
 * Descriptor registered for this build variant of n1_12: size 12, the name
 * "n1_12", a four-entry count record and the GENUS object.
 * NOTE(review): { 72, 0, 24, 0 } presumably lists add / mul / fma / other
 * operation counts, and the trailing four zeros are presumably stride
 * constraints -- confirm against the kdft_desc declaration.
 */
static const kdft_desc desc = { 12, "n1_12", { 72, 0, 24, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_12 and its descriptor to planner p. */
void X(codelet_n1_12) (planner *p) { X(kdft_register) (p, n1_12, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
/*
* This function contains 96 FP additions, 16 FP multiplications,
* (or, 88 additions, 8 multiplications, 8 fused multiply/add),
* 43 stack variables, 2 constants, and 48 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_12, variant compiled when neither FMA-preference macro is defined.
 * Machine-generated size-12 kernel (generator option "-n 12").
 *
 *   ri, ii  input arrays, read at offsets WS(is, k) for k = 0..11
 *   ro, io  output arrays, written at offsets WS(os, k) for k = 0..11
 *   v       number of loop iterations; after each one ri/ii advance by
 *           ivs and ro/io advance by ovs
 *
 * NOTE(review): ri/ro and ii/io are presumably the real and imaginary
 * parts of the complex data -- confirm in dft/codelet-dft.h.
 */
static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numeric constants: sqrt(3)/2 and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* MAKE_VOLATILE_STRIDE is a project macro; its effect is not visible in this file. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1a, TG;
E TJ, T1u, T1d, Tl, T1f, TL, TO, T1v, T1i;
/*
 * Stage 1: inputs are consumed in four groups of three indices
 * (0/4/8, 6/10/2, 3/7/11, 9/1/5), once for ri and once for ii.  Each
 * group yields a three-way sum, a half-weighted combination and a
 * sqrt(3)/2-scaled difference.
 * NOTE(review): FNMS(a, b, c) presumably evaluates c - a*b -- confirm.
 */
{
E T1, T2, T3, T4;
T1 = ri[0];
T2 = ri[WS(is, 4)];
T3 = ri[WS(is, 8)];
T4 = T2 + T3;
T5 = T1 + T4;
TR = FNMS(KP500000000, T4, T1);
TA = KP866025403 * (T3 - T2);
}
{
E To, Tp, Tq, Tr;
To = ii[0];
Tp = ii[WS(is, 4)];
Tq = ii[WS(is, 8)];
Tr = Tp + Tq;
Ts = To + Tr;
TS = KP866025403 * (Tp - Tq);
Tz = FNMS(KP500000000, Tr, To);
}
{
E T6, T7, T8, T9;
T6 = ri[WS(is, 6)];
T7 = ri[WS(is, 10)];
T8 = ri[WS(is, 2)];
T9 = T7 + T8;
Ta = T6 + T9;
TU = FNMS(KP500000000, T9, T6);
TD = KP866025403 * (T8 - T7);
}
{
E Tt, Tu, Tv, Tw;
Tt = ii[WS(is, 6)];
Tu = ii[WS(is, 10)];
Tv = ii[WS(is, 2)];
Tw = Tu + Tv;
Tx = Tt + Tw;
TV = KP866025403 * (Tu - Tv);
TC = FNMS(KP500000000, Tw, Tt);
}
{
E Tc, Td, Te, Tf;
Tc = ri[WS(is, 3)];
Td = ri[WS(is, 7)];
Te = ri[WS(is, 11)];
Tf = Td + Te;
Tg = Tc + Tf;
T1a = KP866025403 * (Te - Td);
TG = FNMS(KP500000000, Tf, Tc);
}
{
E T1b, TH, TI, T1c;
T1b = ii[WS(is, 3)];
TH = ii[WS(is, 7)];
TI = ii[WS(is, 11)];
T1c = TH + TI;
TJ = KP866025403 * (TH - TI);
T1u = T1b + T1c;
T1d = FNMS(KP500000000, T1c, T1b);
}
{
E Th, Ti, Tj, Tk;
Th = ri[WS(is, 9)];
Ti = ri[WS(is, 1)];
Tj = ri[WS(is, 5)];
Tk = Ti + Tj;
Tl = Th + Tk;
T1f = KP866025403 * (Tj - Ti);
TL = FNMS(KP500000000, Tk, Th);
}
{
E T1g, TM, TN, T1h;
T1g = ii[WS(is, 9)];
TM = ii[WS(is, 1)];
TN = ii[WS(is, 5)];
T1h = TM + TN;
TO = KP866025403 * (TM - TN);
T1v = T1g + T1h;
T1i = FNMS(KP500000000, T1h, T1g);
}
/*
 * Stage 2: combine the four group results.  Outputs 0, 3, 6 and 9 are
 * built from the plain group sums only.
 */
{
E Tb, Tm, T1t, T1w;
Tb = T5 + Ta;
Tm = Tg + Tl;
ro[WS(os, 6)] = Tb - Tm;
ro[0] = Tb + Tm;
{
E T1x, T1y, Tn, Ty;
T1x = Ts + Tx;
T1y = T1u + T1v;
io[WS(os, 6)] = T1x - T1y;
io[0] = T1x + T1y;
Tn = Tg - Tl;
Ty = Ts - Tx;
io[WS(os, 3)] = Tn + Ty;
io[WS(os, 9)] = Ty - Tn;
}
T1t = T5 - Ta;
T1w = T1u - T1v;
ro[WS(os, 3)] = T1t - T1w;
ro[WS(os, 9)] = T1t + T1w;
/* Outputs 1, 4, 7 and 10: scaled differences enter with a plus sign. */
{
E T11, T1l, T1k, T1m, T14, T18, T17, T19;
{
E TZ, T10, T1e, T1j;
TZ = TA + Tz;
T10 = TD + TC;
T11 = TZ - T10;
T1l = TZ + T10;
T1e = T1a + T1d;
T1j = T1f + T1i;
T1k = T1e - T1j;
T1m = T1e + T1j;
}
{
E T12, T13, T15, T16;
T12 = TG + TJ;
T13 = TL + TO;
T14 = T12 - T13;
T18 = T12 + T13;
T15 = TR + TS;
T16 = TU + TV;
T17 = T15 + T16;
T19 = T15 - T16;
}
io[WS(os, 1)] = T11 - T14;
ro[WS(os, 1)] = T19 + T1k;
io[WS(os, 7)] = T11 + T14;
ro[WS(os, 7)] = T19 - T1k;
ro[WS(os, 10)] = T17 - T18;
io[WS(os, 10)] = T1l - T1m;
ro[WS(os, 4)] = T17 + T18;
io[WS(os, 4)] = T1l + T1m;
}
/* Outputs 2, 5, 8 and 11: same pattern with the scaled differences subtracted. */
{
E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
{
E TB, TE, T1o, T1p;
TB = Tz - TA;
TE = TC - TD;
TF = TB - TE;
T1r = TB + TE;
T1o = T1d - T1a;
T1p = T1i - T1f;
T1q = T1o - T1p;
T1s = T1o + T1p;
}
{
E TK, TP, TT, TW;
TK = TG - TJ;
TP = TL - TO;
TQ = TK - TP;
TY = TK + TP;
TT = TR - TS;
TW = TU - TV;
TX = TT + TW;
T1n = TT - TW;
}
io[WS(os, 5)] = TF - TQ;
ro[WS(os, 5)] = T1n + T1q;
io[WS(os, 11)] = TF + TQ;
ro[WS(os, 11)] = T1n - T1q;
ro[WS(os, 2)] = TX - TY;
io[WS(os, 2)] = T1r - T1s;
ro[WS(os, 8)] = TX + TY;
io[WS(os, 8)] = T1r + T1s;
}
}
}
}
}
/*
 * Descriptor registered for this build variant of n1_12: size 12, the name
 * "n1_12", a count record and the GENUS object.  The first three count
 * entries (88, 8, 8) repeat the addition / multiplication / fused
 * multiply-add totals stated in the generator comment for this variant.
 * NOTE(review): the fourth count entry and the trailing four zeros are not
 * explained in this file -- confirm against the kdft_desc declaration.
 */
static const kdft_desc desc = { 12, "n1_12", { 88, 8, 8, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_12 and its descriptor to planner p. */
void X(codelet_n1_12) (planner *p) { X(kdft_register) (p, n1_12, &desc);
}
#endif
+681
View File
@@ -0,0 +1,681 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include dft/scalar/n.h */
/*
* This function contains 176 FP additions, 114 FP multiplications,
* (or, 62 additions, 0 multiplications, 114 fused multiply/add),
* 76 stack variables, 25 constants, and 52 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_13, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.  Machine-generated size-13 kernel
 * (generator options "-fma ... -n 13").
 *
 *   ri, ii  input arrays, read at offsets WS(is, k) for k = 0..12
 *   ro, io  output arrays, written at offsets WS(os, k) for k = 0..12
 *   v       number of loop iterations; after each one ri/ii advance by
 *           ivs and ro/io advance by ovs
 *
 * NOTE(review): ri/ro and ii/io are presumably the real and imaginary
 * parts of the complex data -- confirm in dft/codelet-dft.h.
 * NOTE(review): FMA / FMS / FNMS are project macros; presumably
 * FMA(a,b,c) = a*b + c, FMS(a,b,c) = a*b - c, FNMS(a,b,c) = c - a*b --
 * confirm before relying on the per-line readings.
 */
static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Generator-emitted numeric constants; the name encodes the leading digits of the value. */
DK(KP875502302, +0.875502302409147941146295545768755143177842006);
DK(KP520028571, +0.520028571888864619117130500499232802493238139);
DK(KP968287244, +0.968287244361984016049539446938120421179794516);
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
DK(KP600477271, +0.600477271932665282925769253334763009352012849);
DK(KP957805992, +0.957805992594665126462521754605754580515587217);
DK(KP516520780, +0.516520780623489722840901288569017135705033622);
DK(KP581704778, +0.581704778510515730456870384989698884939833902);
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
DK(KP301479260, +0.301479260047709873958013540496673347309208464);
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
DK(KP859542535, +0.859542535098774820163672132761689612766401925);
DK(KP514918778, +0.514918778086315755491789696138117261566051239);
DK(KP522026385, +0.522026385161275033714027226654165028300441940);
DK(KP853480001, +0.853480001859823990758994934970528322872359049);
DK(KP612264650, +0.612264650376756543746494474777125408779395514);
DK(KP038632954, +0.038632954644348171955506895830342264440241080);
DK(KP302775637, +0.302775637731994646559610633735247973125648287);
DK(KP769338817, +0.769338817572980603471413688209101117038278899);
DK(KP686558370, +0.686558370781754340655719594850823015421401653);
DK(KP226109445, +0.226109445035782405468510155372505010481906348);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* MAKE_VOLATILE_STRIDE is a project macro; its effect is not visible in this file. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
E T1, T1P, T2n, T2o, To, TH, T2h, T2k, TB, TE, Tw, TF, T2c, T2j, T1j;
E T1m, T12, T1f, T21, T24, T1U, T27, T1d, T1g, T1Y, T25;
/* Element 0 of each input array is handled apart from elements 1..12. */
T1 = ri[0];
T1P = ii[0];
/*
 * Load ri[1..12] and reduce them to the sums and differences that both
 * output sections below consume (To is the sum of all twelve).
 */
{
E Tf, T2d, Tb, Ty, Tq, T6, Tx, Tr, Ti, Tt, Tl, Tu, Tm, T2e, Td;
E Te, Tc, Tn;
Td = ri[WS(is, 8)];
Te = ri[WS(is, 5)];
Tf = Td + Te;
T2d = Td - Te;
{
E T7, T8, T9, Ta;
T7 = ri[WS(is, 12)];
T8 = ri[WS(is, 10)];
T9 = ri[WS(is, 4)];
Ta = T8 + T9;
Tb = T7 + Ta;
Ty = FMS(KP500000000, Ta, T7);
Tq = T8 - T9;
}
{
E T2, T3, T4, T5;
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 3)];
T4 = ri[WS(is, 9)];
T5 = T3 + T4;
T6 = T2 + T5;
Tx = FNMS(KP500000000, T5, T2);
Tr = T4 - T3;
}
{
E Tg, Th, Tj, Tk;
Tg = ri[WS(is, 11)];
Th = ri[WS(is, 6)];
Ti = Tg + Th;
Tt = Tg - Th;
Tj = ri[WS(is, 7)];
Tk = ri[WS(is, 2)];
Tl = Tj + Tk;
Tu = Tj - Tk;
}
Tm = Ti + Tl;
T2e = Tt + Tu;
T2n = T6 - Tb;
T2o = T2d + T2e;
Tc = T6 + Tb;
Tn = Tf + Tm;
To = Tc + Tn;
TH = Tc - Tn;
{
E T2f, T2g, Tz, TA;
T2f = FNMS(KP500000000, T2e, T2d);
T2g = Tr + Tq;
T2h = FMA(KP866025403, T2g, T2f);
T2k = FNMS(KP866025403, T2g, T2f);
Tz = Tx - Ty;
TA = FNMS(KP500000000, Tm, Tf);
TB = Tz + TA;
TE = Tz - TA;
}
{
E Ts, Tv, T2a, T2b;
Ts = Tq - Tr;
Tv = Tt - Tu;
Tw = Ts + Tv;
TF = Ts - Tv;
T2a = Tx + Ty;
T2b = Ti - Tl;
T2c = FMA(KP866025403, T2b, T2a);
T2j = FNMS(KP866025403, T2b, T2a);
}
}
/*
 * Same reduction for ii[1..12] (T1U is the sum of all twelve); the
 * results again feed both output sections.
 */
{
E TM, T1R, T10, T1l, T18, TX, T1k, T15, TP, T1a, TS, T1b, TT, T1S, TK;
E TL, TU, T11;
TK = ii[WS(is, 8)];
TL = ii[WS(is, 5)];
TM = TK - TL;
T1R = TK + TL;
{
E T16, TY, TZ, T17;
T16 = ii[WS(is, 12)];
TY = ii[WS(is, 10)];
TZ = ii[WS(is, 4)];
T17 = TY + TZ;
T10 = TY - TZ;
T1l = T16 + T17;
T18 = FMS(KP500000000, T17, T16);
}
{
E T13, TV, TW, T14;
T13 = ii[WS(is, 1)];
TV = ii[WS(is, 9)];
TW = ii[WS(is, 3)];
T14 = TW + TV;
TX = TV - TW;
T1k = T13 + T14;
T15 = FNMS(KP500000000, T14, T13);
}
{
E TN, TO, TQ, TR;
TN = ii[WS(is, 11)];
TO = ii[WS(is, 6)];
TP = TN - TO;
T1a = TN + TO;
TQ = ii[WS(is, 7)];
TR = ii[WS(is, 2)];
TS = TQ - TR;
T1b = TQ + TR;
}
TT = TP + TS;
T1S = T1a + T1b;
T1j = TM + TT;
T1m = T1k - T1l;
TU = FNMS(KP500000000, TT, TM);
T11 = TX + T10;
T12 = FMA(KP866025403, T11, TU);
T1f = FNMS(KP866025403, T11, TU);
{
E T1Z, T20, T1Q, T1T;
T1Z = T15 - T18;
T20 = FNMS(KP500000000, T1S, T1R);
T21 = T1Z + T20;
T24 = T1Z - T20;
T1Q = T1k + T1l;
T1T = T1R + T1S;
T1U = T1Q + T1T;
T27 = T1Q - T1T;
}
{
E T19, T1c, T1W, T1X;
T19 = T15 + T18;
T1c = T1a - T1b;
T1d = FMA(KP866025403, T1c, T19);
T1g = FNMS(KP866025403, T1c, T19);
T1W = T10 - TX;
T1X = TP - TS;
T1Y = T1W + T1X;
T25 = T1W - T1X;
}
}
/* Output 0 is the plain sum of all thirteen inputs of the matching array. */
ro[0] = T1 + To;
io[0] = T1P + T1U;
/* Outputs ro[1..12], written in pairs that differ only in the sign of one term. */
{
E T1z, T1J, T1G, T1H, T1w, T1I, T1n, T1i, T1s, T1E, TD, T1D, TI, T1r, T1e;
E T1h;
{
E T1x, T1y, T1u, T1v;
T1x = FNMS(KP226109445, Tw, TB);
T1y = FMA(KP686558370, TE, TF);
T1z = FNMS(KP769338817, T1y, T1x);
T1J = FMA(KP769338817, T1y, T1x);
T1G = FMA(KP302775637, T1j, T1m);
T1u = FNMS(KP038632954, T12, T1d);
T1v = FNMS(KP612264650, T1f, T1g);
T1H = FNMS(KP853480001, T1v, T1u);
T1w = FMA(KP853480001, T1v, T1u);
T1I = FNMS(KP522026385, T1H, T1G);
}
T1n = FNMS(KP302775637, T1m, T1j);
T1e = FMA(KP038632954, T1d, T12);
T1h = FMA(KP612264650, T1g, T1f);
T1i = FNMS(KP853480001, T1h, T1e);
T1s = FNMS(KP522026385, T1i, T1n);
T1E = FMA(KP853480001, T1h, T1e);
{
E TG, T1q, Tp, TC, T1p;
TG = FNMS(KP514918778, TF, TE);
T1q = FNMS(KP859542535, TG, TH);
Tp = FNMS(KP083333333, To, T1);
TC = FMA(KP301479260, TB, Tw);
T1p = FNMS(KP251768516, TC, Tp);
TD = FMA(KP503537032, TC, Tp);
T1D = FNMS(KP300462606, T1q, T1p);
TI = FMA(KP581704778, TH, TG);
T1r = FMA(KP300462606, T1q, T1p);
}
{
E TJ, T1o, T1L, T1M;
TJ = FMA(KP516520780, TI, TD);
T1o = FMA(KP957805992, T1n, T1i);
ro[WS(os, 1)] = FNMS(KP600477271, T1o, TJ);
ro[WS(os, 12)] = FMA(KP600477271, T1o, TJ);
{
E T1t, T1A, T1N, T1O;
T1t = FNMS(KP575140729, T1s, T1r);
T1A = FMA(KP968287244, T1z, T1w);
ro[WS(os, 9)] = FNMS(KP520028571, T1A, T1t);
ro[WS(os, 3)] = FMA(KP520028571, T1A, T1t);
T1N = FNMS(KP516520780, TI, TD);
T1O = FMA(KP957805992, T1G, T1H);
ro[WS(os, 8)] = FNMS(KP600477271, T1O, T1N);
ro[WS(os, 5)] = FMA(KP600477271, T1O, T1N);
}
T1L = FNMS(KP520028571, T1E, T1D);
T1M = FNMS(KP875502302, T1J, T1I);
ro[WS(os, 11)] = FNMS(KP575140729, T1M, T1L);
ro[WS(os, 6)] = FMA(KP575140729, T1M, T1L);
{
E T1F, T1K, T1B, T1C;
T1F = FMA(KP520028571, T1E, T1D);
T1K = FMA(KP875502302, T1J, T1I);
ro[WS(os, 7)] = FNMS(KP575140729, T1K, T1F);
ro[WS(os, 2)] = FMA(KP575140729, T1K, T1F);
T1B = FMA(KP575140729, T1s, T1r);
T1C = FNMS(KP968287244, T1z, T1w);
ro[WS(os, 10)] = FNMS(KP520028571, T1C, T1B);
ro[WS(os, 4)] = FMA(KP520028571, T1C, T1B);
}
}
}
/* Outputs io[1..12], same pairing scheme as the ro section above. */
{
E T2F, T2N, T2v, T2u, T2A, T2K, T2p, T2m, T2C, T2M, T23, T2J, T28, T2z, T2i;
E T2l;
{
E T2D, T2E, T2s, T2t;
T2D = FNMS(KP226109445, T1Y, T21);
T2E = FMA(KP686558370, T24, T25);
T2F = FNMS(KP769338817, T2E, T2D);
T2N = FMA(KP769338817, T2E, T2D);
T2v = FNMS(KP302775637, T2n, T2o);
T2s = FMA(KP038632954, T2c, T2h);
T2t = FMA(KP612264650, T2j, T2k);
T2u = FNMS(KP853480001, T2t, T2s);
T2A = FNMS(KP522026385, T2u, T2v);
T2K = FMA(KP853480001, T2t, T2s);
}
T2p = FMA(KP302775637, T2o, T2n);
T2i = FNMS(KP038632954, T2h, T2c);
T2l = FNMS(KP612264650, T2k, T2j);
T2m = FNMS(KP853480001, T2l, T2i);
T2C = FMA(KP853480001, T2l, T2i);
T2M = FNMS(KP522026385, T2m, T2p);
{
E T26, T2y, T1V, T22, T2x;
T26 = FNMS(KP514918778, T25, T24);
T2y = FNMS(KP859542535, T26, T27);
T1V = FNMS(KP083333333, T1U, T1P);
T22 = FMA(KP301479260, T21, T1Y);
T2x = FNMS(KP251768516, T22, T1V);
T23 = FMA(KP503537032, T22, T1V);
T2J = FNMS(KP300462606, T2y, T2x);
T28 = FMA(KP581704778, T27, T26);
T2z = FMA(KP300462606, T2y, T2x);
}
{
E T29, T2q, T2L, T2O;
T29 = FNMS(KP516520780, T28, T23);
T2q = FMA(KP957805992, T2p, T2m);
io[WS(os, 5)] = FNMS(KP600477271, T2q, T29);
io[WS(os, 8)] = FMA(KP600477271, T2q, T29);
{
E T2r, T2w, T2P, T2Q;
T2r = FMA(KP516520780, T28, T23);
T2w = FMA(KP957805992, T2v, T2u);
io[WS(os, 1)] = FMA(KP600477271, T2w, T2r);
io[WS(os, 12)] = FNMS(KP600477271, T2w, T2r);
T2P = FMA(KP520028571, T2K, T2J);
T2Q = FMA(KP875502302, T2N, T2M);
io[WS(os, 6)] = FNMS(KP575140729, T2Q, T2P);
io[WS(os, 11)] = FMA(KP575140729, T2Q, T2P);
}
T2L = FNMS(KP520028571, T2K, T2J);
T2O = FNMS(KP875502302, T2N, T2M);
io[WS(os, 2)] = FNMS(KP575140729, T2O, T2L);
io[WS(os, 7)] = FMA(KP575140729, T2O, T2L);
{
E T2H, T2I, T2B, T2G;
T2H = FNMS(KP575140729, T2A, T2z);
T2I = FMA(KP968287244, T2F, T2C);
io[WS(os, 4)] = FNMS(KP520028571, T2I, T2H);
io[WS(os, 10)] = FMA(KP520028571, T2I, T2H);
T2B = FMA(KP575140729, T2A, T2z);
T2G = FNMS(KP968287244, T2F, T2C);
io[WS(os, 3)] = FNMS(KP520028571, T2G, T2B);
io[WS(os, 9)] = FMA(KP520028571, T2G, T2B);
}
}
}
}
}
}
/*
 * Descriptor registered for this build variant of n1_13: size 13, the name
 * "n1_13", a count record and the GENUS object.  The first three count
 * entries (62, 0, 114) repeat the addition / multiplication / fused
 * multiply-add totals stated in the generator comment for this variant.
 * NOTE(review): the fourth count entry and the trailing four zeros are not
 * explained in this file -- confirm against the kdft_desc declaration.
 */
static const kdft_desc desc = { 13, "n1_13", { 62, 0, 114, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_13 and its descriptor to planner p. */
void X(codelet_n1_13) (planner *p) { X(kdft_register) (p, n1_13, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include dft/scalar/n.h */
/*
* This function contains 176 FP additions, 68 FP multiplications,
* (or, 138 additions, 30 multiplications, 38 fused multiply/add),
* 71 stack variables, 20 constants, and 52 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_13, variant compiled when neither FMA-preference macro is defined.
 * Machine-generated size-13 kernel (generator option "-n 13").
 *
 *   ri, ii  input arrays, read at offsets WS(is, k) for k = 0..12
 *   ro, io  output arrays, written at offsets WS(os, k) for k = 0..12
 *   v       number of loop iterations; after each one ri/ii advance by
 *           ivs and ro/io advance by ovs
 *
 * NOTE(review): ri/ro and ii/io are presumably the real and imaginary
 * parts of the complex data -- confirm in dft/codelet-dft.h.
 * NOTE(review): FMA / FMS / FNMS are project macros; presumably
 * FMA(a,b,c) = a*b + c, FMS(a,b,c) = a*b - c, FNMS(a,b,c) = c - a*b --
 * confirm before relying on the per-line readings.
 */
static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Generator-emitted numeric constants; the name encodes the leading digits of the value. */
DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
DK(KP075902986, +0.075902986037193865983102897245103540356428373);
DK(KP132983124, +0.132983124607418643793760531921092974399165133);
DK(KP258260390, +0.258260390311744861420450644284508567852516811);
DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
DK(KP300238635, +0.300238635966332641462884626667381504676006424);
DK(KP011599105, +0.011599105605768290721655456654083252189827041);
DK(KP156891391, +0.156891391051584611046832726756003269660212636);
DK(KP256247671, +0.256247671582936600958684654061725059144125175);
DK(KP174138601, +0.174138601152135905005660794929264742616964676);
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
DK(KP113854479, +0.113854479055790798974654345867655310534642560);
DK(KP265966249, +0.265966249214837287587521063842185948798330267);
DK(KP387390585, +0.387390585467617292130675966426762851778775217);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* MAKE_VOLATILE_STRIDE is a project macro; its effect is not visible in this file. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
E T1, T1q, Tt, Tu, To, T22, T20, T24, TF, TH, TA, TI, T1X, T25, T2a;
E T2d, T18, T1n, T2k, T2n, T1l, T1r, T1f, T1o, T2h, T2m;
/* Element 0 of each input array is handled apart from elements 1..12. */
T1 = ri[0];
T1q = ii[0];
/*
 * Load ri[1..12] and reduce them to the sums and differences that both
 * output sections below consume (To is the sum of all twelve).
 */
{
E Tf, Tp, Tb, TC, Tx, T6, TB, Tw, Ti, Tq, Tl, Tr, Tm, Ts, Td;
E Te, Tc, Tn;
Td = ri[WS(is, 8)];
Te = ri[WS(is, 5)];
Tf = Td + Te;
Tp = Td - Te;
{
E T7, T8, T9, Ta;
T7 = ri[WS(is, 12)];
T8 = ri[WS(is, 10)];
T9 = ri[WS(is, 4)];
Ta = T8 + T9;
Tb = T7 + Ta;
TC = T8 - T9;
Tx = FNMS(KP500000000, Ta, T7);
}
{
E T2, T3, T4, T5;
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 3)];
T4 = ri[WS(is, 9)];
T5 = T3 + T4;
T6 = T2 + T5;
TB = T3 - T4;
Tw = FNMS(KP500000000, T5, T2);
}
{
E Tg, Th, Tj, Tk;
Tg = ri[WS(is, 11)];
Th = ri[WS(is, 6)];
Ti = Tg + Th;
Tq = Tg - Th;
Tj = ri[WS(is, 7)];
Tk = ri[WS(is, 2)];
Tl = Tj + Tk;
Tr = Tj - Tk;
}
Tm = Ti + Tl;
Ts = Tq + Tr;
Tt = Tp + Ts;
Tu = T6 - Tb;
Tc = T6 + Tb;
Tn = Tf + Tm;
To = Tc + Tn;
T22 = KP300462606 * (Tc - Tn);
{
E T1Y, T1Z, TD, TE;
T1Y = TB + TC;
T1Z = Tq - Tr;
T20 = T1Y - T1Z;
T24 = T1Y + T1Z;
TD = KP866025403 * (TB - TC);
TE = FNMS(KP500000000, Ts, Tp);
TF = TD - TE;
TH = TD + TE;
}
{
E Ty, Tz, T1V, T1W;
Ty = Tw - Tx;
Tz = KP866025403 * (Ti - Tl);
TA = Ty + Tz;
TI = Ty - Tz;
T1V = Tw + Tx;
T1W = FNMS(KP500000000, Tm, Tf);
T1X = T1V - T1W;
T25 = T1V + T1W;
}
}
/*
 * Same reduction for ii[1..12] (T1r is the sum of all twelve); the
 * results again feed both output sections.
 */
{
E TZ, T2b, TV, T1i, T1a, TQ, T1h, T19, T12, T1d, T15, T1c, T16, T2c, TX;
E TY, TW, T17;
TX = ii[WS(is, 8)];
TY = ii[WS(is, 5)];
TZ = TX + TY;
T2b = TX - TY;
{
E TR, TS, TT, TU;
TR = ii[WS(is, 12)];
TS = ii[WS(is, 10)];
TT = ii[WS(is, 4)];
TU = TS + TT;
TV = FNMS(KP500000000, TU, TR);
T1i = TR + TU;
T1a = TS - TT;
}
{
E TM, TN, TO, TP;
TM = ii[WS(is, 1)];
TN = ii[WS(is, 3)];
TO = ii[WS(is, 9)];
TP = TN + TO;
TQ = FNMS(KP500000000, TP, TM);
T1h = TM + TP;
T19 = TN - TO;
}
{
E T10, T11, T13, T14;
T10 = ii[WS(is, 11)];
T11 = ii[WS(is, 6)];
T12 = T10 + T11;
T1d = T10 - T11;
T13 = ii[WS(is, 7)];
T14 = ii[WS(is, 2)];
T15 = T13 + T14;
T1c = T13 - T14;
}
T16 = T12 + T15;
T2c = T1d + T1c;
T2a = T1h - T1i;
T2d = T2b + T2c;
TW = TQ + TV;
T17 = FNMS(KP500000000, T16, TZ);
T18 = TW - T17;
T1n = TW + T17;
{
E T2i, T2j, T1j, T1k;
T2i = TQ - TV;
T2j = KP866025403 * (T15 - T12);
T2k = T2i + T2j;
T2n = T2i - T2j;
T1j = T1h + T1i;
T1k = TZ + T16;
T1l = KP300462606 * (T1j - T1k);
T1r = T1j + T1k;
}
{
E T1b, T1e, T2f, T2g;
T1b = T19 + T1a;
T1e = T1c - T1d;
T1f = T1b + T1e;
T1o = T1e - T1b;
T2f = FNMS(KP500000000, T2c, T2b);
T2g = KP866025403 * (T1a - T19);
T2h = T2f - T2g;
T2m = T2g + T2f;
}
}
/* Output 0 is the plain sum of all thirteen inputs of the matching array. */
ro[0] = T1 + To;
io[0] = T1q + T1r;
/* Outputs io[1..12], written in pairs that differ only in the sign of one term. */
{
E T1D, T1N, T1y, T1x, T1E, T1O, Tv, TK, T1J, T1Q, T1m, T1R, T1t, T1I, TG;
E TJ;
{
E T1B, T1C, T1v, T1w;
T1B = FMA(KP387390585, T1f, KP265966249 * T18);
T1C = FMA(KP113854479, T1o, KP503537032 * T1n);
T1D = T1B + T1C;
T1N = T1C - T1B;
T1y = FMA(KP575140729, Tu, KP174138601 * Tt);
T1v = FNMS(KP156891391, TH, KP256247671 * TI);
T1w = FMA(KP011599105, TF, KP300238635 * TA);
T1x = T1v - T1w;
T1E = T1y + T1x;
T1O = KP1_732050807 * (T1v + T1w);
}
Tv = FNMS(KP174138601, Tu, KP575140729 * Tt);
TG = FNMS(KP300238635, TF, KP011599105 * TA);
TJ = FMA(KP256247671, TH, KP156891391 * TI);
TK = TG - TJ;
T1J = KP1_732050807 * (TJ + TG);
T1Q = Tv - TK;
{
E T1g, T1H, T1p, T1s, T1G;
T1g = FNMS(KP132983124, T1f, KP258260390 * T18);
T1H = T1l - T1g;
T1p = FNMS(KP251768516, T1o, KP075902986 * T1n);
T1s = FNMS(KP083333333, T1r, T1q);
T1G = T1s - T1p;
T1m = FMA(KP2_000000000, T1g, T1l);
T1R = T1H + T1G;
T1t = FMA(KP2_000000000, T1p, T1s);
T1I = T1G - T1H;
}
{
E TL, T1u, T1P, T1S;
TL = FMA(KP2_000000000, TK, Tv);
T1u = T1m + T1t;
io[WS(os, 1)] = TL + T1u;
io[WS(os, 12)] = T1u - TL;
{
E T1z, T1A, T1T, T1U;
T1z = FMS(KP2_000000000, T1x, T1y);
T1A = T1t - T1m;
io[WS(os, 5)] = T1z + T1A;
io[WS(os, 8)] = T1A - T1z;
T1T = T1R - T1Q;
T1U = T1O + T1N;
io[WS(os, 4)] = T1T - T1U;
io[WS(os, 10)] = T1U + T1T;
}
T1P = T1N - T1O;
T1S = T1Q + T1R;
io[WS(os, 3)] = T1P + T1S;
io[WS(os, 9)] = T1S - T1P;
{
E T1L, T1M, T1F, T1K;
T1L = T1J + T1I;
T1M = T1E + T1D;
io[WS(os, 6)] = T1L - T1M;
io[WS(os, 11)] = T1M + T1L;
T1F = T1D - T1E;
T1K = T1I - T1J;
io[WS(os, 2)] = T1F + T1K;
io[WS(os, 7)] = T1K - T1F;
}
}
}
/* Outputs ro[1..12], same pairing scheme as the io section above. */
{
E T2y, T2I, T2J, T2K, T2B, T2L, T2e, T2p, T2u, T2G, T23, T2F, T28, T2t, T2l;
E T2o;
{
E T2w, T2x, T2z, T2A;
T2w = FMA(KP387390585, T20, KP265966249 * T1X);
T2x = FNMS(KP503537032, T25, KP113854479 * T24);
T2y = T2w + T2x;
T2I = T2w - T2x;
T2J = FMA(KP575140729, T2a, KP174138601 * T2d);
T2z = FNMS(KP300238635, T2n, KP011599105 * T2m);
T2A = FNMS(KP156891391, T2h, KP256247671 * T2k);
T2K = T2z + T2A;
T2B = KP1_732050807 * (T2z - T2A);
T2L = T2J + T2K;
}
T2e = FNMS(KP575140729, T2d, KP174138601 * T2a);
T2l = FMA(KP256247671, T2h, KP156891391 * T2k);
T2o = FMA(KP300238635, T2m, KP011599105 * T2n);
T2p = T2l - T2o;
T2u = T2e - T2p;
T2G = KP1_732050807 * (T2o + T2l);
{
E T21, T2r, T26, T27, T2s;
T21 = FNMS(KP132983124, T20, KP258260390 * T1X);
T2r = T22 - T21;
T26 = FMA(KP251768516, T24, KP075902986 * T25);
T27 = FNMS(KP083333333, To, T1);
T2s = T27 - T26;
T23 = FMA(KP2_000000000, T21, T22);
T2F = T2s - T2r;
T28 = FMA(KP2_000000000, T26, T27);
T2t = T2r + T2s;
}
{
E T29, T2q, T2N, T2O;
T29 = T23 + T28;
T2q = FMA(KP2_000000000, T2p, T2e);
ro[WS(os, 12)] = T29 - T2q;
ro[WS(os, 1)] = T29 + T2q;
{
E T2v, T2C, T2P, T2Q;
T2v = T2t - T2u;
T2C = T2y - T2B;
ro[WS(os, 10)] = T2v - T2C;
ro[WS(os, 4)] = T2v + T2C;
T2P = T28 - T23;
T2Q = FMS(KP2_000000000, T2K, T2J);
ro[WS(os, 5)] = T2P - T2Q;
ro[WS(os, 8)] = T2P + T2Q;
}
T2N = T2F - T2G;
T2O = T2L - T2I;
ro[WS(os, 11)] = T2N - T2O;
ro[WS(os, 6)] = T2N + T2O;
{
E T2H, T2M, T2D, T2E;
T2H = T2F + T2G;
T2M = T2I + T2L;
ro[WS(os, 7)] = T2H - T2M;
ro[WS(os, 2)] = T2H + T2M;
T2D = T2t + T2u;
T2E = T2y + T2B;
ro[WS(os, 3)] = T2D - T2E;
ro[WS(os, 9)] = T2D + T2E;
}
}
}
}
}
}
/*
 * Descriptor registered for this build variant of n1_13: size 13, the name
 * "n1_13", a count record and the GENUS object.  The first three count
 * entries (138, 30, 38) repeat the addition / multiplication / fused
 * multiply-add totals stated in the generator comment for this variant.
 * NOTE(review): the fourth count entry and the trailing four zeros are not
 * explained in this file -- confirm against the kdft_desc declaration.
 */
static const kdft_desc desc = { 13, "n1_13", { 138, 30, 38, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_13 and its descriptor to planner p. */
void X(codelet_n1_13) (planner *p) { X(kdft_register) (p, n1_13, &desc);
}
#endif
+513
View File
@@ -0,0 +1,513 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include dft/scalar/n.h */
/*
* This function contains 148 FP additions, 84 FP multiplications,
* (or, 64 additions, 0 multiplications, 84 fused multiply/add),
* 67 stack variables, 6 constants, and 56 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_14, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.  Machine-generated size-14 kernel
 * (generator options "-fma ... -n 14").
 *
 *   ri, ii  input arrays, read at offsets WS(is, k) for k = 0..13
 *   ro, io  output arrays, written at offsets WS(os, k) for k = 0..13
 *   v       number of loop iterations; after each one ri/ii advance by
 *           ivs and ro/io advance by ovs
 *
 * NOTE(review): ri/ro and ii/io are presumably the real and imaginary
 * parts of the complex data -- confirm in dft/codelet-dft.h.
 * NOTE(review): FMA / FNMS are project macros; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm before
 * relying on the per-line readings.
 */
static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Generator-emitted numeric constants; the name encodes the leading digits of the value. */
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
{
INT i;
/* MAKE_VOLATILE_STRIDE is a project macro; its effect is not visible in this file. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
E T3, Tp, T1b, T1x, T1i, T1L, T1M, T1j, T1k, T1K, Ta, To, Th, Tz, T14;
E TZ, Ts, Ty, Tv, T1Z, T2c, T27, TI, T23, T24, TP, TW, T22, T1c, T1e;
E T1d, T1f, T1s, T1n, T1A, T1G, T1D, T1H, T1U, T1P;
/*
 * Every input index k is paired with index (k + 7) mod 14; each pair
 * contributes one difference and one sum.  First the 0/7 pair of both
 * arrays.
 */
{
E T1, T2, T19, T1a;
T1 = ri[0];
T2 = ri[WS(is, 7)];
T3 = T1 - T2;
Tp = T1 + T2;
T19 = ii[0];
T1a = ii[WS(is, 7)];
T1b = T19 - T1a;
T1x = T19 + T1a;
}
/* Remaining six pairs of ri, followed by the combinations shared by several outputs. */
{
E T6, Tq, T9, Tr, Tn, Tx, Tk, Tw, Tg, Tu, Td, Tt;
{
E T4, T5, Ti, Tj;
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 9)];
T6 = T4 - T5;
Tq = T4 + T5;
{
E T7, T8, Tl, Tm;
T7 = ri[WS(is, 12)];
T8 = ri[WS(is, 5)];
T9 = T7 - T8;
Tr = T7 + T8;
Tl = ri[WS(is, 8)];
Tm = ri[WS(is, 1)];
Tn = Tl - Tm;
Tx = Tl + Tm;
}
Ti = ri[WS(is, 6)];
Tj = ri[WS(is, 13)];
Tk = Ti - Tj;
Tw = Ti + Tj;
{
E Te, Tf, Tb, Tc;
Te = ri[WS(is, 10)];
Tf = ri[WS(is, 3)];
Tg = Te - Tf;
Tu = Te + Tf;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 11)];
Td = Tb - Tc;
Tt = Tb + Tc;
}
}
T1i = Tn - Tk;
T1L = Tt - Tu;
T1M = Tr - Tq;
T1j = Tg - Td;
T1k = T9 - T6;
T1K = Tw - Tx;
Ta = T6 + T9;
To = Tk + Tn;
Th = Td + Tg;
Tz = FNMS(KP356895867, Th, Ta);
T14 = FNMS(KP356895867, To, Th);
TZ = FNMS(KP356895867, Ta, To);
Ts = Tq + Tr;
Ty = Tw + Tx;
Tv = Tt + Tu;
T1Z = FNMS(KP356895867, Ts, Ty);
T2c = FNMS(KP356895867, Ty, Tv);
T27 = FNMS(KP356895867, Tv, Ts);
}
/* Remaining six pairs of ii, reduced the same way. */
{
E TE, T1B, TH, T1C, TV, T1F, TS, T1E, TO, T1z, TL, T1y;
{
E TC, TD, TQ, TR;
TC = ii[WS(is, 4)];
TD = ii[WS(is, 11)];
TE = TC - TD;
T1B = TC + TD;
{
E TF, TG, TT, TU;
TF = ii[WS(is, 10)];
TG = ii[WS(is, 3)];
TH = TF - TG;
T1C = TF + TG;
TT = ii[WS(is, 8)];
TU = ii[WS(is, 1)];
TV = TT - TU;
T1F = TT + TU;
}
TQ = ii[WS(is, 6)];
TR = ii[WS(is, 13)];
TS = TQ - TR;
T1E = TQ + TR;
{
E TM, TN, TJ, TK;
TM = ii[WS(is, 12)];
TN = ii[WS(is, 5)];
TO = TM - TN;
T1z = TM + TN;
TJ = ii[WS(is, 2)];
TK = ii[WS(is, 9)];
TL = TJ - TK;
T1y = TJ + TK;
}
}
TI = TE - TH;
T23 = T1F - T1E;
T24 = T1C - T1B;
TP = TL - TO;
TW = TS - TV;
T22 = T1y - T1z;
T1c = TL + TO;
T1e = TS + TV;
T1d = TE + TH;
T1f = FNMS(KP356895867, T1e, T1d);
T1s = FNMS(KP356895867, T1d, T1c);
T1n = FNMS(KP356895867, T1c, T1e);
T1A = T1y + T1z;
T1G = T1E + T1F;
T1D = T1B + T1C;
T1H = FNMS(KP356895867, T1G, T1D);
T1U = FNMS(KP356895867, T1D, T1A);
T1P = FNMS(KP356895867, T1A, T1G);
}
/* Outputs 7 and 0 need additions only: 7 sums the pair differences, 0 sums the pair sums. */
ro[WS(os, 7)] = T3 + Ta + Th + To;
io[WS(os, 7)] = T1b + T1c + T1d + T1e;
ro[0] = Tp + Ts + Tv + Ty;
io[0] = T1x + T1A + T1D + T1G;
/*
 * Each block below produces two outputs of one array that share a common
 * term and differ only in the sign of the KP974927912-scaled term.
 * Odd-numbered outputs start from the pair differences, even-numbered
 * outputs from the pair sums.
 */
{
E TB, TY, TA, TX;
TA = FNMS(KP692021471, Tz, To);
TB = FNMS(KP900968867, TA, T3);
TX = FMA(KP554958132, TW, TP);
TY = FMA(KP801937735, TX, TI);
ro[WS(os, 13)] = FNMS(KP974927912, TY, TB);
ro[WS(os, 1)] = FMA(KP974927912, TY, TB);
}
{
E T1u, T1w, T1t, T1v;
T1t = FNMS(KP692021471, T1s, T1e);
T1u = FNMS(KP900968867, T1t, T1b);
T1v = FMA(KP554958132, T1i, T1k);
T1w = FMA(KP801937735, T1v, T1j);
io[WS(os, 1)] = FMA(KP974927912, T1w, T1u);
io[WS(os, 13)] = FNMS(KP974927912, T1w, T1u);
}
{
E T11, T13, T10, T12;
T10 = FNMS(KP692021471, TZ, Th);
T11 = FNMS(KP900968867, T10, T3);
T12 = FMA(KP554958132, TI, TW);
T13 = FNMS(KP801937735, T12, TP);
ro[WS(os, 5)] = FNMS(KP974927912, T13, T11);
ro[WS(os, 9)] = FMA(KP974927912, T13, T11);
}
{
E T1p, T1r, T1o, T1q;
T1o = FNMS(KP692021471, T1n, T1d);
T1p = FNMS(KP900968867, T1o, T1b);
T1q = FMA(KP554958132, T1j, T1i);
T1r = FNMS(KP801937735, T1q, T1k);
io[WS(os, 5)] = FNMS(KP974927912, T1r, T1p);
io[WS(os, 9)] = FMA(KP974927912, T1r, T1p);
}
{
E T16, T18, T15, T17;
T15 = FNMS(KP692021471, T14, Ta);
T16 = FNMS(KP900968867, T15, T3);
T17 = FNMS(KP554958132, TP, TI);
T18 = FNMS(KP801937735, T17, TW);
ro[WS(os, 11)] = FNMS(KP974927912, T18, T16);
ro[WS(os, 3)] = FMA(KP974927912, T18, T16);
}
{
E T1h, T1m, T1g, T1l;
T1g = FNMS(KP692021471, T1f, T1c);
T1h = FNMS(KP900968867, T1g, T1b);
T1l = FNMS(KP554958132, T1k, T1j);
T1m = FNMS(KP801937735, T1l, T1i);
io[WS(os, 3)] = FMA(KP974927912, T1m, T1h);
io[WS(os, 11)] = FNMS(KP974927912, T1m, T1h);
}
{
E T1J, T1O, T1I, T1N;
T1I = FNMS(KP692021471, T1H, T1A);
T1J = FNMS(KP900968867, T1I, T1x);
T1N = FMA(KP554958132, T1M, T1L);
T1O = FNMS(KP801937735, T1N, T1K);
io[WS(os, 4)] = FMA(KP974927912, T1O, T1J);
io[WS(os, 10)] = FNMS(KP974927912, T1O, T1J);
}
{
E T2e, T2g, T2d, T2f;
T2d = FNMS(KP692021471, T2c, Ts);
T2e = FNMS(KP900968867, T2d, Tp);
T2f = FMA(KP554958132, T22, T24);
T2g = FNMS(KP801937735, T2f, T23);
ro[WS(os, 10)] = FNMS(KP974927912, T2g, T2e);
ro[WS(os, 4)] = FMA(KP974927912, T2g, T2e);
}
{
E T1R, T1T, T1Q, T1S;
T1Q = FNMS(KP692021471, T1P, T1D);
T1R = FNMS(KP900968867, T1Q, T1x);
T1S = FMA(KP554958132, T1L, T1K);
T1T = FMA(KP801937735, T1S, T1M);
io[WS(os, 2)] = FMA(KP974927912, T1T, T1R);
io[WS(os, 12)] = FNMS(KP974927912, T1T, T1R);
}
{
E T21, T26, T20, T25;
T20 = FNMS(KP692021471, T1Z, Tv);
T21 = FNMS(KP900968867, T20, Tp);
T25 = FMA(KP554958132, T24, T23);
T26 = FMA(KP801937735, T25, T22);
ro[WS(os, 12)] = FNMS(KP974927912, T26, T21);
ro[WS(os, 2)] = FMA(KP974927912, T26, T21);
}
{
E T1W, T1Y, T1V, T1X;
T1V = FNMS(KP692021471, T1U, T1G);
T1W = FNMS(KP900968867, T1V, T1x);
T1X = FNMS(KP554958132, T1K, T1M);
T1Y = FNMS(KP801937735, T1X, T1L);
io[WS(os, 6)] = FMA(KP974927912, T1Y, T1W);
io[WS(os, 8)] = FNMS(KP974927912, T1Y, T1W);
}
{
E T29, T2b, T28, T2a;
T28 = FNMS(KP692021471, T27, Ty);
T29 = FNMS(KP900968867, T28, Tp);
T2a = FNMS(KP554958132, T23, T22);
T2b = FNMS(KP801937735, T2a, T24);
ro[WS(os, 8)] = FNMS(KP974927912, T2b, T29);
ro[WS(os, 6)] = FMA(KP974927912, T2b, T29);
}
}
}
}
/*
 * Descriptor registered for this build variant of n1_14: size 14, the name
 * "n1_14", a count record and the GENUS object.  The first three count
 * entries (64, 0, 84) repeat the addition / multiplication / fused
 * multiply-add totals stated in the generator comment for this variant.
 * NOTE(review): the fourth count entry and the trailing four zeros are not
 * explained in this file -- confirm against the kdft_desc declaration.
 */
static const kdft_desc desc = { 14, "n1_14", { 64, 0, 84, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration entry point: hands n1_14 and its descriptor to planner p. */
void X(codelet_n1_14) (planner *p) { X(kdft_register) (p, n1_14, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include dft/scalar/n.h */
/*
* This function contains 148 FP additions, 72 FP multiplications,
* (or, 100 additions, 24 multiplications, 48 fused multiply/add),
* 43 stack variables, 6 constants, and 56 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_14, variant compiled when neither FMA-preference macro is defined.
 * Machine-generated size-14 kernel (generator option "-n 14").
 *
 *   ri, ii  input arrays, read at offsets WS(is, k) for k = 0..13
 *   ro, io  output arrays, written at offsets WS(os, k) for k = 0..13
 *   v       number of loop iterations; after each one ri/ii advance by
 *           ivs and ro/io advance by ovs
 *
 * NOTE(review): ri/ro and ii/io are presumably the real and imaginary
 * parts of the complex data -- confirm in dft/codelet-dft.h.
 * NOTE(review): FMA / FNMS / FNMA are project macros; presumably
 * FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b, FNMA(a,b,c) = -(a*b) - c --
 * confirm before relying on the per-line readings.
 */
static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Generator-emitted numeric constants; the name encodes the leading digits of the value. */
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
{
INT i;
/* MAKE_VOLATILE_STRIDE is a project macro; its effect is not visible in this file. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
E T3, Tp, T16, T1f, Ta, T1q, Ts, T10, TG, T1z, T19, T1i, Th, T1s, Tv;
E T12, TU, T1B, T17, T1o, To, T1r, Ty, T11, TN, T1A, T18, T1l;
/*
 * Every input index k is paired with index (k + 7) mod 14; each pair
 * contributes one difference and one sum.  First the 0/7 pair of both
 * arrays.
 */
{
E T1, T2, T14, T15;
T1 = ri[0];
T2 = ri[WS(is, 7)];
T3 = T1 - T2;
Tp = T1 + T2;
T14 = ii[0];
T15 = ii[WS(is, 7)];
T16 = T14 - T15;
T1f = T14 + T15;
}
/* Pairs 2/9 and 12/5 of ri, then of ii. */
{
E T6, Tq, T9, Tr;
{
E T4, T5, T7, T8;
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 9)];
T6 = T4 - T5;
Tq = T4 + T5;
T7 = ri[WS(is, 12)];
T8 = ri[WS(is, 5)];
T9 = T7 - T8;
Tr = T7 + T8;
}
Ta = T6 + T9;
T1q = Tr - Tq;
Ts = Tq + Tr;
T10 = T9 - T6;
}
{
E TC, T1g, TF, T1h;
{
E TA, TB, TD, TE;
TA = ii[WS(is, 2)];
TB = ii[WS(is, 9)];
TC = TA - TB;
T1g = TA + TB;
TD = ii[WS(is, 12)];
TE = ii[WS(is, 5)];
TF = TD - TE;
T1h = TD + TE;
}
TG = TC - TF;
T1z = T1g - T1h;
T19 = TC + TF;
T1i = T1g + T1h;
}
/* Pairs 4/11 and 10/3 of ri, then of ii. */
{
E Td, Tt, Tg, Tu;
{
E Tb, Tc, Te, Tf;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 11)];
Td = Tb - Tc;
Tt = Tb + Tc;
Te = ri[WS(is, 10)];
Tf = ri[WS(is, 3)];
Tg = Te - Tf;
Tu = Te + Tf;
}
Th = Td + Tg;
T1s = Tt - Tu;
Tv = Tt + Tu;
T12 = Tg - Td;
}
{
E TQ, T1m, TT, T1n;
{
E TO, TP, TR, TS;
TO = ii[WS(is, 4)];
TP = ii[WS(is, 11)];
TQ = TO - TP;
T1m = TO + TP;
TR = ii[WS(is, 10)];
TS = ii[WS(is, 3)];
TT = TR - TS;
T1n = TR + TS;
}
TU = TQ - TT;
T1B = T1n - T1m;
T17 = TQ + TT;
T1o = T1m + T1n;
}
/* Pairs 6/13 and 8/1 of ri, then of ii. */
{
E Tk, Tw, Tn, Tx;
{
E Ti, Tj, Tl, Tm;
Ti = ri[WS(is, 6)];
Tj = ri[WS(is, 13)];
Tk = Ti - Tj;
Tw = Ti + Tj;
Tl = ri[WS(is, 8)];
Tm = ri[WS(is, 1)];
Tn = Tl - Tm;
Tx = Tl + Tm;
}
To = Tk + Tn;
T1r = Tw - Tx;
Ty = Tw + Tx;
T11 = Tn - Tk;
}
{
E TJ, T1j, TM, T1k;
{
E TH, TI, TK, TL;
TH = ii[WS(is, 6)];
TI = ii[WS(is, 13)];
TJ = TH - TI;
T1j = TH + TI;
TK = ii[WS(is, 8)];
TL = ii[WS(is, 1)];
TM = TK - TL;
T1k = TK + TL;
}
TN = TJ - TM;
T1A = T1k - T1j;
T18 = TJ + TM;
T1l = T1j + T1k;
}
/* Outputs 7 and 0 need additions only: 7 sums the pair differences, 0 sums the pair sums. */
ro[WS(os, 7)] = T3 + Ta + Th + To;
io[WS(os, 7)] = T16 + T19 + T17 + T18;
ro[0] = Tp + Ts + Tv + Ty;
io[0] = T1f + T1i + T1o + T1l;
/*
 * Each block below produces two ro outputs and two io outputs; within a
 * pair the two outputs share one term and differ in the sign of the
 * other.  The first three blocks (odd outputs) use the pair differences,
 * the last three (even outputs) use the pair sums.
 */
{
E TV, Tz, T1e, T1d;
TV = FNMS(KP781831482, TN, KP974927912 * TG) - (KP433883739 * TU);
Tz = FMA(KP623489801, To, T3) + FNMA(KP900968867, Th, KP222520933 * Ta);
ro[WS(os, 5)] = Tz - TV;
ro[WS(os, 9)] = Tz + TV;
T1e = FNMS(KP781831482, T11, KP974927912 * T10) - (KP433883739 * T12);
T1d = FMA(KP623489801, T18, T16) + FNMA(KP900968867, T17, KP222520933 * T19);
io[WS(os, 5)] = T1d - T1e;
io[WS(os, 9)] = T1e + T1d;
}
{
E TX, TW, T1b, T1c;
TX = FMA(KP781831482, TG, KP974927912 * TU) + (KP433883739 * TN);
TW = FMA(KP623489801, Ta, T3) + FNMA(KP900968867, To, KP222520933 * Th);
ro[WS(os, 13)] = TW - TX;
ro[WS(os, 1)] = TW + TX;
T1b = FMA(KP781831482, T10, KP974927912 * T12) + (KP433883739 * T11);
T1c = FMA(KP623489801, T19, T16) + FNMA(KP900968867, T18, KP222520933 * T17);
io[WS(os, 1)] = T1b + T1c;
io[WS(os, 13)] = T1c - T1b;
}
{
E TZ, TY, T13, T1a;
TZ = FMA(KP433883739, TG, KP974927912 * TN) - (KP781831482 * TU);
TY = FMA(KP623489801, Th, T3) + FNMA(KP222520933, To, KP900968867 * Ta);
ro[WS(os, 11)] = TY - TZ;
ro[WS(os, 3)] = TY + TZ;
T13 = FMA(KP433883739, T10, KP974927912 * T11) - (KP781831482 * T12);
T1a = FMA(KP623489801, T17, T16) + FNMA(KP222520933, T18, KP900968867 * T19);
io[WS(os, 3)] = T13 + T1a;
io[WS(os, 11)] = T1a - T13;
}
{
E T1t, T1p, T1C, T1y;
T1t = FNMS(KP433883739, T1r, KP781831482 * T1q) - (KP974927912 * T1s);
T1p = FMA(KP623489801, T1i, T1f) + FNMA(KP900968867, T1l, KP222520933 * T1o);
io[WS(os, 6)] = T1p - T1t;
io[WS(os, 8)] = T1t + T1p;
T1C = FNMS(KP433883739, T1A, KP781831482 * T1z) - (KP974927912 * T1B);
T1y = FMA(KP623489801, Ts, Tp) + FNMA(KP900968867, Ty, KP222520933 * Tv);
ro[WS(os, 6)] = T1y - T1C;
ro[WS(os, 8)] = T1y + T1C;
}
{
E T1v, T1u, T1E, T1D;
T1v = FMA(KP433883739, T1q, KP781831482 * T1s) - (KP974927912 * T1r);
T1u = FMA(KP623489801, T1o, T1f) + FNMA(KP222520933, T1l, KP900968867 * T1i);
io[WS(os, 4)] = T1u - T1v;
io[WS(os, 10)] = T1v + T1u;
T1E = FMA(KP433883739, T1z, KP781831482 * T1B) - (KP974927912 * T1A);
T1D = FMA(KP623489801, Tv, Tp) + FNMA(KP222520933, Ty, KP900968867 * Ts);
ro[WS(os, 4)] = T1D - T1E;
ro[WS(os, 10)] = T1D + T1E;
}
{
E T1w, T1x, T1G, T1F;
T1w = FMA(KP974927912, T1q, KP433883739 * T1s) + (KP781831482 * T1r);
T1x = FMA(KP623489801, T1l, T1f) + FNMA(KP900968867, T1o, KP222520933 * T1i);
io[WS(os, 2)] = T1w + T1x;
io[WS(os, 12)] = T1x - T1w;
T1G = FMA(KP974927912, T1z, KP433883739 * T1B) + (KP781831482 * T1A);
T1F = FMA(KP623489801, Ty, Tp) + FNMA(KP900968867, Tv, KP222520933 * Ts);
ro[WS(os, 12)] = T1F - T1G;
ro[WS(os, 2)] = T1F + T1G;
}
}
}
}
/*
 * Descriptor passed to the registration call below.  It carries the value
 * 14, the codelet name "n1_14" and the tuple { 100, 24, 48, 0 }.
 * NOTE(review): presumably these are the transform size and the
 * add / mul / fma / other operation counts; the kdft_desc declaration is
 * not visible here, so confirm the field meanings (including &GENUS and
 * the four trailing zeros) against the project headers.
 */
static const kdft_desc desc = { 14, "n1_14", { 100, 24, 48, 0 }, &GENUS, 0, 0, 0, 0 };
/* Hands the n1_14 kernel and its descriptor to X(kdft_register) for planner p. */
void X(codelet_n1_14) (planner *p) { X(kdft_register) (p, n1_14, &desc);
}
#endif
+554
View File
@@ -0,0 +1,554 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include dft/scalar/n.h */
/*
* This function contains 156 FP additions, 84 FP multiplications,
* (or, 72 additions, 0 multiplications, 84 fused multiply/add),
* 69 stack variables, 6 constants, and 60 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_15 (FMA variant): straight-line size-15 kernel produced by genfft
 * (gen_notw -fma -n 15, see the generator comment above).
 *
 * Every loop iteration reads 15 real values from ri and 15 imaginary
 * values from ii at indices WS(is, 0..14), and writes 15 values to each of
 * ro and io at indices WS(os, 0..14).
 *
 *   ri, ii   - real / imaginary input arrays (read only)
 *   ro, io   - real / imaginary output arrays
 *   is, os   - input / output strides, applied through the WS() macro
 *   v        - number of loop iterations to run
 *   ivs, ovs - added to the input / output pointers after each iteration
 *
 * NOTE(review): FMA and FNMS are macros defined outside this file.
 * Comparing with the non-FMA variant of this kernel suggests
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b; confirm in the
 * project headers.
 */
static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numeric constants: sin(2*pi/5), sqrt(5)/4, (sqrt(5)-1)/2, 1/4, sqrt(3)/2, 1/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/*
 * One pass per iteration; the four data pointers advance by ivs / ovs.
 * MAKE_VOLATILE_STRIDE is a project macro applied to both strides on every
 * pass; its definition is not visible here.
 */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
E T5, T2l, Tx, TV, T1z, T1X, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
E T1O, T1P, T1Z, T1l, T1q, T1B, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
E T2f, T2g, T2m, T1R, T1S, T1Y, T1a, T1f, T1A, TW, TX, TY;
/* Inputs 0, 5, 10: plain sum (T5, T2l) plus the -1/2 and +-sqrt(3)/2 combinations. */
{
E T1, T1v, T4, T1y, Tw, T1w, Tt, T1x;
T1 = ri[0];
T1v = ii[0];
{
E T2, T3, Tu, Tv;
T2 = ri[WS(is, 5)];
T3 = ri[WS(is, 10)];
T4 = T2 + T3;
T1y = T3 - T2;
Tu = ii[WS(is, 5)];
Tv = ii[WS(is, 10)];
Tw = Tu - Tv;
T1w = Tu + Tv;
}
T5 = T1 + T4;
T2l = T1v + T1w;
Tt = FNMS(KP500000000, T4, T1);
Tx = FNMS(KP866025403, Tw, Tt);
TV = FMA(KP866025403, Tw, Tt);
T1x = FNMS(KP500000000, T1w, T1v);
T1z = FMA(KP866025403, T1y, T1x);
T1X = FNMS(KP866025403, T1y, T1x);
}
/* Inputs 6, 11, 1 and 9, 14, 4: same three-term combinations, then pairwise sums. */
{
E Th, Tk, TJ, T1k, T1h, T1i, TM, T1j, Tm, Tp, TO, T1p, T1m, T1n, TR;
E T1o;
{
E Ti, Tj, TK, TL;
Th = ri[WS(is, 6)];
Ti = ri[WS(is, 11)];
Tj = ri[WS(is, 1)];
Tk = Ti + Tj;
TJ = FNMS(KP500000000, Tk, Th);
T1k = Tj - Ti;
T1h = ii[WS(is, 6)];
TK = ii[WS(is, 11)];
TL = ii[WS(is, 1)];
T1i = TK + TL;
TM = TK - TL;
T1j = FNMS(KP500000000, T1i, T1h);
}
{
E Tn, To, TP, TQ;
Tm = ri[WS(is, 9)];
Tn = ri[WS(is, 14)];
To = ri[WS(is, 4)];
Tp = Tn + To;
TO = FNMS(KP500000000, Tp, Tm);
T1p = To - Tn;
T1m = ii[WS(is, 9)];
TP = ii[WS(is, 14)];
TQ = ii[WS(is, 4)];
T1n = TP + TQ;
TR = TP - TQ;
T1o = FNMS(KP500000000, T1n, T1m);
}
Tl = Th + Tk;
Tq = Tm + Tp;
Tr = Tl + Tq;
TN = FNMS(KP866025403, TM, TJ);
TS = FNMS(KP866025403, TR, TO);
TT = TN + TS;
T2c = T1h + T1i;
T2d = T1m + T1n;
T2n = T2c + T2d;
T1O = FNMS(KP866025403, T1k, T1j);
T1P = FNMS(KP866025403, T1p, T1o);
T1Z = T1O + T1P;
T1l = FMA(KP866025403, T1k, T1j);
T1q = FMA(KP866025403, T1p, T1o);
T1B = T1l + T1q;
TZ = FMA(KP866025403, TM, TJ);
T10 = FMA(KP866025403, TR, TO);
T11 = TZ + T10;
}
/* Inputs 3, 8, 13 and 12, 2, 7: same three-term combinations, then pairwise sums. */
{
E T6, T9, Ty, T19, T16, T17, TB, T18, Tb, Te, TD, T1e, T1b, T1c, TG;
E T1d;
{
E T7, T8, Tz, TA;
T6 = ri[WS(is, 3)];
T7 = ri[WS(is, 8)];
T8 = ri[WS(is, 13)];
T9 = T7 + T8;
Ty = FNMS(KP500000000, T9, T6);
T19 = T8 - T7;
T16 = ii[WS(is, 3)];
Tz = ii[WS(is, 8)];
TA = ii[WS(is, 13)];
T17 = Tz + TA;
TB = Tz - TA;
T18 = FNMS(KP500000000, T17, T16);
}
{
E Tc, Td, TE, TF;
Tb = ri[WS(is, 12)];
Tc = ri[WS(is, 2)];
Td = ri[WS(is, 7)];
Te = Tc + Td;
TD = FNMS(KP500000000, Te, Tb);
T1e = Td - Tc;
T1b = ii[WS(is, 12)];
TE = ii[WS(is, 2)];
TF = ii[WS(is, 7)];
T1c = TE + TF;
TG = TE - TF;
T1d = FNMS(KP500000000, T1c, T1b);
}
Ta = T6 + T9;
Tf = Tb + Te;
Tg = Ta + Tf;
TC = FNMS(KP866025403, TB, Ty);
TH = FNMS(KP866025403, TG, TD);
TI = TC + TH;
T2f = T16 + T17;
T2g = T1b + T1c;
T2m = T2f + T2g;
T1R = FNMS(KP866025403, T19, T18);
T1S = FNMS(KP866025403, T1e, T1d);
T1Y = T1R + T1S;
T1a = FMA(KP866025403, T19, T18);
T1f = FMA(KP866025403, T1e, T1d);
T1A = T1a + T1f;
TW = FMA(KP866025403, TB, Ty);
TX = FMA(KP866025403, TG, TD);
TY = TW + TX;
}
/* Real outputs 0, 9, 6, 12, 3. */
{
E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
T2a = Tg - Tr;
Ts = Tg + Tr;
T29 = FNMS(KP250000000, Ts, T5);
T2e = T2c - T2d;
T2h = T2f - T2g;
T2i = FNMS(KP618033988, T2h, T2e);
T2k = FMA(KP618033988, T2e, T2h);
ro[0] = T5 + Ts;
T2j = FMA(KP559016994, T2a, T29);
ro[WS(os, 9)] = FNMS(KP951056516, T2k, T2j);
ro[WS(os, 6)] = FMA(KP951056516, T2k, T2j);
T2b = FNMS(KP559016994, T2a, T29);
ro[WS(os, 12)] = FNMS(KP951056516, T2i, T2b);
ro[WS(os, 3)] = FMA(KP951056516, T2i, T2b);
}
/* Imaginary outputs 0, 6, 9, 3, 12. */
{
E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
T2q = T2m - T2n;
T2o = T2m + T2n;
T2p = FNMS(KP250000000, T2o, T2l);
T2s = Tl - Tq;
T2t = Ta - Tf;
T2u = FNMS(KP618033988, T2t, T2s);
T2w = FMA(KP618033988, T2s, T2t);
io[0] = T2l + T2o;
T2v = FMA(KP559016994, T2q, T2p);
io[WS(os, 6)] = FNMS(KP951056516, T2w, T2v);
io[WS(os, 9)] = FMA(KP951056516, T2w, T2v);
T2r = FNMS(KP559016994, T2q, T2p);
io[WS(os, 3)] = FNMS(KP951056516, T2u, T2r);
io[WS(os, 12)] = FMA(KP951056516, T2u, T2r);
}
/* Real outputs 5, 14, 11, 2, 8. */
{
E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
T1M = TI - TT;
TU = TI + TT;
T1L = FNMS(KP250000000, TU, Tx);
T1Q = T1O - T1P;
T1T = T1R - T1S;
T1U = FNMS(KP618033988, T1T, T1Q);
T1W = FMA(KP618033988, T1Q, T1T);
ro[WS(os, 5)] = Tx + TU;
T1V = FMA(KP559016994, T1M, T1L);
ro[WS(os, 14)] = FNMS(KP951056516, T1W, T1V);
ro[WS(os, 11)] = FMA(KP951056516, T1W, T1V);
T1N = FNMS(KP559016994, T1M, T1L);
ro[WS(os, 2)] = FNMS(KP951056516, T1U, T1N);
ro[WS(os, 8)] = FMA(KP951056516, T1U, T1N);
}
/* Imaginary outputs 5, 11, 14, 2, 8. */
{
E T22, T20, T21, T26, T28, T24, T25, T27, T23;
T22 = T1Y - T1Z;
T20 = T1Y + T1Z;
T21 = FNMS(KP250000000, T20, T1X);
T24 = TN - TS;
T25 = TC - TH;
T26 = FNMS(KP618033988, T25, T24);
T28 = FMA(KP618033988, T24, T25);
io[WS(os, 5)] = T1X + T20;
T27 = FMA(KP559016994, T22, T21);
io[WS(os, 11)] = FNMS(KP951056516, T28, T27);
io[WS(os, 14)] = FMA(KP951056516, T28, T27);
T23 = FNMS(KP559016994, T22, T21);
io[WS(os, 2)] = FMA(KP951056516, T26, T23);
io[WS(os, 8)] = FNMS(KP951056516, T26, T23);
}
/* Imaginary outputs 10, 7, 13, 1, 4. */
{
E T1E, T1C, T1D, T1I, T1K, T1G, T1H, T1J, T1F;
T1E = T1A - T1B;
T1C = T1A + T1B;
T1D = FNMS(KP250000000, T1C, T1z);
T1G = TW - TX;
T1H = TZ - T10;
T1I = FMA(KP618033988, T1H, T1G);
T1K = FNMS(KP618033988, T1G, T1H);
io[WS(os, 10)] = T1z + T1C;
T1J = FNMS(KP559016994, T1E, T1D);
io[WS(os, 7)] = FMA(KP951056516, T1K, T1J);
io[WS(os, 13)] = FNMS(KP951056516, T1K, T1J);
T1F = FMA(KP559016994, T1E, T1D);
io[WS(os, 1)] = FNMS(KP951056516, T1I, T1F);
io[WS(os, 4)] = FMA(KP951056516, T1I, T1F);
}
/* Real outputs 10, 7, 13, 4, 1. */
{
E T14, T12, T13, T1s, T1u, T1g, T1r, T1t, T15;
T14 = TY - T11;
T12 = TY + T11;
T13 = FNMS(KP250000000, T12, TV);
T1g = T1a - T1f;
T1r = T1l - T1q;
T1s = FMA(KP618033988, T1r, T1g);
T1u = FNMS(KP618033988, T1g, T1r);
ro[WS(os, 10)] = TV + T12;
T1t = FNMS(KP559016994, T14, T13);
ro[WS(os, 7)] = FNMS(KP951056516, T1u, T1t);
ro[WS(os, 13)] = FMA(KP951056516, T1u, T1t);
T15 = FMA(KP559016994, T14, T13);
ro[WS(os, 4)] = FNMS(KP951056516, T1s, T15);
ro[WS(os, 1)] = FMA(KP951056516, T1s, T15);
}
}
}
}
/*
 * Descriptor for the FMA variant of n1_15: the value 15, the name "n1_15"
 * and { 72, 0, 84, 0 }, which matches the "72 additions, 0 multiplications,
 * 84 fused multiply/add" count in the generator comment of this variant.
 * NOTE(review): the kdft_desc declaration is not visible here; confirm the
 * meaning of &GENUS and the trailing zero fields against the headers.
 */
static const kdft_desc desc = { 15, "n1_15", { 72, 0, 84, 0 }, &GENUS, 0, 0, 0, 0 };
/* Hands the n1_15 kernel and its descriptor to X(kdft_register) for planner p. */
void X(codelet_n1_15) (planner *p) { X(kdft_register) (p, n1_15, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include dft/scalar/n.h */
/*
* This function contains 156 FP additions, 56 FP multiplications,
* (or, 128 additions, 28 multiplications, 28 fused multiply/add),
* 69 stack variables, 6 constants, and 60 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_15 (non-FMA variant): straight-line size-15 kernel produced by genfft
 * (gen_notw -n 15, see the generator comment above).  It is compiled when
 * neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Every loop iteration reads 15 real values from ri and 15 imaginary
 * values from ii at indices WS(is, 0..14), and writes 15 values to each of
 * ro and io at indices WS(os, 0..14).
 *
 *   ri, ii   - real / imaginary input arrays (read only)
 *   ro, io   - real / imaginary output arrays
 *   is, os   - input / output strides, applied through the WS() macro
 *   v        - number of loop iterations to run
 *   ivs, ovs - added to the input / output pointers after each iteration
 *
 * NOTE(review): FMA and FNMS are macros defined outside this file,
 * presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b; confirm
 * in the project headers.
 */
static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numeric constants: sin(pi/5), sin(2*pi/5), 1/4, sqrt(5)/4, 1/2, sqrt(3)/2. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT i;
/*
 * One pass per iteration; the four data pointers advance by ivs / ovs.
 * MAKE_VOLATILE_STRIDE is a project macro applied to both strides on every
 * pass; its definition is not visible here.
 */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
E T5, T2l, Tx, TV, T1C, T20, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
E T1O, T1P, T22, T1l, T1q, T1w, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
E T2f, T2g, T2m, T1R, T1S, T21, T1a, T1f, T1v, TW, TX, TY;
/* Inputs 0, 5, 10: plain sum (T5, T2l) plus the -1/2 and +-sqrt(3)/2 combinations. */
{
E T1, T1z, T4, T1y, Tw, T1A, Tt, T1B;
T1 = ri[0];
T1z = ii[0];
{
E T2, T3, Tu, Tv;
T2 = ri[WS(is, 5)];
T3 = ri[WS(is, 10)];
T4 = T2 + T3;
T1y = KP866025403 * (T3 - T2);
Tu = ii[WS(is, 5)];
Tv = ii[WS(is, 10)];
Tw = KP866025403 * (Tu - Tv);
T1A = Tu + Tv;
}
T5 = T1 + T4;
T2l = T1z + T1A;
Tt = FNMS(KP500000000, T4, T1);
Tx = Tt - Tw;
TV = Tt + Tw;
T1B = FNMS(KP500000000, T1A, T1z);
T1C = T1y + T1B;
T20 = T1B - T1y;
}
/* Inputs 6, 11, 1 and 9, 14, 4: same three-term combinations, then pairwise sums. */
{
E Th, Tk, TJ, T1h, T1i, T1j, TM, T1k, Tm, Tp, TO, T1m, T1n, T1o, TR;
E T1p;
{
E Ti, Tj, TK, TL;
Th = ri[WS(is, 6)];
Ti = ri[WS(is, 11)];
Tj = ri[WS(is, 1)];
Tk = Ti + Tj;
TJ = FNMS(KP500000000, Tk, Th);
T1h = KP866025403 * (Tj - Ti);
T1i = ii[WS(is, 6)];
TK = ii[WS(is, 11)];
TL = ii[WS(is, 1)];
T1j = TK + TL;
TM = KP866025403 * (TK - TL);
T1k = FNMS(KP500000000, T1j, T1i);
}
{
E Tn, To, TP, TQ;
Tm = ri[WS(is, 9)];
Tn = ri[WS(is, 14)];
To = ri[WS(is, 4)];
Tp = Tn + To;
TO = FNMS(KP500000000, Tp, Tm);
T1m = KP866025403 * (To - Tn);
T1n = ii[WS(is, 9)];
TP = ii[WS(is, 14)];
TQ = ii[WS(is, 4)];
T1o = TP + TQ;
TR = KP866025403 * (TP - TQ);
T1p = FNMS(KP500000000, T1o, T1n);
}
Tl = Th + Tk;
Tq = Tm + Tp;
Tr = Tl + Tq;
TN = TJ - TM;
TS = TO - TR;
TT = TN + TS;
T2c = T1i + T1j;
T2d = T1n + T1o;
T2n = T2c + T2d;
T1O = T1k - T1h;
T1P = T1p - T1m;
T22 = T1O + T1P;
T1l = T1h + T1k;
T1q = T1m + T1p;
T1w = T1l + T1q;
TZ = TJ + TM;
T10 = TO + TR;
T11 = TZ + T10;
}
/* Inputs 3, 8, 13 and 12, 2, 7: same three-term combinations, then pairwise sums. */
{
E T6, T9, Ty, T16, T17, T18, TB, T19, Tb, Te, TD, T1b, T1c, T1d, TG;
E T1e;
{
E T7, T8, Tz, TA;
T6 = ri[WS(is, 3)];
T7 = ri[WS(is, 8)];
T8 = ri[WS(is, 13)];
T9 = T7 + T8;
Ty = FNMS(KP500000000, T9, T6);
T16 = KP866025403 * (T8 - T7);
T17 = ii[WS(is, 3)];
Tz = ii[WS(is, 8)];
TA = ii[WS(is, 13)];
T18 = Tz + TA;
TB = KP866025403 * (Tz - TA);
T19 = FNMS(KP500000000, T18, T17);
}
{
E Tc, Td, TE, TF;
Tb = ri[WS(is, 12)];
Tc = ri[WS(is, 2)];
Td = ri[WS(is, 7)];
Te = Tc + Td;
TD = FNMS(KP500000000, Te, Tb);
T1b = KP866025403 * (Td - Tc);
T1c = ii[WS(is, 12)];
TE = ii[WS(is, 2)];
TF = ii[WS(is, 7)];
T1d = TE + TF;
TG = KP866025403 * (TE - TF);
T1e = FNMS(KP500000000, T1d, T1c);
}
Ta = T6 + T9;
Tf = Tb + Te;
Tg = Ta + Tf;
TC = Ty - TB;
TH = TD - TG;
TI = TC + TH;
T2f = T17 + T18;
T2g = T1c + T1d;
T2m = T2f + T2g;
T1R = T19 - T16;
T1S = T1e - T1b;
T21 = T1R + T1S;
T1a = T16 + T19;
T1f = T1b + T1e;
T1v = T1a + T1f;
TW = Ty + TB;
TX = TD + TG;
TY = TW + TX;
}
/* Real outputs 0, 9, 6, 12, 3. */
{
E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
T2a = KP559016994 * (Tg - Tr);
Ts = Tg + Tr;
T29 = FNMS(KP250000000, Ts, T5);
T2e = T2c - T2d;
T2h = T2f - T2g;
T2i = FNMS(KP587785252, T2h, KP951056516 * T2e);
T2k = FMA(KP951056516, T2h, KP587785252 * T2e);
ro[0] = T5 + Ts;
T2j = T2a + T29;
ro[WS(os, 9)] = T2j - T2k;
ro[WS(os, 6)] = T2j + T2k;
T2b = T29 - T2a;
ro[WS(os, 12)] = T2b - T2i;
ro[WS(os, 3)] = T2b + T2i;
}
/* Imaginary outputs 0, 6, 9, 3, 12. */
{
E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
T2q = KP559016994 * (T2m - T2n);
T2o = T2m + T2n;
T2p = FNMS(KP250000000, T2o, T2l);
T2s = Tl - Tq;
T2t = Ta - Tf;
T2u = FNMS(KP587785252, T2t, KP951056516 * T2s);
T2w = FMA(KP951056516, T2t, KP587785252 * T2s);
io[0] = T2l + T2o;
T2v = T2q + T2p;
io[WS(os, 6)] = T2v - T2w;
io[WS(os, 9)] = T2w + T2v;
T2r = T2p - T2q;
io[WS(os, 3)] = T2r - T2u;
io[WS(os, 12)] = T2u + T2r;
}
/* Real outputs 5, 14, 11, 2, 8. */
{
E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
T1M = KP559016994 * (TI - TT);
TU = TI + TT;
T1L = FNMS(KP250000000, TU, Tx);
T1Q = T1O - T1P;
T1T = T1R - T1S;
T1U = FNMS(KP587785252, T1T, KP951056516 * T1Q);
T1W = FMA(KP951056516, T1T, KP587785252 * T1Q);
ro[WS(os, 5)] = Tx + TU;
T1V = T1M + T1L;
ro[WS(os, 14)] = T1V - T1W;
ro[WS(os, 11)] = T1V + T1W;
T1N = T1L - T1M;
ro[WS(os, 2)] = T1N - T1U;
ro[WS(os, 8)] = T1N + T1U;
}
/* Imaginary outputs 5, 11, 14, 2, 8. */
{
E T25, T23, T24, T1Z, T28, T1X, T1Y, T27, T26;
T25 = KP559016994 * (T21 - T22);
T23 = T21 + T22;
T24 = FNMS(KP250000000, T23, T20);
T1X = TN - TS;
T1Y = TC - TH;
T1Z = FNMS(KP587785252, T1Y, KP951056516 * T1X);
T28 = FMA(KP951056516, T1Y, KP587785252 * T1X);
io[WS(os, 5)] = T20 + T23;
T27 = T25 + T24;
io[WS(os, 11)] = T27 - T28;
io[WS(os, 14)] = T28 + T27;
T26 = T24 - T25;
io[WS(os, 2)] = T1Z + T26;
io[WS(os, 8)] = T26 - T1Z;
}
/* Imaginary outputs 10, 7, 13, 1, 4. */
{
E T1x, T1D, T1E, T1I, T1J, T1G, T1H, T1K, T1F;
T1x = KP559016994 * (T1v - T1w);
T1D = T1v + T1w;
T1E = FNMS(KP250000000, T1D, T1C);
T1G = TW - TX;
T1H = TZ - T10;
T1I = FMA(KP951056516, T1G, KP587785252 * T1H);
T1J = FNMS(KP587785252, T1G, KP951056516 * T1H);
io[WS(os, 10)] = T1C + T1D;
T1K = T1E - T1x;
io[WS(os, 7)] = T1J + T1K;
io[WS(os, 13)] = T1K - T1J;
T1F = T1x + T1E;
io[WS(os, 1)] = T1F - T1I;
io[WS(os, 4)] = T1I + T1F;
}
/* Real outputs 10, 7, 13, 4, 1. */
{
E T13, T12, T14, T1s, T1u, T1g, T1r, T1t, T15;
T13 = KP559016994 * (TY - T11);
T12 = TY + T11;
T14 = FNMS(KP250000000, T12, TV);
T1g = T1a - T1f;
T1r = T1l - T1q;
T1s = FMA(KP951056516, T1g, KP587785252 * T1r);
T1u = FNMS(KP587785252, T1g, KP951056516 * T1r);
ro[WS(os, 10)] = TV + T12;
T1t = T14 - T13;
ro[WS(os, 7)] = T1t - T1u;
ro[WS(os, 13)] = T1t + T1u;
T15 = T13 + T14;
ro[WS(os, 4)] = T15 - T1s;
ro[WS(os, 1)] = T15 + T1s;
}
}
}
}
/*
 * Descriptor for the non-FMA variant of n1_15: the value 15, the name
 * "n1_15" and { 128, 28, 28, 0 }, which matches the "128 additions,
 * 28 multiplications, 28 fused multiply/add" count in the generator comment
 * of this variant.
 * NOTE(review): the kdft_desc declaration is not visible here; confirm the
 * meaning of &GENUS and the trailing zero fields against the headers.
 */
static const kdft_desc desc = { 15, "n1_15", { 128, 28, 28, 0 }, &GENUS, 0, 0, 0, 0 };
/* Hands the n1_15 kernel and its descriptor to X(kdft_register) for planner p. */
void X(codelet_n1_15) (planner *p) { X(kdft_register) (p, n1_15, &desc);
}
#endif
+560
View File
@@ -0,0 +1,560 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:25 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include dft/scalar/n.h */
/*
* This function contains 144 FP additions, 40 FP multiplications,
* (or, 104 additions, 0 multiplications, 40 fused multiply/add),
* 50 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_16 (FMA variant): straight-line size-16 kernel produced by genfft
 * (gen_notw -fma -n 16, see the generator comment above).
 *
 * Every loop iteration reads 16 real values from ri and 16 imaginary
 * values from ii at indices WS(is, 0..15), and writes 16 values to each of
 * ro and io at indices WS(os, 0..15).
 *
 *   ri, ii   - real / imaginary input arrays (read only)
 *   ro, io   - real / imaginary output arrays
 *   is, os   - input / output strides, applied through the WS() macro
 *   v        - number of loop iterations to run
 *   ivs, ovs - added to the input / output pointers after each iteration
 *
 * NOTE(review): FMA and FNMS are macros defined outside this file.
 * Comparing with the non-FMA variant of this kernel suggests
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b; confirm in the
 * project headers.
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numeric constants: cos(pi/8), sqrt(2) - 1 (= tan(pi/8)), sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/*
 * One pass per iteration; the four data pointers advance by ivs / ovs.
 * MAKE_VOLATILE_STRIDE is a project macro applied to both strides on every
 * pass; its definition is not visible here.
 */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
E T1U, T1A;
/* Inputs 0, 8, 4, 12: pairwise sums and differences. */
{
E T3, TL, Ty, T1k, T6, T1j, TB, TM;
{
E T1, T2, Tw, Tx;
T1 = ri[0];
T2 = ri[WS(is, 8)];
T3 = T1 + T2;
TL = T1 - T2;
Tw = ii[0];
Tx = ii[WS(is, 8)];
Ty = Tw + Tx;
T1k = Tw - Tx;
}
{
E T4, T5, Tz, TA;
T4 = ri[WS(is, 4)];
T5 = ri[WS(is, 12)];
T6 = T4 + T5;
T1j = T4 - T5;
Tz = ii[WS(is, 4)];
TA = ii[WS(is, 12)];
TB = Tz + TA;
TM = Tz - TA;
}
T7 = T3 + T6;
T1R = T3 - T6;
T25 = Ty - TB;
TC = Ty + TB;
TN = TL - TM;
T1x = TL + TM;
T1H = T1k - T1j;
T1l = T1j + T1k;
}
/* Inputs 15, 7, 3, 11: pairwise sums and differences. */
{
E Tp, T1c, T1a, T20, Ts, T17, T1f, T21;
{
E Tn, To, T18, T19;
Tn = ri[WS(is, 15)];
To = ri[WS(is, 7)];
Tp = Tn + To;
T1c = Tn - To;
T18 = ii[WS(is, 15)];
T19 = ii[WS(is, 7)];
T1a = T18 - T19;
T20 = T18 + T19;
}
{
E Tq, Tr, T1d, T1e;
Tq = ri[WS(is, 3)];
Tr = ri[WS(is, 11)];
Ts = Tq + Tr;
T17 = Tq - Tr;
T1d = ii[WS(is, 3)];
T1e = ii[WS(is, 11)];
T1f = T1d - T1e;
T21 = T1d + T1e;
}
Tt = Tp + Ts;
T22 = T20 - T21;
T2h = T20 + T21;
T1b = T17 + T1a;
T1g = T1c - T1f;
T1E = T1a - T17;
T1Z = Tp - Ts;
T1D = T1c + T1f;
}
/* Inputs 2, 10, 14, 6: pairwise sums and differences. */
{
E Ta, TP, TF, TO, Td, TR, TI, TS;
{
E T8, T9, TD, TE;
T8 = ri[WS(is, 2)];
T9 = ri[WS(is, 10)];
Ta = T8 + T9;
TP = T8 - T9;
TD = ii[WS(is, 2)];
TE = ii[WS(is, 10)];
TF = TD + TE;
TO = TD - TE;
}
{
E Tb, Tc, TG, TH;
Tb = ri[WS(is, 14)];
Tc = ri[WS(is, 6)];
Td = Tb + Tc;
TR = Tb - Tc;
TG = ii[WS(is, 14)];
TH = ii[WS(is, 6)];
TI = TG + TH;
TS = TG - TH;
}
Te = Ta + Td;
T1S = TF - TI;
T26 = Td - Ta;
TJ = TF + TI;
TQ = TO - TP;
T1m = TR - TS;
T1n = TP + TO;
TT = TR + TS;
}
/* Inputs 1, 9, 5, 13: pairwise sums and differences. */
{
E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
{
E Tg, Th, TX, TY;
Tg = ri[WS(is, 1)];
Th = ri[WS(is, 9)];
Ti = Tg + Th;
T11 = Tg - Th;
TX = ii[WS(is, 1)];
TY = ii[WS(is, 9)];
TZ = TX - TY;
T1V = TX + TY;
}
{
E Tj, Tk, T12, T13;
Tj = ri[WS(is, 5)];
Tk = ri[WS(is, 13)];
Tl = Tj + Tk;
TW = Tj - Tk;
T12 = ii[WS(is, 5)];
T13 = ii[WS(is, 13)];
T14 = T12 - T13;
T1W = T12 + T13;
}
Tm = Ti + Tl;
T1X = T1V - T1W;
T2g = T1V + T1W;
T10 = TW + TZ;
T15 = T11 - T14;
T1B = TZ - TW;
T1U = Ti - Tl;
T1A = T11 + T14;
}
/* Outputs 0 and 8 (real and imaginary). */
{
E Tf, Tu, T2j, T2k;
Tf = T7 + Te;
Tu = Tm + Tt;
ro[WS(os, 8)] = Tf - Tu;
ro[0] = Tf + Tu;
T2j = TC + TJ;
T2k = T2g + T2h;
io[WS(os, 8)] = T2j - T2k;
io[0] = T2j + T2k;
}
/* Outputs 4 and 12 (real and imaginary). */
{
E Tv, TK, T2f, T2i;
Tv = Tt - Tm;
TK = TC - TJ;
io[WS(os, 4)] = Tv + TK;
io[WS(os, 12)] = TK - Tv;
T2f = T7 - Te;
T2i = T2g - T2h;
ro[WS(os, 12)] = T2f - T2i;
ro[WS(os, 4)] = T2f + T2i;
}
/* Real outputs 10 and 2, imaginary outputs 6 and 14. */
{
E T1T, T27, T24, T28, T1Y, T23;
T1T = T1R + T1S;
T27 = T25 - T26;
T1Y = T1U + T1X;
T23 = T1Z - T22;
T24 = T1Y + T23;
T28 = T23 - T1Y;
ro[WS(os, 10)] = FNMS(KP707106781, T24, T1T);
io[WS(os, 6)] = FMA(KP707106781, T28, T27);
ro[WS(os, 2)] = FMA(KP707106781, T24, T1T);
io[WS(os, 14)] = FNMS(KP707106781, T28, T27);
}
/* Real outputs 14 and 6, imaginary outputs 2 and 10. */
{
E T29, T2d, T2c, T2e, T2a, T2b;
T29 = T1R - T1S;
T2d = T26 + T25;
T2a = T1X - T1U;
T2b = T1Z + T22;
T2c = T2a - T2b;
T2e = T2a + T2b;
ro[WS(os, 14)] = FNMS(KP707106781, T2c, T29);
io[WS(os, 2)] = FMA(KP707106781, T2e, T2d);
ro[WS(os, 6)] = FMA(KP707106781, T2c, T29);
io[WS(os, 10)] = FNMS(KP707106781, T2e, T2d);
}
/* Outputs 11, 3, 7, 15 (real and imaginary). */
{
E TV, T1v, T1p, T1r, T1i, T1q, T1u, T1w, TU, T1o;
TU = TQ - TT;
TV = FMA(KP707106781, TU, TN);
T1v = FNMS(KP707106781, TU, TN);
T1o = T1m - T1n;
T1p = FNMS(KP707106781, T1o, T1l);
T1r = FMA(KP707106781, T1o, T1l);
{
E T16, T1h, T1s, T1t;
T16 = FMA(KP414213562, T15, T10);
T1h = FNMS(KP414213562, T1g, T1b);
T1i = T16 - T1h;
T1q = T16 + T1h;
T1s = FMA(KP414213562, T1b, T1g);
T1t = FNMS(KP414213562, T10, T15);
T1u = T1s - T1t;
T1w = T1t + T1s;
}
ro[WS(os, 11)] = FNMS(KP923879532, T1i, TV);
io[WS(os, 11)] = FNMS(KP923879532, T1u, T1r);
ro[WS(os, 3)] = FMA(KP923879532, T1i, TV);
io[WS(os, 3)] = FMA(KP923879532, T1u, T1r);
io[WS(os, 7)] = FNMS(KP923879532, T1q, T1p);
ro[WS(os, 7)] = FNMS(KP923879532, T1w, T1v);
io[WS(os, 15)] = FMA(KP923879532, T1q, T1p);
ro[WS(os, 15)] = FMA(KP923879532, T1w, T1v);
}
/* Outputs 9, 1, 13, 5 (real and imaginary). */
{
E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
T1y = T1n + T1m;
T1z = FMA(KP707106781, T1y, T1x);
T1L = FNMS(KP707106781, T1y, T1x);
T1I = TQ + TT;
T1J = FNMS(KP707106781, T1I, T1H);
T1P = FMA(KP707106781, T1I, T1H);
{
E T1C, T1F, T1M, T1N;
T1C = FMA(KP414213562, T1B, T1A);
T1F = FNMS(KP414213562, T1E, T1D);
T1G = T1C + T1F;
T1K = T1F - T1C;
T1M = FNMS(KP414213562, T1A, T1B);
T1N = FMA(KP414213562, T1D, T1E);
T1O = T1M - T1N;
T1Q = T1M + T1N;
}
ro[WS(os, 9)] = FNMS(KP923879532, T1G, T1z);
io[WS(os, 9)] = FNMS(KP923879532, T1Q, T1P);
ro[WS(os, 1)] = FMA(KP923879532, T1G, T1z);
io[WS(os, 1)] = FMA(KP923879532, T1Q, T1P);
io[WS(os, 13)] = FNMS(KP923879532, T1K, T1J);
ro[WS(os, 13)] = FNMS(KP923879532, T1O, T1L);
io[WS(os, 5)] = FMA(KP923879532, T1K, T1J);
ro[WS(os, 5)] = FMA(KP923879532, T1O, T1L);
}
}
}
}
/*
 * Descriptor for the FMA variant of n1_16: the value 16, the name "n1_16"
 * and { 104, 0, 40, 0 }, which matches the "104 additions,
 * 0 multiplications, 40 fused multiply/add" count in the generator comment
 * of this variant.
 * NOTE(review): the kdft_desc declaration is not visible here; confirm the
 * meaning of &GENUS and the trailing zero fields against the headers.
 */
static const kdft_desc desc = { 16, "n1_16", { 104, 0, 40, 0 }, &GENUS, 0, 0, 0, 0 };
/* Hands the n1_16 kernel and its descriptor to X(kdft_register) for planner p. */
void X(codelet_n1_16) (planner *p) { X(kdft_register) (p, n1_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include dft/scalar/n.h */
/*
* This function contains 144 FP additions, 24 FP multiplications,
* (or, 136 additions, 16 multiplications, 8 fused multiply/add),
* 50 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_16 (non-FMA variant): straight-line size-16 kernel produced by genfft
 * (gen_notw -n 16, see the generator comment above).  It is compiled when
 * neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Every loop iteration reads 16 real values from ri and 16 imaginary
 * values from ii at indices WS(is, 0..15), and writes 16 values to each of
 * ro and io at indices WS(os, 0..15).
 *
 *   ri, ii   - real / imaginary input arrays (read only)
 *   ro, io   - real / imaginary output arrays
 *   is, os   - input / output strides, applied through the WS() macro
 *   v        - number of loop iterations to run
 *   ivs, ovs - added to the input / output pointers after each iteration
 *
 * NOTE(review): FMA and FNMS are macros defined outside this file,
 * presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b; confirm
 * in the project headers.
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numeric constants: sin(pi/8), cos(pi/8), sqrt(2)/2. */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/*
 * One pass per iteration; the four data pointers advance by ivs / ovs.
 * MAKE_VOLATILE_STRIDE is a project macro applied to both strides on every
 * pass; its definition is not visible here.
 */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
E T1U, T1A;
/* Inputs 0, 8, 4, 12: pairwise sums and differences. */
{
E T3, TL, Ty, T1k, T6, T1j, TB, TM;
{
E T1, T2, Tw, Tx;
T1 = ri[0];
T2 = ri[WS(is, 8)];
T3 = T1 + T2;
TL = T1 - T2;
Tw = ii[0];
Tx = ii[WS(is, 8)];
Ty = Tw + Tx;
T1k = Tw - Tx;
}
{
E T4, T5, Tz, TA;
T4 = ri[WS(is, 4)];
T5 = ri[WS(is, 12)];
T6 = T4 + T5;
T1j = T4 - T5;
Tz = ii[WS(is, 4)];
TA = ii[WS(is, 12)];
TB = Tz + TA;
TM = Tz - TA;
}
T7 = T3 + T6;
T1R = T3 - T6;
T25 = Ty - TB;
TC = Ty + TB;
TN = TL - TM;
T1x = TL + TM;
T1H = T1k - T1j;
T1l = T1j + T1k;
}
/* Inputs 15, 7, 3, 11: pairwise sums and differences. */
{
E Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
{
E Tn, To, T1d, T1e;
Tn = ri[WS(is, 15)];
To = ri[WS(is, 7)];
Tp = Tn + To;
T17 = Tn - To;
T1d = ii[WS(is, 15)];
T1e = ii[WS(is, 7)];
T1f = T1d - T1e;
T20 = T1d + T1e;
}
{
E Tq, Tr, T18, T19;
Tq = ri[WS(is, 3)];
Tr = ri[WS(is, 11)];
Ts = Tq + Tr;
T1c = Tq - Tr;
T18 = ii[WS(is, 3)];
T19 = ii[WS(is, 11)];
T1a = T18 - T19;
T21 = T18 + T19;
}
Tt = Tp + Ts;
T22 = T20 - T21;
T2h = T20 + T21;
T1b = T17 - T1a;
T1g = T1c + T1f;
T1E = T1f - T1c;
T1Z = Tp - Ts;
T1D = T17 + T1a;
}
/* Inputs 2, 10, 14, 6: pairwise sums and differences. */
{
E Ta, TP, TF, TO, Td, TR, TI, TS;
{
E T8, T9, TD, TE;
T8 = ri[WS(is, 2)];
T9 = ri[WS(is, 10)];
Ta = T8 + T9;
TP = T8 - T9;
TD = ii[WS(is, 2)];
TE = ii[WS(is, 10)];
TF = TD + TE;
TO = TD - TE;
}
{
E Tb, Tc, TG, TH;
Tb = ri[WS(is, 14)];
Tc = ri[WS(is, 6)];
Td = Tb + Tc;
TR = Tb - Tc;
TG = ii[WS(is, 14)];
TH = ii[WS(is, 6)];
TI = TG + TH;
TS = TG - TH;
}
Te = Ta + Td;
T1S = TF - TI;
T26 = Td - Ta;
TJ = TF + TI;
TQ = TO - TP;
T1m = TR - TS;
T1n = TP + TO;
TT = TR + TS;
}
/* Inputs 1, 9, 5, 13: pairwise sums and differences. */
{
E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
{
E Tg, Th, TX, TY;
Tg = ri[WS(is, 1)];
Th = ri[WS(is, 9)];
Ti = Tg + Th;
T11 = Tg - Th;
TX = ii[WS(is, 1)];
TY = ii[WS(is, 9)];
TZ = TX - TY;
T1V = TX + TY;
}
{
E Tj, Tk, T12, T13;
Tj = ri[WS(is, 5)];
Tk = ri[WS(is, 13)];
Tl = Tj + Tk;
TW = Tj - Tk;
T12 = ii[WS(is, 5)];
T13 = ii[WS(is, 13)];
T14 = T12 - T13;
T1W = T12 + T13;
}
Tm = Ti + Tl;
T1X = T1V - T1W;
T2g = T1V + T1W;
T10 = TW + TZ;
T15 = T11 - T14;
T1B = T11 + T14;
T1U = Ti - Tl;
T1A = TZ - TW;
}
/* Outputs 0 and 8 (real and imaginary). */
{
E Tf, Tu, T2j, T2k;
Tf = T7 + Te;
Tu = Tm + Tt;
ro[WS(os, 8)] = Tf - Tu;
ro[0] = Tf + Tu;
T2j = TC + TJ;
T2k = T2g + T2h;
io[WS(os, 8)] = T2j - T2k;
io[0] = T2j + T2k;
}
/* Outputs 4 and 12 (real and imaginary). */
{
E Tv, TK, T2f, T2i;
Tv = Tt - Tm;
TK = TC - TJ;
io[WS(os, 4)] = Tv + TK;
io[WS(os, 12)] = TK - Tv;
T2f = T7 - Te;
T2i = T2g - T2h;
ro[WS(os, 12)] = T2f - T2i;
ro[WS(os, 4)] = T2f + T2i;
}
/* Real outputs 10 and 2, imaginary outputs 6 and 14. */
{
E T1T, T27, T24, T28, T1Y, T23;
T1T = T1R + T1S;
T27 = T25 - T26;
T1Y = T1U + T1X;
T23 = T1Z - T22;
T24 = KP707106781 * (T1Y + T23);
T28 = KP707106781 * (T23 - T1Y);
ro[WS(os, 10)] = T1T - T24;
io[WS(os, 6)] = T27 + T28;
ro[WS(os, 2)] = T1T + T24;
io[WS(os, 14)] = T27 - T28;
}
/* Real outputs 14 and 6, imaginary outputs 2 and 10. */
{
E T29, T2d, T2c, T2e, T2a, T2b;
T29 = T1R - T1S;
T2d = T26 + T25;
T2a = T1X - T1U;
T2b = T1Z + T22;
T2c = KP707106781 * (T2a - T2b);
T2e = KP707106781 * (T2a + T2b);
ro[WS(os, 14)] = T29 - T2c;
io[WS(os, 2)] = T2d + T2e;
ro[WS(os, 6)] = T29 + T2c;
io[WS(os, 10)] = T2d - T2e;
}
/* Outputs 11, 3, 15, 7 (real and imaginary). */
{
E TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
TU = KP707106781 * (TQ - TT);
TV = TN + TU;
T1r = TN - TU;
T1o = KP707106781 * (T1m - T1n);
T1p = T1l - T1o;
T1v = T1l + T1o;
{
E T16, T1h, T1s, T1t;
T16 = FMA(KP923879532, T10, KP382683432 * T15);
T1h = FNMS(KP923879532, T1g, KP382683432 * T1b);
T1i = T16 + T1h;
T1q = T1h - T16;
T1s = FNMS(KP923879532, T15, KP382683432 * T10);
T1t = FMA(KP382683432, T1g, KP923879532 * T1b);
T1u = T1s - T1t;
T1w = T1s + T1t;
}
ro[WS(os, 11)] = TV - T1i;
io[WS(os, 11)] = T1v - T1w;
ro[WS(os, 3)] = TV + T1i;
io[WS(os, 3)] = T1v + T1w;
io[WS(os, 15)] = T1p - T1q;
ro[WS(os, 15)] = T1r - T1u;
io[WS(os, 7)] = T1p + T1q;
ro[WS(os, 7)] = T1r + T1u;
}
/* Outputs 9, 1, 13, 5 (real and imaginary). */
{
E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
T1y = KP707106781 * (T1n + T1m);
T1z = T1x + T1y;
T1L = T1x - T1y;
T1I = KP707106781 * (TQ + TT);
T1J = T1H - T1I;
T1P = T1H + T1I;
{
E T1C, T1F, T1M, T1N;
T1C = FMA(KP382683432, T1A, KP923879532 * T1B);
T1F = FNMS(KP382683432, T1E, KP923879532 * T1D);
T1G = T1C + T1F;
T1K = T1F - T1C;
T1M = FNMS(KP382683432, T1B, KP923879532 * T1A);
T1N = FMA(KP923879532, T1E, KP382683432 * T1D);
T1O = T1M - T1N;
T1Q = T1M + T1N;
}
ro[WS(os, 9)] = T1z - T1G;
io[WS(os, 9)] = T1P - T1Q;
ro[WS(os, 1)] = T1z + T1G;
io[WS(os, 1)] = T1P + T1Q;
io[WS(os, 13)] = T1J - T1K;
ro[WS(os, 13)] = T1L - T1O;
io[WS(os, 5)] = T1J + T1K;
ro[WS(os, 5)] = T1L + T1O;
}
}
}
}
/*
 * Descriptor for the non-FMA variant of n1_16: the value 16, the name
 * "n1_16" and { 136, 16, 8, 0 }, which matches the "136 additions,
 * 16 multiplications, 8 fused multiply/add" count in the generator comment
 * of this variant.
 * NOTE(review): the kdft_desc declaration is not visible here; confirm the
 * meaning of &GENUS and the trailing zero fields against the headers.
 */
static const kdft_desc desc = { 16, "n1_16", { 136, 16, 8, 0 }, &GENUS, 0, 0, 0, 0 };
/* Hands the n1_16 kernel and its descriptor to X(kdft_register) for planner p. */
void X(codelet_n1_16) (planner *p) { X(kdft_register) (p, n1_16, &desc);
}
#endif
+94
View File
@@ -0,0 +1,94 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include dft/scalar/n.h */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 5 stack variables, 0 constants, and 8 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_2 (FMA variant): size-2 kernel.  On every pass it writes the sum of
 * the two inputs to output index 0 and their difference (first minus
 * second) to output index WS(os, 1), for the real arrays first and then
 * for the imaginary arrays.
 *
 *   ri, ii   - real / imaginary input arrays (read only)
 *   ro, io   - real / imaginary output arrays
 *   is, os   - input / output strides, applied through the WS() macro
 *   v        - number of passes to run
 *   ivs, ovs - added to the input / output pointers after each pass
 */
static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT left;

     /* The step clause keeps the original left-to-right order, including
        the MAKE_VOLATILE_STRIDE calls on both strides. */
     for (left = v; left > 0; --left, ri += ivs, ii += ivs, ro += ovs, io += ovs, MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
          /* Real part: both loads happen before either store. */
          {
               const E re0 = ri[0];
               const E re1 = ri[WS(is, 1)];

               ro[WS(os, 1)] = re0 - re1;
               ro[0] = re0 + re1;
          }
          /* Imaginary part, handled the same way after the real stores. */
          {
               const E im0 = ii[0];
               const E im1 = ii[WS(is, 1)];

               io[WS(os, 1)] = im0 - im1;
               io[0] = im0 + im1;
          }
     }
}
/*
 * Descriptor for the FMA variant of n1_2: the value 2, the name "n1_2"
 * and { 4, 0, 0, 0 }, which matches the "4 additions, 0 multiplications,
 * 0 fused multiply/add" count in the generator comment of this variant.
 * NOTE(review): the kdft_desc declaration is not visible here; confirm the
 * meaning of &GENUS and the trailing zero fields against the headers.
 */
static const kdft_desc desc = { 2, "n1_2", { 4, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
/* Hands the n1_2 kernel and its descriptor to X(kdft_register) for planner p. */
void X(codelet_n1_2) (planner *p) { X(kdft_register) (p, n1_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include dft/scalar/n.h */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 5 stack variables, 0 constants, and 8 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_2 (non-FMA variant): size-2 kernel, compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.  On every pass
 * it writes the sum of the two inputs to output index 0 and their
 * difference (first minus second) to output index WS(os, 1), for the real
 * arrays first and then for the imaginary arrays.
 *
 *   ri, ii   - real / imaginary input arrays (read only)
 *   ro, io   - real / imaginary output arrays
 *   is, os   - input / output strides, applied through the WS() macro
 *   v        - number of passes to run
 *   ivs, ovs - added to the input / output pointers after each pass
 */
static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT left;

     /* The step clause keeps the original left-to-right order, including
        the MAKE_VOLATILE_STRIDE calls on both strides. */
     for (left = v; left > 0; --left, ri += ivs, ii += ivs, ro += ovs, io += ovs, MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
          /* Real part: both loads happen before either store. */
          {
               const E re0 = ri[0];
               const E re1 = ri[WS(is, 1)];

               ro[WS(os, 1)] = re0 - re1;
               ro[0] = re0 + re1;
          }
          /* Imaginary part, handled the same way after the real stores. */
          {
               const E im0 = ii[0];
               const E im1 = ii[WS(is, 1)];

               io[WS(os, 1)] = im0 - im1;
               io[0] = im0 + im1;
          }
     }
}
/*
 * Descriptor for the non-FMA variant of n1_2: the value 2, the name "n1_2"
 * and { 4, 0, 0, 0 }, which matches the "4 additions, 0 multiplications,
 * 0 fused multiply/add" count in the generator comment of this variant.
 * NOTE(review): the kdft_desc declaration is not visible here; confirm the
 * meaning of &GENUS and the trailing zero fields against the headers.
 */
static const kdft_desc desc = { 2, "n1_2", { 4, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
/* Hands the n1_2 kernel and its descriptor to X(kdft_register) for planner p. */
void X(codelet_n1_2) (planner *p) { X(kdft_register) (p, n1_2, &desc);
}
#endif
+718
View File
@@ -0,0 +1,718 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include dft/scalar/n.h */
/*
* This function contains 208 FP additions, 72 FP multiplications,
* (or, 136 additions, 0 multiplications, 72 fused multiply/add),
* 81 stack variables, 4 constants, and 80 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_20 (variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): per the generator command line in the comment preceding this
 * function, a size-20 codelet produced by gen_notw.
 *
 * Each pass of the loop reads the 20 real inputs ri[WS(is, k)] and the 20
 * imaginary inputs ii[WS(is, k)], k = 0..19, and stores 20 real outputs
 * ro[WS(os, k)] and 20 imaginary outputs io[WS(os, k)].  The loop runs v
 * times; after each pass ri/ii advance by ivs and ro/io advance by ovs.
 *
 * R, E, stride, INT, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file section.
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are assumed to evaluate
 * a*b + c and c - a*b respectively -- confirm against the macro definitions.
 */
static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Constants; each name spells out the leading digits of its value. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT i;
/* One pass per transform: i counts down from v while the pointers advance. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
/* Temporaries that stay live from the input stages to the output stages. */
E T7, T2N, T3b, TD, TP, T1R, T2f, T1d, Tt, TA, TB, T2w, T2z, T2P, T35;
E T36, T3d, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1T, T29, T2a, T2h, T1h;
E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2O, T32, T33, T3c, TE, TF, TG, TU;
E TZ, T10, T1D, T1I, T1S, T26, T27, T2g, T1e, T1f, T1g;
/* Input stage 1: inputs 0, 10, 5 and 15 -- pairwise sums and differences. */
{
E T3, T1N, TN, T2L, T6, TO, T1Q, T2M;
{
E T1, T2, TL, TM;
T1 = ri[0];
T2 = ri[WS(is, 10)];
T3 = T1 + T2;
T1N = T1 - T2;
TL = ii[0];
TM = ii[WS(is, 10)];
TN = TL - TM;
T2L = TL + TM;
}
{
E T4, T5, T1O, T1P;
T4 = ri[WS(is, 5)];
T5 = ri[WS(is, 15)];
T6 = T4 + T5;
TO = T4 - T5;
T1O = ii[WS(is, 5)];
T1P = ii[WS(is, 15)];
T1Q = T1O - T1P;
T2M = T1O + T1P;
}
T7 = T3 - T6;
T2N = T2L - T2M;
T3b = T2L + T2M;
TD = T3 + T6;
TP = TN - TO;
T1R = T1N - T1Q;
T2f = T1N + T1Q;
T1d = TO + TN;
}
/* Input stage 2: inputs 8, 18, 13, 3, 12, 2, 17 and 7. */
{
E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
E T2y;
{
E Tn, To, T11, T12;
Tn = ri[WS(is, 8)];
To = ri[WS(is, 18)];
Tp = Tn + To;
T1o = Tn - To;
T11 = ii[WS(is, 8)];
T12 = ii[WS(is, 18)];
T13 = T11 - T12;
T2u = T11 + T12;
}
{
E Tq, Tr, T1p, T1q;
Tq = ri[WS(is, 13)];
Tr = ri[WS(is, 3)];
Ts = Tq + Tr;
T14 = Tq - Tr;
T1p = ii[WS(is, 13)];
T1q = ii[WS(is, 3)];
T1r = T1p - T1q;
T2v = T1p + T1q;
}
{
E Tu, Tv, T16, T17;
Tu = ri[WS(is, 12)];
Tv = ri[WS(is, 2)];
Tw = Tu + Tv;
T1t = Tu - Tv;
T16 = ii[WS(is, 12)];
T17 = ii[WS(is, 2)];
T18 = T16 - T17;
T2x = T16 + T17;
}
{
E Tx, Ty, T1u, T1v;
Tx = ri[WS(is, 17)];
Ty = ri[WS(is, 7)];
Tz = Tx + Ty;
T19 = Tx - Ty;
T1u = ii[WS(is, 17)];
T1v = ii[WS(is, 7)];
T1w = T1u - T1v;
T2y = T1u + T1v;
}
/* Combine the four pairs of this stage into the values the output stages use. */
Tt = Tp - Ts;
TA = Tw - Tz;
TB = Tt + TA;
T2w = T2u - T2v;
T2z = T2x - T2y;
T2P = T2w + T2z;
T35 = T2u + T2v;
T36 = T2x + T2y;
T3d = T35 + T36;
TH = Tp + Ts;
TI = Tw + Tz;
TJ = TH + TI;
T15 = T13 - T14;
T1a = T18 - T19;
T1b = T15 + T1a;
T1s = T1o - T1r;
T1x = T1t - T1w;
T1T = T1s + T1x;
T29 = T1o + T1r;
T2a = T1t + T1w;
T2h = T29 + T2a;
T1h = T14 + T13;
T1i = T19 + T18;
T1j = T1h + T1i;
}
/* Input stage 3: inputs 4, 14, 9, 19, 16, 6, 1 and 11. */
{
E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
E T2F;
{
E T8, T9, TQ, TR;
T8 = ri[WS(is, 4)];
T9 = ri[WS(is, 14)];
Ta = T8 + T9;
T1z = T8 - T9;
TQ = ii[WS(is, 4)];
TR = ii[WS(is, 14)];
TS = TQ - TR;
T2B = TQ + TR;
}
{
E Tb, Tc, T1A, T1B;
Tb = ri[WS(is, 9)];
Tc = ri[WS(is, 19)];
Td = Tb + Tc;
TT = Tb - Tc;
T1A = ii[WS(is, 9)];
T1B = ii[WS(is, 19)];
T1C = T1A - T1B;
T2C = T1A + T1B;
}
{
E Tf, Tg, TV, TW;
Tf = ri[WS(is, 16)];
Tg = ri[WS(is, 6)];
Th = Tf + Tg;
T1E = Tf - Tg;
TV = ii[WS(is, 16)];
TW = ii[WS(is, 6)];
TX = TV - TW;
T2E = TV + TW;
}
{
E Ti, Tj, T1F, T1G;
Ti = ri[WS(is, 1)];
Tj = ri[WS(is, 11)];
Tk = Ti + Tj;
TY = Ti - Tj;
T1F = ii[WS(is, 1)];
T1G = ii[WS(is, 11)];
T1H = T1F - T1G;
T2F = T1F + T1G;
}
/* Combine the four pairs of this stage into the values the output stages use. */
Te = Ta - Td;
Tl = Th - Tk;
Tm = Te + Tl;
T2D = T2B - T2C;
T2G = T2E - T2F;
T2O = T2D + T2G;
T32 = T2B + T2C;
T33 = T2E + T2F;
T3c = T32 + T33;
TE = Ta + Td;
TF = Th + Tk;
TG = TE + TF;
TU = TS - TT;
TZ = TX - TY;
T10 = TU + TZ;
T1D = T1z - T1C;
T1I = T1E - T1H;
T1S = T1D + T1I;
T26 = T1z + T1C;
T27 = T1E + T1H;
T2g = T26 + T27;
T1e = TT + TS;
T1f = TY + TX;
T1g = T1e + T1f;
}
/* Output stage: real outputs 10, 14, 6, 2 and 18. */
{
E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
T2s = Tm - TB;
TC = Tm + TB;
T2r = FNMS(KP250000000, TC, T7);
T2A = T2w - T2z;
T2H = T2D - T2G;
T2I = FNMS(KP618033988, T2H, T2A);
T2K = FMA(KP618033988, T2A, T2H);
ro[WS(os, 10)] = T7 + TC;
T2J = FMA(KP559016994, T2s, T2r);
ro[WS(os, 14)] = FNMS(KP951056516, T2K, T2J);
ro[WS(os, 6)] = FMA(KP951056516, T2K, T2J);
T2t = FNMS(KP559016994, T2s, T2r);
ro[WS(os, 2)] = FNMS(KP951056516, T2I, T2t);
ro[WS(os, 18)] = FMA(KP951056516, T2I, T2t);
}
/* Output stage: imaginary outputs 10, 6, 14, 2 and 18. */
{
E T2S, T2Q, T2R, T2W, T2Y, T2U, T2V, T2X, T2T;
T2S = T2O - T2P;
T2Q = T2O + T2P;
T2R = FNMS(KP250000000, T2Q, T2N);
T2U = Tt - TA;
T2V = Te - Tl;
T2W = FNMS(KP618033988, T2V, T2U);
T2Y = FMA(KP618033988, T2U, T2V);
io[WS(os, 10)] = T2N + T2Q;
T2X = FMA(KP559016994, T2S, T2R);
io[WS(os, 6)] = FNMS(KP951056516, T2Y, T2X);
io[WS(os, 14)] = FMA(KP951056516, T2Y, T2X);
T2T = FNMS(KP559016994, T2S, T2R);
io[WS(os, 2)] = FMA(KP951056516, T2W, T2T);
io[WS(os, 18)] = FNMS(KP951056516, T2W, T2T);
}
/* Output stage: real outputs 0, 12, 8, 4 and 16. */
{
E T30, TK, T2Z, T38, T3a, T34, T37, T39, T31;
T30 = TG - TJ;
TK = TG + TJ;
T2Z = FNMS(KP250000000, TK, TD);
T34 = T32 - T33;
T37 = T35 - T36;
T38 = FMA(KP618033988, T37, T34);
T3a = FNMS(KP618033988, T34, T37);
ro[0] = TD + TK;
T39 = FNMS(KP559016994, T30, T2Z);
ro[WS(os, 12)] = FNMS(KP951056516, T3a, T39);
ro[WS(os, 8)] = FMA(KP951056516, T3a, T39);
T31 = FMA(KP559016994, T30, T2Z);
ro[WS(os, 4)] = FNMS(KP951056516, T38, T31);
ro[WS(os, 16)] = FMA(KP951056516, T38, T31);
}
/* Output stage: imaginary outputs 0, 8, 12, 4 and 16. */
{
E T3g, T3e, T3f, T3k, T3m, T3i, T3j, T3l, T3h;
T3g = T3c - T3d;
T3e = T3c + T3d;
T3f = FNMS(KP250000000, T3e, T3b);
T3i = TE - TF;
T3j = TH - TI;
T3k = FMA(KP618033988, T3j, T3i);
T3m = FNMS(KP618033988, T3i, T3j);
io[0] = T3b + T3e;
T3l = FNMS(KP559016994, T3g, T3f);
io[WS(os, 8)] = FNMS(KP951056516, T3m, T3l);
io[WS(os, 12)] = FMA(KP951056516, T3m, T3l);
T3h = FMA(KP559016994, T3g, T3f);
io[WS(os, 4)] = FMA(KP951056516, T3k, T3h);
io[WS(os, 16)] = FNMS(KP951056516, T3k, T3h);
}
/* Output stage: imaginary outputs 5, 13, 17, 1 and 9. */
{
E T24, T1c, T23, T2c, T2e, T28, T2b, T2d, T25;
T24 = T10 - T1b;
T1c = T10 + T1b;
T23 = FNMS(KP250000000, T1c, TP);
T28 = T26 - T27;
T2b = T29 - T2a;
T2c = FMA(KP618033988, T2b, T28);
T2e = FNMS(KP618033988, T28, T2b);
io[WS(os, 5)] = TP + T1c;
T2d = FNMS(KP559016994, T24, T23);
io[WS(os, 13)] = FNMS(KP951056516, T2e, T2d);
io[WS(os, 17)] = FMA(KP951056516, T2e, T2d);
T25 = FMA(KP559016994, T24, T23);
io[WS(os, 1)] = FNMS(KP951056516, T2c, T25);
io[WS(os, 9)] = FMA(KP951056516, T2c, T25);
}
/* Output stage: real outputs 5, 13, 17, 1 and 9. */
{
E T2k, T2i, T2j, T2o, T2q, T2m, T2n, T2p, T2l;
T2k = T2g - T2h;
T2i = T2g + T2h;
T2j = FNMS(KP250000000, T2i, T2f);
T2m = TU - TZ;
T2n = T15 - T1a;
T2o = FMA(KP618033988, T2n, T2m);
T2q = FNMS(KP618033988, T2m, T2n);
ro[WS(os, 5)] = T2f + T2i;
T2p = FNMS(KP559016994, T2k, T2j);
ro[WS(os, 13)] = FMA(KP951056516, T2q, T2p);
ro[WS(os, 17)] = FNMS(KP951056516, T2q, T2p);
T2l = FMA(KP559016994, T2k, T2j);
ro[WS(os, 1)] = FMA(KP951056516, T2o, T2l);
ro[WS(os, 9)] = FNMS(KP951056516, T2o, T2l);
}
/* Output stage: imaginary outputs 15, 11, 19, 3 and 7. */
{
E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
T1m = T1g - T1j;
T1k = T1g + T1j;
T1l = FNMS(KP250000000, T1k, T1d);
T1y = T1s - T1x;
T1J = T1D - T1I;
T1K = FNMS(KP618033988, T1J, T1y);
T1M = FMA(KP618033988, T1y, T1J);
io[WS(os, 15)] = T1d + T1k;
T1L = FMA(KP559016994, T1m, T1l);
io[WS(os, 11)] = FNMS(KP951056516, T1M, T1L);
io[WS(os, 19)] = FMA(KP951056516, T1M, T1L);
T1n = FNMS(KP559016994, T1m, T1l);
io[WS(os, 3)] = FNMS(KP951056516, T1K, T1n);
io[WS(os, 7)] = FMA(KP951056516, T1K, T1n);
}
/* Output stage: real outputs 15, 11, 19, 3 and 7. */
{
E T1W, T1U, T1V, T20, T22, T1Y, T1Z, T21, T1X;
T1W = T1S - T1T;
T1U = T1S + T1T;
T1V = FNMS(KP250000000, T1U, T1R);
T1Y = T1h - T1i;
T1Z = T1e - T1f;
T20 = FNMS(KP618033988, T1Z, T1Y);
T22 = FMA(KP618033988, T1Y, T1Z);
ro[WS(os, 15)] = T1R + T1U;
T21 = FMA(KP559016994, T1W, T1V);
ro[WS(os, 11)] = FMA(KP951056516, T22, T21);
ro[WS(os, 19)] = FNMS(KP951056516, T22, T21);
T1X = FNMS(KP559016994, T1W, T1V);
ro[WS(os, 3)] = FMA(KP951056516, T20, T1X);
ro[WS(os, 7)] = FNMS(KP951056516, T20, T1X);
}
}
}
}
/*
 * Descriptor passed to the planner together with the FMA variant of n1_20.
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     20,                   /* same number as in the codelet name */
     "n1_20",              /* codelet name */
     { 136, 0, 72, 0 },    /* first three match the additions / multiplications /
                              fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_20 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_20)(planner *p)
{
     X(kdft_register)(p, n1_20, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include dft/scalar/n.h */
/*
* This function contains 208 FP additions, 48 FP multiplications,
* (or, 184 additions, 24 multiplications, 24 fused multiply/add),
* 81 stack variables, 4 constants, and 80 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_20 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): per the generator command line in
 * the comment preceding this function, a size-20 codelet produced by
 * gen_notw.
 *
 * Each pass of the loop reads the 20 real inputs ri[WS(is, k)] and the 20
 * imaginary inputs ii[WS(is, k)], k = 0..19, and stores 20 real outputs
 * ro[WS(os, k)] and 20 imaginary outputs io[WS(os, k)].  The loop runs v
 * times; after each pass ri/ii advance by ivs and ro/io advance by ovs.
 *
 * R, E, stride, INT, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file section.
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are assumed to evaluate
 * a*b + c and c - a*b respectively -- confirm against the macro definitions.
 */
static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Constants; each name spells out the leading digits of its value. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT i;
/* One pass per transform: i counts down from v while the pointers advance. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
/* Temporaries that stay live from the input stages to the output stages. */
E T7, T2Q, T3h, TD, TP, T1U, T2l, T1d, Tt, TA, TB, T2w, T2z, T2S, T35;
E T36, T3f, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1W, T29, T2a, T2j, T1h;
E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2R, T32, T33, T3e, TE, TF, TG, TU;
E TZ, T10, T1D, T1I, T1V, T26, T27, T2i, T1e, T1f, T1g;
/* Input stage 1: inputs 0, 10, 5 and 15 -- pairwise sums and differences. */
{
E T3, T1Q, TN, T2O, T6, TO, T1T, T2P;
{
E T1, T2, TL, TM;
T1 = ri[0];
T2 = ri[WS(is, 10)];
T3 = T1 + T2;
T1Q = T1 - T2;
TL = ii[0];
TM = ii[WS(is, 10)];
TN = TL - TM;
T2O = TL + TM;
}
{
E T4, T5, T1R, T1S;
T4 = ri[WS(is, 5)];
T5 = ri[WS(is, 15)];
T6 = T4 + T5;
TO = T4 - T5;
T1R = ii[WS(is, 5)];
T1S = ii[WS(is, 15)];
T1T = T1R - T1S;
T2P = T1R + T1S;
}
T7 = T3 - T6;
T2Q = T2O - T2P;
T3h = T2O + T2P;
TD = T3 + T6;
TP = TN - TO;
T1U = T1Q - T1T;
T2l = T1Q + T1T;
T1d = TO + TN;
}
/* Input stage 2: inputs 8, 18, 13, 3, 12, 2, 17 and 7. */
{
E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
E T2y;
{
E Tn, To, T11, T12;
Tn = ri[WS(is, 8)];
To = ri[WS(is, 18)];
Tp = Tn + To;
T1o = Tn - To;
T11 = ii[WS(is, 8)];
T12 = ii[WS(is, 18)];
T13 = T11 - T12;
T2u = T11 + T12;
}
{
E Tq, Tr, T1p, T1q;
Tq = ri[WS(is, 13)];
Tr = ri[WS(is, 3)];
Ts = Tq + Tr;
T14 = Tq - Tr;
T1p = ii[WS(is, 13)];
T1q = ii[WS(is, 3)];
T1r = T1p - T1q;
T2v = T1p + T1q;
}
{
E Tu, Tv, T16, T17;
Tu = ri[WS(is, 12)];
Tv = ri[WS(is, 2)];
Tw = Tu + Tv;
T1t = Tu - Tv;
T16 = ii[WS(is, 12)];
T17 = ii[WS(is, 2)];
T18 = T16 - T17;
T2x = T16 + T17;
}
{
E Tx, Ty, T1u, T1v;
Tx = ri[WS(is, 17)];
Ty = ri[WS(is, 7)];
Tz = Tx + Ty;
T19 = Tx - Ty;
T1u = ii[WS(is, 17)];
T1v = ii[WS(is, 7)];
T1w = T1u - T1v;
T2y = T1u + T1v;
}
/* Combine the four pairs of this stage into the values the output stages use. */
Tt = Tp - Ts;
TA = Tw - Tz;
TB = Tt + TA;
T2w = T2u - T2v;
T2z = T2x - T2y;
T2S = T2w + T2z;
T35 = T2u + T2v;
T36 = T2x + T2y;
T3f = T35 + T36;
TH = Tp + Ts;
TI = Tw + Tz;
TJ = TH + TI;
T15 = T13 - T14;
T1a = T18 - T19;
T1b = T15 + T1a;
T1s = T1o - T1r;
T1x = T1t - T1w;
T1W = T1s + T1x;
T29 = T1o + T1r;
T2a = T1t + T1w;
T2j = T29 + T2a;
T1h = T14 + T13;
T1i = T19 + T18;
T1j = T1h + T1i;
}
/* Input stage 3: inputs 4, 14, 9, 19, 16, 6, 1 and 11. */
{
E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
E T2F;
{
E T8, T9, TQ, TR;
T8 = ri[WS(is, 4)];
T9 = ri[WS(is, 14)];
Ta = T8 + T9;
T1z = T8 - T9;
TQ = ii[WS(is, 4)];
TR = ii[WS(is, 14)];
TS = TQ - TR;
T2B = TQ + TR;
}
{
E Tb, Tc, T1A, T1B;
Tb = ri[WS(is, 9)];
Tc = ri[WS(is, 19)];
Td = Tb + Tc;
TT = Tb - Tc;
T1A = ii[WS(is, 9)];
T1B = ii[WS(is, 19)];
T1C = T1A - T1B;
T2C = T1A + T1B;
}
{
E Tf, Tg, TV, TW;
Tf = ri[WS(is, 16)];
Tg = ri[WS(is, 6)];
Th = Tf + Tg;
T1E = Tf - Tg;
TV = ii[WS(is, 16)];
TW = ii[WS(is, 6)];
TX = TV - TW;
T2E = TV + TW;
}
{
E Ti, Tj, T1F, T1G;
Ti = ri[WS(is, 1)];
Tj = ri[WS(is, 11)];
Tk = Ti + Tj;
TY = Ti - Tj;
T1F = ii[WS(is, 1)];
T1G = ii[WS(is, 11)];
T1H = T1F - T1G;
T2F = T1F + T1G;
}
/* Combine the four pairs of this stage into the values the output stages use. */
Te = Ta - Td;
Tl = Th - Tk;
Tm = Te + Tl;
T2D = T2B - T2C;
T2G = T2E - T2F;
T2R = T2D + T2G;
T32 = T2B + T2C;
T33 = T2E + T2F;
T3e = T32 + T33;
TE = Ta + Td;
TF = Th + Tk;
TG = TE + TF;
TU = TS - TT;
TZ = TX - TY;
T10 = TU + TZ;
T1D = T1z - T1C;
T1I = T1E - T1H;
T1V = T1D + T1I;
T26 = T1z + T1C;
T27 = T1E + T1H;
T2i = T26 + T27;
T1e = TT + TS;
T1f = TY + TX;
T1g = T1e + T1f;
}
/* Output stage: real outputs 10, 14, 6, 2 and 18. */
{
E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
T2s = KP559016994 * (Tm - TB);
TC = Tm + TB;
T2r = FNMS(KP250000000, TC, T7);
T2A = T2w - T2z;
T2H = T2D - T2G;
T2I = FNMS(KP587785252, T2H, KP951056516 * T2A);
T2K = FMA(KP951056516, T2H, KP587785252 * T2A);
ro[WS(os, 10)] = T7 + TC;
T2J = T2s + T2r;
ro[WS(os, 14)] = T2J - T2K;
ro[WS(os, 6)] = T2J + T2K;
T2t = T2r - T2s;
ro[WS(os, 2)] = T2t - T2I;
ro[WS(os, 18)] = T2t + T2I;
}
/* Output stage: imaginary outputs 10, 6, 14, 2 and 18. */
{
E T2V, T2T, T2U, T2N, T2Y, T2L, T2M, T2X, T2W;
T2V = KP559016994 * (T2R - T2S);
T2T = T2R + T2S;
T2U = FNMS(KP250000000, T2T, T2Q);
T2L = Tt - TA;
T2M = Te - Tl;
T2N = FNMS(KP587785252, T2M, KP951056516 * T2L);
T2Y = FMA(KP951056516, T2M, KP587785252 * T2L);
io[WS(os, 10)] = T2Q + T2T;
T2X = T2V + T2U;
io[WS(os, 6)] = T2X - T2Y;
io[WS(os, 14)] = T2Y + T2X;
T2W = T2U - T2V;
io[WS(os, 2)] = T2N + T2W;
io[WS(os, 18)] = T2W - T2N;
}
/* Output stage: real outputs 0, 12, 8, 4 and 16. */
{
E T2Z, TK, T30, T38, T3a, T34, T37, T39, T31;
T2Z = KP559016994 * (TG - TJ);
TK = TG + TJ;
T30 = FNMS(KP250000000, TK, TD);
T34 = T32 - T33;
T37 = T35 - T36;
T38 = FMA(KP951056516, T34, KP587785252 * T37);
T3a = FNMS(KP587785252, T34, KP951056516 * T37);
ro[0] = TD + TK;
T39 = T30 - T2Z;
ro[WS(os, 12)] = T39 - T3a;
ro[WS(os, 8)] = T39 + T3a;
T31 = T2Z + T30;
ro[WS(os, 4)] = T31 - T38;
ro[WS(os, 16)] = T31 + T38;
}
/* Output stage: imaginary outputs 0, 8, 12, 4 and 16. */
{
E T3g, T3i, T3j, T3d, T3m, T3b, T3c, T3l, T3k;
T3g = KP559016994 * (T3e - T3f);
T3i = T3e + T3f;
T3j = FNMS(KP250000000, T3i, T3h);
T3b = TE - TF;
T3c = TH - TI;
T3d = FMA(KP951056516, T3b, KP587785252 * T3c);
T3m = FNMS(KP587785252, T3b, KP951056516 * T3c);
io[0] = T3h + T3i;
T3l = T3j - T3g;
io[WS(os, 8)] = T3l - T3m;
io[WS(os, 12)] = T3m + T3l;
T3k = T3g + T3j;
io[WS(os, 4)] = T3d + T3k;
io[WS(os, 16)] = T3k - T3d;
}
/* Output stage: imaginary outputs 5, 13, 17, 1 and 9. */
{
E T23, T1c, T24, T2c, T2e, T28, T2b, T2d, T25;
T23 = KP559016994 * (T10 - T1b);
T1c = T10 + T1b;
T24 = FNMS(KP250000000, T1c, TP);
T28 = T26 - T27;
T2b = T29 - T2a;
T2c = FMA(KP951056516, T28, KP587785252 * T2b);
T2e = FNMS(KP587785252, T28, KP951056516 * T2b);
io[WS(os, 5)] = TP + T1c;
T2d = T24 - T23;
io[WS(os, 13)] = T2d - T2e;
io[WS(os, 17)] = T2d + T2e;
T25 = T23 + T24;
io[WS(os, 1)] = T25 - T2c;
io[WS(os, 9)] = T25 + T2c;
}
/* Output stage: real outputs 5, 13, 17, 1 and 9. */
{
E T2k, T2m, T2n, T2h, T2p, T2f, T2g, T2q, T2o;
T2k = KP559016994 * (T2i - T2j);
T2m = T2i + T2j;
T2n = FNMS(KP250000000, T2m, T2l);
T2f = TU - TZ;
T2g = T15 - T1a;
T2h = FMA(KP951056516, T2f, KP587785252 * T2g);
T2p = FNMS(KP587785252, T2f, KP951056516 * T2g);
ro[WS(os, 5)] = T2l + T2m;
T2q = T2n - T2k;
ro[WS(os, 13)] = T2p + T2q;
ro[WS(os, 17)] = T2q - T2p;
T2o = T2k + T2n;
ro[WS(os, 1)] = T2h + T2o;
ro[WS(os, 9)] = T2o - T2h;
}
/* Output stage: imaginary outputs 15, 11, 19, 3 and 7. */
{
E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
T1m = KP559016994 * (T1g - T1j);
T1k = T1g + T1j;
T1l = FNMS(KP250000000, T1k, T1d);
T1y = T1s - T1x;
T1J = T1D - T1I;
T1K = FNMS(KP587785252, T1J, KP951056516 * T1y);
T1M = FMA(KP951056516, T1J, KP587785252 * T1y);
io[WS(os, 15)] = T1d + T1k;
T1L = T1m + T1l;
io[WS(os, 11)] = T1L - T1M;
io[WS(os, 19)] = T1L + T1M;
T1n = T1l - T1m;
io[WS(os, 3)] = T1n - T1K;
io[WS(os, 7)] = T1n + T1K;
}
/* Output stage: real outputs 15, 11, 19, 3 and 7. */
{
E T1Z, T1X, T1Y, T1P, T21, T1N, T1O, T22, T20;
T1Z = KP559016994 * (T1V - T1W);
T1X = T1V + T1W;
T1Y = FNMS(KP250000000, T1X, T1U);
T1N = T1h - T1i;
T1O = T1e - T1f;
T1P = FNMS(KP587785252, T1O, KP951056516 * T1N);
T21 = FMA(KP951056516, T1O, KP587785252 * T1N);
ro[WS(os, 15)] = T1U + T1X;
T22 = T1Z + T1Y;
ro[WS(os, 11)] = T21 + T22;
ro[WS(os, 19)] = T22 - T21;
T20 = T1Y - T1Z;
ro[WS(os, 3)] = T1P + T20;
ro[WS(os, 7)] = T20 - T1P;
}
}
}
}
/*
 * Descriptor passed to the planner together with the non-FMA variant of
 * n1_20.
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     20,                    /* same number as in the codelet name */
     "n1_20",               /* codelet name */
     { 184, 24, 24, 0 },    /* first three match the additions / multiplications /
                               fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_20 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_20)(planner *p)
{
     X(kdft_register)(p, n1_20, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
+124
View File
@@ -0,0 +1,124 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include dft/scalar/n.h */
/*
* This function contains 12 FP additions, 6 FP multiplications,
* (or, 6 additions, 0 multiplications, 6 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_3 (FMA variant): three-point codelet on split real/imaginary data.
 *
 * Every pass reads inputs 0..2 from ri/ii (stride is) and stores outputs
 * 0..2 to ro/io (stride os); v passes are made, with the input pointers
 * advancing by ivs and the output pointers by ovs after each one.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are assumed to be a*b + c and
 * c - a*b -- confirm against the macro definitions.
 */
static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT remaining;

          for (remaining = v; remaining > 0;
               --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
               E re0, re1, re2, im0, im1, im2;
               E re_sum, re_diff, im_sum, im_diff;
               E re_mid, im_mid;

               re0 = ri[0];
               im0 = ii[0];

               /* Sum and difference of inputs 1 and 2. */
               re1 = ri[WS(is, 1)];
               re2 = ri[WS(is, 2)];
               re_sum = re1 + re2;
               re_diff = re2 - re1;
               im1 = ii[WS(is, 1)];
               im2 = ii[WS(is, 2)];
               im_diff = im1 - im2;
               im_sum = im1 + im2;

               /* Output 0 is the plain sum of all three inputs. */
               ro[0] = re0 + re_sum;
               io[0] = im0 + im_sum;

               /* Outputs 1 and 2: input 0 minus half the pair sum, then
                  plus/minus the scaled difference of the other component. */
               re_mid = FNMS(KP500000000, re_sum, re0);
               ro[WS(os, 2)] = FNMS(KP866025403, im_diff, re_mid);
               ro[WS(os, 1)] = FMA(KP866025403, im_diff, re_mid);
               im_mid = FNMS(KP500000000, im_sum, im0);
               io[WS(os, 1)] = FMA(KP866025403, re_diff, im_mid);
               io[WS(os, 2)] = FNMS(KP866025403, re_diff, im_mid);
          }
     }
}
/*
 * Descriptor passed to the planner together with the FMA variant of n1_3.
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     3,                 /* same number as in the codelet name */
     "n1_3",            /* codelet name */
     { 6, 0, 6, 0 },    /* first three match the additions / multiplications /
                           fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_3 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_3)(planner *p)
{
     X(kdft_register)(p, n1_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include dft/scalar/n.h */
/*
* This function contains 12 FP additions, 4 FP multiplications,
* (or, 10 additions, 2 multiplications, 2 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_3 (non-FMA variant): three-point codelet on split real/imaginary data.
 *
 * Every pass reads inputs 0..2 from ri/ii (stride is) and stores outputs
 * 0..2 to ro/io (stride os); v passes are made, with the input pointers
 * advancing by ivs and the output pointers by ovs after each one.
 * NOTE(review): FNMS(a, b, c) is assumed to be c - a*b -- confirm against
 * the macro definition.
 */
static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT remaining;

          for (remaining = v; remaining > 0;
               --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
               E re0, re1, re2, im0, im1, im2;
               E re_sum, re_rot, im_sum, im_rot;
               E re_mid, im_mid;

               re0 = ri[0];
               im0 = ii[0];

               /* Sum and pre-scaled difference of inputs 1 and 2. */
               re1 = ri[WS(is, 1)];
               re2 = ri[WS(is, 2)];
               re_sum = re1 + re2;
               re_rot = KP866025403 * (re2 - re1);
               im1 = ii[WS(is, 1)];
               im2 = ii[WS(is, 2)];
               im_rot = KP866025403 * (im1 - im2);
               im_sum = im1 + im2;

               /* Output 0 is the plain sum of all three inputs. */
               ro[0] = re0 + re_sum;
               io[0] = im0 + im_sum;

               /* Outputs 1 and 2: input 0 minus half the pair sum, then
                  plus/minus the scaled difference of the other component. */
               re_mid = FNMS(KP500000000, re_sum, re0);
               ro[WS(os, 2)] = re_mid - im_rot;
               ro[WS(os, 1)] = re_mid + im_rot;
               im_mid = FNMS(KP500000000, im_sum, im0);
               io[WS(os, 1)] = re_rot + im_mid;
               io[WS(os, 2)] = im_mid - re_rot;
          }
     }
}
/*
 * Descriptor passed to the planner together with the non-FMA variant of
 * n1_3.
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     3,                  /* same number as in the codelet name */
     "n1_3",             /* codelet name */
     { 10, 2, 2, 0 },    /* first three match the additions / multiplications /
                            fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_3 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_3)(planner *p)
{
     X(kdft_register)(p, n1_3, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
+138
View File
@@ -0,0 +1,138 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include dft/scalar/n.h */
/*
* This function contains 16 FP additions, 0 FP multiplications,
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_4 (FMA branch; the body uses additions and subtractions only):
 * four-point codelet on split real/imaginary data.
 *
 * Every pass reads inputs 0..3 from ri/ii (stride is) and stores outputs
 * 0..3 to ro/io (stride os); v passes are made, with the input pointers
 * advancing by ivs and the output pointers by ovs after each one.
 */
static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0;
          --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
          MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
          E re0, re1, re2, re3, im0, im1, im2, im3;
          E re_even_sum, re_even_diff, im_even_sum, im_even_diff;
          E re_odd_sum, re_odd_diff, im_odd_sum, im_odd_diff;

          /* Even-indexed inputs 0 and 2. */
          re0 = ri[0];
          re2 = ri[WS(is, 2)];
          re_even_sum = re0 + re2;
          re_even_diff = re0 - re2;
          im0 = ii[0];
          im2 = ii[WS(is, 2)];
          im_even_diff = im0 - im2;
          im_even_sum = im0 + im2;

          /* Odd-indexed inputs 1 and 3. */
          re1 = ri[WS(is, 1)];
          re3 = ri[WS(is, 3)];
          re_odd_sum = re1 + re3;
          re_odd_diff = re1 - re3;
          im1 = ii[WS(is, 1)];
          im3 = ii[WS(is, 3)];
          im_odd_diff = im1 - im3;
          im_odd_sum = im1 + im3;

          /* Outputs 0 and 2 combine the sums component by component. */
          ro[WS(os, 2)] = re_even_sum - re_odd_sum;
          io[WS(os, 2)] = im_even_sum - im_odd_sum;
          ro[0] = re_even_sum + re_odd_sum;
          io[0] = im_even_sum + im_odd_sum;

          /* Outputs 1 and 3 cross the real and imaginary differences. */
          io[WS(os, 1)] = im_even_diff - re_odd_diff;
          ro[WS(os, 1)] = re_even_diff + im_odd_diff;
          io[WS(os, 3)] = re_odd_diff + im_even_diff;
          ro[WS(os, 3)] = re_even_diff - im_odd_diff;
     }
}
/*
 * Descriptor passed to the planner together with n1_4 (FMA branch).
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     4,                  /* same number as in the codelet name */
     "n1_4",             /* codelet name */
     { 16, 0, 0, 0 },    /* first three match the additions / multiplications /
                            fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_4 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_4)(planner *p)
{
     X(kdft_register)(p, n1_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include dft/scalar/n.h */
/*
* This function contains 16 FP additions, 0 FP multiplications,
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_4 (non-FMA branch): four-point codelet on split real/imaginary data,
 * built from additions and subtractions only.
 *
 * Every pass reads inputs 0..3 from ri/ii (stride is) and stores outputs
 * 0..3 to ro/io (stride os); v passes are made, with the input pointers
 * advancing by ivs and the output pointers by ovs after each one.
 */
static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0;
          --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
          MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
          E re0, re1, re2, re3, im0, im1, im2, im3;
          E re_even_sum, re_even_diff, im_even_sum, im_even_diff;
          E re_odd_sum, re_odd_diff, im_odd_sum, im_odd_diff;

          /* Even-indexed inputs 0 and 2. */
          re0 = ri[0];
          re2 = ri[WS(is, 2)];
          re_even_sum = re0 + re2;
          re_even_diff = re0 - re2;
          im0 = ii[0];
          im2 = ii[WS(is, 2)];
          im_even_diff = im0 - im2;
          im_even_sum = im0 + im2;

          /* Odd-indexed inputs 1 and 3. */
          re1 = ri[WS(is, 1)];
          re3 = ri[WS(is, 3)];
          re_odd_sum = re1 + re3;
          re_odd_diff = re1 - re3;
          im1 = ii[WS(is, 1)];
          im3 = ii[WS(is, 3)];
          im_odd_diff = im1 - im3;
          im_odd_sum = im1 + im3;

          /* Outputs 0 and 2 combine the sums component by component. */
          ro[WS(os, 2)] = re_even_sum - re_odd_sum;
          io[WS(os, 2)] = im_even_sum - im_odd_sum;
          ro[0] = re_even_sum + re_odd_sum;
          io[0] = im_even_sum + im_odd_sum;

          /* Outputs 1 and 3 cross the real and imaginary differences. */
          io[WS(os, 1)] = im_even_diff - re_odd_diff;
          ro[WS(os, 1)] = re_even_diff + im_odd_diff;
          io[WS(os, 3)] = re_odd_diff + im_even_diff;
          ro[WS(os, 3)] = re_even_diff - im_odd_diff;
     }
}
/*
 * Descriptor passed to the planner together with n1_4 (non-FMA branch).
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     4,                  /* same number as in the codelet name */
     "n1_4",             /* codelet name */
     { 16, 0, 0, 0 },    /* first three match the additions / multiplications /
                            fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_4 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_4)(planner *p)
{
     X(kdft_register)(p, n1_4, &desc);
}
#endif
+194
View File
@@ -0,0 +1,194 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include dft/scalar/n.h */
/*
* This function contains 32 FP additions, 18 FP multiplications,
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
* 21 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_5 (FMA variant): five-point codelet on split real/imaginary data.
 *
 * Every pass reads inputs 0..4 from ri/ii (stride is) and stores outputs
 * 0..4 to ro/io (stride os); v passes are made, with the input pointers
 * advancing by ivs and the output pointers by ovs after each one.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are assumed to be a*b + c and
 * c - a*b -- confirm against the macro definitions.
 */
static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     {
          INT remaining;

          for (remaining = v; remaining > 0;
               --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
               E re0, re1, re2, re3, re4;
               E im0, im1, im2, im3, im4;
               E re14_sum, re23_sum, re14_diff, re23_diff, re_total, re_pair_diff;
               E im14_sum, im23_sum, im14_diff, im23_diff, im_total, im_pair_diff;
               E rot_a, rot_b, re_base, re_plus, re_minus;
               E rot_c, rot_d, im_base, im_plus, im_minus;

               re0 = ri[0];
               im0 = ii[0];

               /* Real parts: inputs are paired as (1, 4) and (2, 3). */
               re1 = ri[WS(is, 1)];
               re4 = ri[WS(is, 4)];
               re14_sum = re1 + re4;
               re2 = ri[WS(is, 2)];
               re3 = ri[WS(is, 3)];
               re23_sum = re2 + re3;
               re_total = re14_sum + re23_sum;
               re23_diff = re2 - re3;
               re_pair_diff = re14_sum - re23_sum;
               re14_diff = re1 - re4;

               /* Imaginary parts, paired the same way. */
               im1 = ii[WS(is, 1)];
               im4 = ii[WS(is, 4)];
               im14_sum = im1 + im4;
               im2 = ii[WS(is, 2)];
               im3 = ii[WS(is, 3)];
               im23_sum = im2 + im3;
               im14_diff = im1 - im4;
               im_pair_diff = im14_sum - im23_sum;
               im23_diff = im2 - im3;
               im_total = im14_sum + im23_sum;

               /* Output 0 is the plain sum of all five inputs. */
               ro[0] = re0 + re_total;
               io[0] = im0 + im_total;

               /* Real outputs 4, 3, 1 and 2. */
               rot_a = FMA(KP618033988, im23_diff, im14_diff);
               rot_b = FNMS(KP618033988, im14_diff, im23_diff);
               re_base = FNMS(KP250000000, re_total, re0);
               re_plus = FMA(KP559016994, re_pair_diff, re_base);
               re_minus = FNMS(KP559016994, re_pair_diff, re_base);
               ro[WS(os, 4)] = FNMS(KP951056516, rot_a, re_plus);
               ro[WS(os, 3)] = FMA(KP951056516, rot_b, re_minus);
               ro[WS(os, 1)] = FMA(KP951056516, rot_a, re_plus);
               ro[WS(os, 2)] = FNMS(KP951056516, rot_b, re_minus);

               /* Imaginary outputs 1, 3, 4 and 2. */
               rot_c = FMA(KP618033988, re23_diff, re14_diff);
               rot_d = FNMS(KP618033988, re14_diff, re23_diff);
               im_base = FNMS(KP250000000, im_total, im0);
               im_plus = FMA(KP559016994, im_pair_diff, im_base);
               im_minus = FNMS(KP559016994, im_pair_diff, im_base);
               io[WS(os, 1)] = FNMS(KP951056516, rot_c, im_plus);
               io[WS(os, 3)] = FNMS(KP951056516, rot_d, im_minus);
               io[WS(os, 4)] = FMA(KP951056516, rot_c, im_plus);
               io[WS(os, 2)] = FMA(KP951056516, rot_d, im_minus);
          }
     }
}
/*
 * Descriptor passed to the planner together with the FMA variant of n1_5.
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     5,                   /* same number as in the codelet name */
     "n1_5",              /* codelet name */
     { 14, 0, 18, 0 },    /* first three match the additions / multiplications /
                             fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_5 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_5)(planner *p)
{
     X(kdft_register)(p, n1_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include dft/scalar/n.h */
/*
* This function contains 32 FP additions, 12 FP multiplications,
* (or, 26 additions, 6 multiplications, 6 fused multiply/add),
* 21 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_5 (non-FMA variant): five-point codelet on split real/imaginary data.
 *
 * Every pass reads inputs 0..4 from ri/ii (stride is) and stores outputs
 * 0..4 to ro/io (stride os); v passes are made, with the input pointers
 * advancing by ivs and the output pointers by ovs after each one.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are assumed to be a*b + c and
 * c - a*b -- confirm against the macro definitions.
 */
static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     {
          INT remaining;

          for (remaining = v; remaining > 0;
               --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
               E re0, re1, re2, re3, re4;
               E im0, im1, im2, im3, im4;
               E re14_sum, re23_sum, re14_diff, re23_diff, re_total, re_scaled;
               E im14_sum, im23_sum, im14_diff, im23_diff, im_total, im_scaled;
               E rot_a, rot_b, re_base, re_plus, re_minus;
               E rot_c, rot_d, im_base, im_plus, im_minus;

               re0 = ri[0];
               im0 = ii[0];

               /* Real parts: inputs are paired as (1, 4) and (2, 3). */
               re1 = ri[WS(is, 1)];
               re4 = ri[WS(is, 4)];
               re14_sum = re1 + re4;
               re2 = ri[WS(is, 2)];
               re3 = ri[WS(is, 3)];
               re23_sum = re2 + re3;
               re_total = re14_sum + re23_sum;
               re23_diff = re2 - re3;
               re_scaled = KP559016994 * (re14_sum - re23_sum);
               re14_diff = re1 - re4;

               /* Imaginary parts, paired the same way. */
               im1 = ii[WS(is, 1)];
               im4 = ii[WS(is, 4)];
               im14_sum = im1 + im4;
               im2 = ii[WS(is, 2)];
               im3 = ii[WS(is, 3)];
               im23_sum = im2 + im3;
               im14_diff = im1 - im4;
               im_total = im14_sum + im23_sum;
               im23_diff = im2 - im3;
               im_scaled = KP559016994 * (im14_sum - im23_sum);

               /* Output 0 is the plain sum of all five inputs. */
               ro[0] = re0 + re_total;
               io[0] = im0 + im_total;

               /* Real outputs 4, 3, 1 and 2. */
               rot_a = FMA(KP951056516, im14_diff, KP587785252 * im23_diff);
               rot_b = FNMS(KP587785252, im14_diff, KP951056516 * im23_diff);
               re_base = FNMS(KP250000000, re_total, re0);
               re_plus = re_scaled + re_base;
               re_minus = re_base - re_scaled;
               ro[WS(os, 4)] = re_plus - rot_a;
               ro[WS(os, 3)] = re_minus + rot_b;
               ro[WS(os, 1)] = re_plus + rot_a;
               ro[WS(os, 2)] = re_minus - rot_b;

               /* Imaginary outputs 1, 3, 4 and 2. */
               rot_c = FMA(KP951056516, re14_diff, KP587785252 * re23_diff);
               rot_d = FNMS(KP587785252, re14_diff, KP951056516 * re23_diff);
               im_base = FNMS(KP250000000, im_total, im0);
               im_plus = im_scaled + im_base;
               im_minus = im_base - im_scaled;
               io[WS(os, 1)] = im_plus - rot_c;
               io[WS(os, 3)] = im_minus - rot_d;
               io[WS(os, 4)] = rot_c + im_plus;
               io[WS(os, 2)] = rot_d + im_minus;
          }
     }
}
/*
 * Descriptor passed to the planner together with the non-FMA variant of
 * n1_5.
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     5,                  /* same number as in the codelet name */
     "n1_5",             /* codelet name */
     { 26, 6, 6, 0 },    /* first three match the additions / multiplications /
                            fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_5 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_5)(planner *p)
{
     X(kdft_register)(p, n1_5, &desc);
}
#endif
+210
View File
@@ -0,0 +1,210 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include dft/scalar/n.h */
/*
* This function contains 36 FP additions, 12 FP multiplications,
* (or, 24 additions, 0 multiplications, 12 fused multiply/add),
* 23 stack variables, 2 constants, and 24 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_6 (FMA variant): six-point codelet on split real/imaginary data.
 *
 * Every pass reads inputs 0..5 from ri/ii (stride is) and stores outputs
 * 0..5 to ro/io (stride os); v passes are made, with the input pointers
 * advancing by ivs and the output pointers by ovs after each one.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are assumed to be a*b + c and
 * c - a*b -- confirm against the macro definitions.
 */
static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT remaining;

          for (remaining = v; remaining > 0;
               --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
               MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               E re0, re1, re2, re3, re4, re5;
               E im0, im1, im2, im3, im4, im5;
               E re03_diff, re03_sum, im03_diff, im03_sum;
               E re25_diff, re25_sum, re41_diff, re41_sum;
               E im25_diff, im25_sum, im41_diff, im41_sum;
               E re_diff_total, re_sum_total, im_diff_total, im_sum_total;
               E re_mid, im_cross, im_mid, re_cross;

               /* Inputs are paired as (0, 3), (2, 5) and (4, 1). */
               re0 = ri[0];
               re3 = ri[WS(is, 3)];
               re03_diff = re0 - re3;
               re03_sum = re0 + re3;
               im0 = ii[0];
               im3 = ii[WS(is, 3)];
               im03_diff = im0 - im3;
               im03_sum = im0 + im3;

               re2 = ri[WS(is, 2)];
               re5 = ri[WS(is, 5)];
               re25_diff = re2 - re5;
               re25_sum = re2 + re5;
               re4 = ri[WS(is, 4)];
               re1 = ri[WS(is, 1)];
               re41_diff = re4 - re1;
               re41_sum = re4 + re1;

               re_diff_total = re25_diff + re41_diff;
               re_sum_total = re25_sum + re41_sum;

               im2 = ii[WS(is, 2)];
               im5 = ii[WS(is, 5)];
               im25_diff = im2 - im5;
               im25_sum = im2 + im5;
               im4 = ii[WS(is, 4)];
               im1 = ii[WS(is, 1)];
               im41_diff = im4 - im1;
               im41_sum = im4 + im1;

               im_diff_total = im25_diff + im41_diff;
               im_sum_total = im25_sum + im41_sum;

               /* Outputs 3 and 0 need additions only. */
               ro[WS(os, 3)] = re03_diff + re_diff_total;
               io[WS(os, 3)] = im03_diff + im_diff_total;
               ro[0] = re03_sum + re_sum_total;
               io[0] = im03_sum + im_sum_total;

               /* Outputs 5 and 1, built from the pair differences. */
               re_mid = FNMS(KP500000000, re_diff_total, re03_diff);
               im_cross = im25_diff - im41_diff;
               ro[WS(os, 5)] = FNMS(KP866025403, im_cross, re_mid);
               ro[WS(os, 1)] = FMA(KP866025403, im_cross, re_mid);
               im_mid = FNMS(KP500000000, im_diff_total, im03_diff);
               re_cross = re41_diff - re25_diff;
               io[WS(os, 1)] = FMA(KP866025403, re_cross, im_mid);
               io[WS(os, 5)] = FNMS(KP866025403, re_cross, im_mid);

               /* Outputs 2 and 4, built from the pair sums; the four
                  scratch names are reused, as their old values are dead. */
               re_mid = FNMS(KP500000000, re_sum_total, re03_sum);
               im_cross = im25_sum - im41_sum;
               ro[WS(os, 2)] = FNMS(KP866025403, im_cross, re_mid);
               ro[WS(os, 4)] = FMA(KP866025403, im_cross, re_mid);
               im_mid = FNMS(KP500000000, im_sum_total, im03_sum);
               re_cross = re41_sum - re25_sum;
               io[WS(os, 2)] = FNMS(KP866025403, re_cross, im_mid);
               io[WS(os, 4)] = FMA(KP866025403, re_cross, im_mid);
          }
     }
}
/*
 * Descriptor passed to the planner together with the FMA variant of n1_6.
 * NOTE(review): kdft_desc is declared elsewhere; the field meanings noted
 * below are read off the values only -- confirm against its declaration.
 */
static const kdft_desc desc = {
     6,                   /* same number as in the codelet name */
     "n1_6",              /* codelet name */
     { 24, 0, 12, 0 },    /* first three match the additions / multiplications /
                             fused multiply-adds quoted in the generator comment */
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_6 and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_6)(planner *p)
{
     X(kdft_register)(p, n1_6, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include dft/scalar/n.h */
/*
* This function contains 36 FP additions, 8 FP multiplications,
* (or, 32 additions, 4 multiplications, 4 fused multiply/add),
* 23 stack variables, 2 constants, and 24 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_6, non-FMA variant (generated with gen_notw -n 6, see the comment above).
 *
 * Each loop iteration reads six values from ri and six from ii at offsets
 * WS(is, 0..5), combines them, and writes six values to ro and six to io at
 * offsets WS(os, 0..5).  The loop runs v times; after every iteration ri/ii
 * advance by ivs and ro/io by ovs.
 *
 * NOTE(review): ri/ro presumably carry real parts and ii/io imaginary parts,
 * and FNMS(a, b, c) presumably evaluates c - a*b; both come from headers that
 * are not visible here -- confirm.
 *
 * All loads are issued before the first store, and the order of the
 * floating-point operations is kept exactly as generated.
 */
static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT left;
          for (left = v; left > 0; --left, ri += ivs, ii += ivs, ro += ovs, io += ovs, MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               E re_d03, re_s03, im_d03, im_s03;
               E re_d25, re_s25, re_d41, re_s41;
               E im_d25, im_s25, im_d41, im_s41;
               E re_dsum, re_ssum, im_dsum, im_ssum;

               /* Elements 0 and 3: difference and sum, for ri and for ii. */
               {
                    E re0, re3, im0, im3;
                    re0 = ri[0];
                    re3 = ri[WS(is, 3)];
                    re_d03 = re0 - re3;
                    re_s03 = re0 + re3;
                    im0 = ii[0];
                    im3 = ii[WS(is, 3)];
                    im_d03 = im0 - im3;
                    im_s03 = im0 + im3;
               }

               /* ri pairs (2, 5) and (4, 1). */
               {
                    E re2, re5, re4, re1;
                    re2 = ri[WS(is, 2)];
                    re5 = ri[WS(is, 5)];
                    re_d25 = re2 - re5;
                    re_s25 = re2 + re5;
                    re4 = ri[WS(is, 4)];
                    re1 = ri[WS(is, 1)];
                    re_d41 = re4 - re1;
                    re_s41 = re4 + re1;
               }
               re_dsum = re_d25 + re_d41;
               re_ssum = re_s25 + re_s41;

               /* ii pairs (2, 5) and (4, 1). */
               {
                    E im2, im5, im4, im1;
                    im2 = ii[WS(is, 2)];
                    im5 = ii[WS(is, 5)];
                    im_d25 = im2 - im5;
                    im_s25 = im2 + im5;
                    im4 = ii[WS(is, 4)];
                    im1 = ii[WS(is, 1)];
                    im_d41 = im4 - im1;
                    im_s41 = im4 + im1;
               }
               im_dsum = im_d25 + im_d41;
               im_ssum = im_s25 + im_s41;

               /* Outputs 3 and 0 need no multiplication. */
               ro[WS(os, 3)] = re_d03 + re_dsum;
               io[WS(os, 3)] = im_d03 + im_dsum;
               ro[0] = re_s03 + re_ssum;
               io[0] = im_s03 + im_ssum;

               /* Outputs 1 and 5, built from the pair differences. */
               {
                    E re_base, re_rot, im_rot, im_base;
                    re_base = FNMS(KP500000000, re_dsum, re_d03);
                    re_rot = KP866025403 * (im_d25 - im_d41);
                    ro[WS(os, 5)] = re_base - re_rot;
                    ro[WS(os, 1)] = re_base + re_rot;
                    im_rot = KP866025403 * (re_d41 - re_d25);
                    im_base = FNMS(KP500000000, im_dsum, im_d03);
                    io[WS(os, 1)] = im_rot + im_base;
                    io[WS(os, 5)] = im_base - im_rot;
               }

               /* Outputs 2 and 4, built from the pair sums. */
               {
                    E re_base, re_rot, im_base, im_rot;
                    re_base = FNMS(KP500000000, re_ssum, re_s03);
                    re_rot = KP866025403 * (im_s25 - im_s41);
                    ro[WS(os, 2)] = re_base - re_rot;
                    ro[WS(os, 4)] = re_base + re_rot;
                    im_base = FNMS(KP500000000, im_ssum, im_s03);
                    im_rot = KP866025403 * (re_s41 - re_s25);
                    io[WS(os, 2)] = im_base - im_rot;
                    io[WS(os, 4)] = im_rot + im_base;
               }
          }
     }
}
/*
 * Descriptor handed to the planner together with the n1_6 kernel: size 6,
 * name "n1_6", then a four-entry count group whose first three values match
 * the "32 additions, 4 multiplications, 4 fused multiply/add" figures in the
 * header comment of this variant, followed by the GENUS reference.
 */
static const kdft_desc desc = {
     6,
     "n1_6",
     { 32, 4, 4, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_6 kernel and its descriptor with planner `p`. */
void X(codelet_n1_6) (planner *p)
{
     X(kdft_register) (p, n1_6, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
+249
View File
@@ -0,0 +1,249 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include dft/scalar/n.h */
/*
* This function contains 60 FP additions, 42 FP multiplications,
* (or, 18 additions, 0 multiplications, 42 fused multiply/add),
* 41 stack variables, 6 constants, and 28 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_7, FMA-preferring variant (generated with gen_notw -fma -n 7, see the
 * comment above).
 *
 * Each loop iteration reads seven values from ri and seven from ii at offsets
 * WS(is, 0..6) and writes seven values to ro and seven to io at offsets
 * WS(os, 0..6).  The loop runs v times; after every iteration ri/ii advance
 * by ivs and ro/io by ovs.
 *
 * NOTE(review): ri/ro presumably carry real parts and ii/io imaginary parts;
 * FMA(a, b, c) and FNMS(a, b, c) presumably evaluate a*b + c and c - a*b.
 * Both come from headers not visible here -- confirm.
 */
static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
{
INT i;
/* One batch element per iteration; all loads below precede the first store. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
E T1, Tz, T4, TI, Ta, TG, T7, TH, Tb, Tp, TT, TO, TJ, Tu, Tg;
E TB, Tm, TC, Tj, TA, Tn, Ts, TQ, TL, TD, Tx;
/* Element 0 is used as-is. */
T1 = ri[0];
Tz = ii[0];
{
E T2, T3, Te, Tf;
/* ri: sums (T4, T7, Ta) and differences (TI, TH, TG) of the pairs (1,6), (2,5), (3,4). */
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 6)];
T4 = T2 + T3;
TI = T3 - T2;
{
E T8, T9, T5, T6;
T8 = ri[WS(is, 3)];
T9 = ri[WS(is, 4)];
Ta = T8 + T9;
TG = T9 - T8;
T5 = ri[WS(is, 2)];
T6 = ri[WS(is, 5)];
T7 = T5 + T6;
TH = T6 - T5;
}
/* First stage of the chained multiply-adds on the ri sums and differences. */
Tb = FNMS(KP356895867, T7, T4);
Tp = FNMS(KP356895867, T4, Ta);
TT = FMA(KP554958132, TG, TI);
TO = FMA(KP554958132, TH, TG);
TJ = FNMS(KP554958132, TI, TH);
Tu = FNMS(KP356895867, Ta, T7);
/* ii: differences (Tj, Tg, Tm) and sums (TA, TB, TC) of the same pairs. */
Te = ii[WS(is, 2)];
Tf = ii[WS(is, 5)];
Tg = Te - Tf;
TB = Te + Tf;
{
E Tk, Tl, Th, Ti;
Tk = ii[WS(is, 3)];
Tl = ii[WS(is, 4)];
Tm = Tk - Tl;
TC = Tk + Tl;
Th = ii[WS(is, 1)];
Ti = ii[WS(is, 6)];
Tj = Th - Ti;
TA = Th + Ti;
}
/* First stage of the chained multiply-adds on the ii sums and differences. */
Tn = FMA(KP554958132, Tm, Tj);
Ts = FMA(KP554958132, Tg, Tm);
TQ = FNMS(KP356895867, TB, TA);
TL = FNMS(KP356895867, TA, TC);
TD = FNMS(KP356895867, TC, TB);
Tx = FNMS(KP554958132, Tj, Tg);
}
/* Output 0: element 0 plus the three pair sums. */
ro[0] = T1 + T4 + T7 + Ta;
io[0] = Tz + TA + TB + TC;
/* Outputs 1 and 6. */
{
E To, Td, Tc, TU, TS, TR;
To = FMA(KP801937735, Tn, Tg);
Tc = FNMS(KP692021471, Tb, Ta);
Td = FNMS(KP900968867, Tc, T1);
ro[WS(os, 6)] = FNMS(KP974927912, To, Td);
ro[WS(os, 1)] = FMA(KP974927912, To, Td);
TU = FMA(KP801937735, TT, TH);
TR = FNMS(KP692021471, TQ, TC);
TS = FNMS(KP900968867, TR, Tz);
io[WS(os, 1)] = FMA(KP974927912, TU, TS);
io[WS(os, 6)] = FNMS(KP974927912, TU, TS);
}
/* Outputs 2 and 5. */
{
E Tt, Tr, Tq, TP, TN, TM;
Tt = FNMS(KP801937735, Ts, Tj);
Tq = FNMS(KP692021471, Tp, T7);
Tr = FNMS(KP900968867, Tq, T1);
ro[WS(os, 5)] = FNMS(KP974927912, Tt, Tr);
ro[WS(os, 2)] = FMA(KP974927912, Tt, Tr);
TP = FNMS(KP801937735, TO, TI);
TM = FNMS(KP692021471, TL, TB);
TN = FNMS(KP900968867, TM, Tz);
io[WS(os, 2)] = FMA(KP974927912, TP, TN);
io[WS(os, 5)] = FNMS(KP974927912, TP, TN);
}
/* Outputs 3 and 4. */
{
E Ty, Tw, Tv, TK, TF, TE;
Ty = FNMS(KP801937735, Tx, Tm);
Tv = FNMS(KP692021471, Tu, T4);
Tw = FNMS(KP900968867, Tv, T1);
ro[WS(os, 4)] = FNMS(KP974927912, Ty, Tw);
ro[WS(os, 3)] = FMA(KP974927912, Ty, Tw);
TK = FNMS(KP801937735, TJ, TG);
TE = FNMS(KP692021471, TD, TA);
TF = FNMS(KP900968867, TE, Tz);
io[WS(os, 3)] = FMA(KP974927912, TK, TF);
io[WS(os, 4)] = FNMS(KP974927912, TK, TF);
}
}
}
}
/*
 * Descriptor handed to the planner together with the n1_7 kernel: size 7,
 * name "n1_7", then a four-entry count group whose first three values match
 * the "18 additions, 0 multiplications, 42 fused multiply/add" figures in the
 * header comment of this variant, followed by the GENUS reference.
 */
static const kdft_desc desc = {
     7,
     "n1_7",
     { 18, 0, 42, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_7 kernel and its descriptor with planner `p`. */
void X(codelet_n1_7) (planner *p)
{
     X(kdft_register) (p, n1_7, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include dft/scalar/n.h */
/*
* This function contains 60 FP additions, 36 FP multiplications,
* (or, 36 additions, 12 multiplications, 24 fused multiply/add),
* 25 stack variables, 6 constants, and 28 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_7, non-FMA variant (generated with gen_notw -n 7, see the comment above).
 *
 * Each loop iteration reads seven values from ri and seven from ii at offsets
 * WS(is, 0..6) and writes seven values to ro and seven to io at offsets
 * WS(os, 0..6).  The loop runs v times; after every iteration ri/ii advance
 * by ivs and ro/io by ovs.
 *
 * NOTE(review): ri/ro presumably carry real parts and ii/io imaginary parts;
 * FMA(a, b, c), FNMS(a, b, c) and FNMA(a, b, c) presumably evaluate a*b + c,
 * c - a*b and -(a*b) - c.  All come from headers not visible here -- confirm.
 */
static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
{
INT i;
/* One batch element per iteration; all loads below precede the first store. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
E T1, Tu, T4, Tq, Te, Tx, T7, Ts, Tk, Tv, Ta, Tr, Th, Tw;
/* Element 0 is used as-is. */
T1 = ri[0];
Tu = ii[0];
/* Pair (1, 6): ri sum/difference (T4, Tq) and ii difference/sum (Te, Tx). */
{
E T2, T3, Tc, Td;
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 6)];
T4 = T2 + T3;
Tq = T3 - T2;
Tc = ii[WS(is, 1)];
Td = ii[WS(is, 6)];
Te = Tc - Td;
Tx = Tc + Td;
}
/* Pair (2, 5): ri sum/difference (T7, Ts) and ii difference/sum (Tk, Tv). */
{
E T5, T6, Ti, Tj;
T5 = ri[WS(is, 2)];
T6 = ri[WS(is, 5)];
T7 = T5 + T6;
Ts = T6 - T5;
Ti = ii[WS(is, 2)];
Tj = ii[WS(is, 5)];
Tk = Ti - Tj;
Tv = Ti + Tj;
}
/* Pair (3, 4): ri sum/difference (Ta, Tr) and ii difference/sum (Th, Tw). */
{
E T8, T9, Tf, Tg;
T8 = ri[WS(is, 3)];
T9 = ri[WS(is, 4)];
Ta = T8 + T9;
Tr = T9 - T8;
Tf = ii[WS(is, 3)];
Tg = ii[WS(is, 4)];
Th = Tf - Tg;
Tw = Tf + Tg;
}
/* Output 0: element 0 plus the three pair sums. */
ro[0] = T1 + T4 + T7 + Ta;
io[0] = Tu + Tx + Tv + Tw;
/* Outputs 2 and 5: a weighted combination of the sums plus/minus one of the differences. */
{
E Tl, Tb, TB, TC;
Tl = FNMS(KP781831482, Th, KP974927912 * Te) - (KP433883739 * Tk);
Tb = FMA(KP623489801, Ta, T1) + FNMA(KP900968867, T7, KP222520933 * T4);
ro[WS(os, 5)] = Tb - Tl;
ro[WS(os, 2)] = Tb + Tl;
TB = FNMS(KP781831482, Tr, KP974927912 * Tq) - (KP433883739 * Ts);
TC = FMA(KP623489801, Tw, Tu) + FNMA(KP900968867, Tv, KP222520933 * Tx);
io[WS(os, 2)] = TB + TC;
io[WS(os, 5)] = TC - TB;
}
/* Outputs 1 and 6. */
{
E Tn, Tm, Tz, TA;
Tn = FMA(KP781831482, Te, KP974927912 * Tk) + (KP433883739 * Th);
Tm = FMA(KP623489801, T4, T1) + FNMA(KP900968867, Ta, KP222520933 * T7);
ro[WS(os, 6)] = Tm - Tn;
ro[WS(os, 1)] = Tm + Tn;
Tz = FMA(KP781831482, Tq, KP974927912 * Ts) + (KP433883739 * Tr);
TA = FMA(KP623489801, Tx, Tu) + FNMA(KP900968867, Tw, KP222520933 * Tv);
io[WS(os, 1)] = Tz + TA;
io[WS(os, 6)] = TA - Tz;
}
/* Outputs 3 and 4. */
{
E Tp, To, Tt, Ty;
Tp = FMA(KP433883739, Te, KP974927912 * Th) - (KP781831482 * Tk);
To = FMA(KP623489801, T7, T1) + FNMA(KP222520933, Ta, KP900968867 * T4);
ro[WS(os, 4)] = To - Tp;
ro[WS(os, 3)] = To + Tp;
Tt = FMA(KP433883739, Tq, KP974927912 * Tr) - (KP781831482 * Ts);
Ty = FMA(KP623489801, Tv, Tu) + FNMA(KP222520933, Tw, KP900968867 * Tx);
io[WS(os, 3)] = Tt + Ty;
io[WS(os, 4)] = Ty - Tt;
}
}
}
}
/*
 * Descriptor handed to the planner together with the n1_7 kernel: size 7,
 * name "n1_7", then a four-entry count group whose first three values match
 * the "36 additions, 12 multiplications, 24 fused multiply/add" figures in
 * the header comment of this variant, followed by the GENUS reference.
 */
static const kdft_desc desc = {
     7,
     "n1_7",
     { 36, 12, 24, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_7 kernel and its descriptor with planner `p`. */
void X(codelet_n1_7) (planner *p)
{
     X(kdft_register) (p, n1_7, &desc);
}
#endif
+266
View File
@@ -0,0 +1,266 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 28 stack variables, 1 constant, and 32 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_8, FMA-preferring variant (generated with gen_notw -fma -n 8, see the
 * comment above).
 *
 * Each loop iteration reads eight values from ri and eight from ii at offsets
 * WS(is, 0..7) and writes eight values to ro and eight to io at offsets
 * WS(os, 0..7).  The loop runs v times; after every iteration ri/ii advance
 * by ivs and ro/io by ovs.
 *
 * NOTE(review): ri/ro presumably carry real parts and ii/io imaginary parts;
 * FMA(a, b, c) and FNMS(a, b, c) presumably evaluate a*b + c and c - a*b.
 * Both come from headers not visible here -- confirm.
 */
static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* One batch element per iteration; all loads below precede the first store. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
E TG;
{
E T1, T2, Tj, Tk;
/* Pair (0, 4) of ri: sum T3, difference Tn. */
T1 = ri[0];
T2 = ri[WS(is, 4)];
T3 = T1 + T2;
Tn = T1 - T2;
{
E Tg, Th, T4, T5;
/* Pair (0, 4) of ii: sum Ti, difference TC. */
Tg = ii[0];
Th = ii[WS(is, 4)];
Ti = Tg + Th;
TC = Tg - Th;
/* Pair (2, 6) of ri: sum T6, difference TB. */
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 6)];
T6 = T4 + T5;
TB = T4 - T5;
}
/* Pair (2, 6) of ii: sum Tl, difference To. */
Tj = ii[WS(is, 2)];
Tk = ii[WS(is, 6)];
Tl = Tj + Tk;
To = Tj - Tk;
/* Pair (7, 3): sums Td/TN, and the cross combinations Tz/TH of the ri and ii differences. */
{
E Tb, Tc, Tv, Tw, Tx, Ty;
Tb = ri[WS(is, 7)];
Tc = ri[WS(is, 3)];
Tv = Tb - Tc;
Tw = ii[WS(is, 7)];
Tx = ii[WS(is, 3)];
Ty = Tw - Tx;
Td = Tb + Tc;
TN = Tw + Tx;
Tz = Tv - Ty;
TH = Tv + Ty;
}
/* Pair (1, 5): sums Ta/TM, and the cross combinations Tu/TG of the ri and ii differences. */
{
E T8, T9, Tq, Tr, Ts, Tt;
T8 = ri[WS(is, 1)];
T9 = ri[WS(is, 5)];
Tq = T8 - T9;
Tr = ii[WS(is, 1)];
Ts = ii[WS(is, 5)];
Tt = Tr - Ts;
Ta = T8 + T9;
TM = Tr + Ts;
Tu = Tq + Tt;
TG = Tt - Tq;
}
}
/* Outputs 0 and 4: additions and subtractions of the pair sums only. */
{
E T7, Te, TP, TQ;
T7 = T3 + T6;
Te = Ta + Td;
ro[WS(os, 4)] = T7 - Te;
ro[0] = T7 + Te;
TP = Ti + Tl;
TQ = TM + TN;
io[WS(os, 4)] = TP - TQ;
io[0] = TP + TQ;
}
/* Outputs 2 and 6: also multiplication-free. */
{
E Tf, Tm, TL, TO;
Tf = Td - Ta;
Tm = Ti - Tl;
io[WS(os, 2)] = Tf + Tm;
io[WS(os, 6)] = Tm - Tf;
TL = T3 - T6;
TO = TM - TN;
ro[WS(os, 6)] = TL - TO;
ro[WS(os, 2)] = TL + TO;
}
/* Outputs 1 and 5: the only constant, KP707106781, enters through FMA/FNMS. */
{
E Tp, TA, TJ, TK;
Tp = Tn + To;
TA = Tu + Tz;
ro[WS(os, 5)] = FNMS(KP707106781, TA, Tp);
ro[WS(os, 1)] = FMA(KP707106781, TA, Tp);
TJ = TC - TB;
TK = TG + TH;
io[WS(os, 5)] = FNMS(KP707106781, TK, TJ);
io[WS(os, 1)] = FMA(KP707106781, TK, TJ);
}
/* Outputs 3 and 7. */
{
E TD, TE, TF, TI;
TD = TB + TC;
TE = Tz - Tu;
io[WS(os, 7)] = FNMS(KP707106781, TE, TD);
io[WS(os, 3)] = FMA(KP707106781, TE, TD);
TF = Tn - To;
TI = TG - TH;
ro[WS(os, 7)] = FNMS(KP707106781, TI, TF);
ro[WS(os, 3)] = FMA(KP707106781, TI, TF);
}
}
}
}
/*
 * Descriptor handed to the planner together with the n1_8 kernel: size 8,
 * name "n1_8", then a four-entry count group whose first three values match
 * the "44 additions, 0 multiplications, 8 fused multiply/add" figures in the
 * header comment of this variant, followed by the GENUS reference.
 */
static const kdft_desc desc = {
     8,
     "n1_8",
     { 44, 0, 8, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_8 kernel and its descriptor with planner `p`. */
void X(codelet_n1_8) (planner *p)
{
     X(kdft_register) (p, n1_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
/*
* This function contains 52 FP additions, 4 FP multiplications,
* (or, 52 additions, 4 multiplications, 0 fused multiply/add),
* 28 stack variables, 1 constant, and 32 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_8, non-FMA variant (generated with gen_notw -n 8, see the comment above).
 *
 * Each loop iteration reads eight values from ri and eight from ii at offsets
 * WS(is, 0..7) and writes eight values to ro and eight to io at offsets
 * WS(os, 0..7).  The loop runs v times; after every iteration ri/ii advance
 * by ivs and ro/io by ovs.
 *
 * NOTE(review): ri/ro presumably carry real parts and ii/io imaginary parts
 * of complex data -- confirm against the callers.
 */
static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* One batch element per iteration; all loads below precede the first store. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
E TG;
{
E T1, T2, Tj, Tk;
/* Pair (0, 4) of ri: sum T3, difference Tn. */
T1 = ri[0];
T2 = ri[WS(is, 4)];
T3 = T1 + T2;
Tn = T1 - T2;
{
E Tg, Th, T4, T5;
/* Pair (0, 4) of ii: sum Ti, difference TC. */
Tg = ii[0];
Th = ii[WS(is, 4)];
Ti = Tg + Th;
TC = Tg - Th;
/* Pair (2, 6) of ri: sum T6, difference TB. */
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 6)];
T6 = T4 + T5;
TB = T4 - T5;
}
/* Pair (2, 6) of ii: sum Tl, difference To. */
Tj = ii[WS(is, 2)];
Tk = ii[WS(is, 6)];
Tl = Tj + Tk;
To = Tj - Tk;
/* Pair (7, 3): sums Td/TN, and the cross combinations Tz/TH of the ri and ii differences. */
{
E Tb, Tc, Tv, Tw, Tx, Ty;
Tb = ri[WS(is, 7)];
Tc = ri[WS(is, 3)];
Tv = Tb - Tc;
Tw = ii[WS(is, 7)];
Tx = ii[WS(is, 3)];
Ty = Tw - Tx;
Td = Tb + Tc;
TN = Tw + Tx;
Tz = Tv - Ty;
TH = Tv + Ty;
}
/* Pair (1, 5): sums Ta/TM, and the cross combinations Tu/TG of the ri and ii differences. */
{
E T8, T9, Tq, Tr, Ts, Tt;
T8 = ri[WS(is, 1)];
T9 = ri[WS(is, 5)];
Tq = T8 - T9;
Tr = ii[WS(is, 1)];
Ts = ii[WS(is, 5)];
Tt = Tr - Ts;
Ta = T8 + T9;
TM = Tr + Ts;
Tu = Tq + Tt;
TG = Tt - Tq;
}
}
/* Outputs 0 and 4: additions and subtractions of the pair sums only. */
{
E T7, Te, TP, TQ;
T7 = T3 + T6;
Te = Ta + Td;
ro[WS(os, 4)] = T7 - Te;
ro[0] = T7 + Te;
TP = Ti + Tl;
TQ = TM + TN;
io[WS(os, 4)] = TP - TQ;
io[0] = TP + TQ;
}
/* Outputs 2 and 6: also multiplication-free. */
{
E Tf, Tm, TL, TO;
Tf = Td - Ta;
Tm = Ti - Tl;
io[WS(os, 2)] = Tf + Tm;
io[WS(os, 6)] = Tm - Tf;
TL = T3 - T6;
TO = TM - TN;
ro[WS(os, 6)] = TL - TO;
ro[WS(os, 2)] = TL + TO;
}
/* Outputs 1 and 5: two of the four multiplications by KP707106781. */
{
E Tp, TA, TJ, TK;
Tp = Tn + To;
TA = KP707106781 * (Tu + Tz);
ro[WS(os, 5)] = Tp - TA;
ro[WS(os, 1)] = Tp + TA;
TJ = TC - TB;
TK = KP707106781 * (TG + TH);
io[WS(os, 5)] = TJ - TK;
io[WS(os, 1)] = TJ + TK;
}
/* Outputs 3 and 7: the remaining two multiplications. */
{
E TD, TE, TF, TI;
TD = TB + TC;
TE = KP707106781 * (Tz - Tu);
io[WS(os, 7)] = TD - TE;
io[WS(os, 3)] = TD + TE;
TF = Tn - To;
TI = KP707106781 * (TG - TH);
ro[WS(os, 7)] = TF - TI;
ro[WS(os, 3)] = TF + TI;
}
}
}
}
/*
 * Descriptor handed to the planner together with the n1_8 kernel: size 8,
 * name "n1_8", then a four-entry count group whose first three values match
 * the "52 additions, 4 multiplications, 0 fused multiply/add" figures in the
 * header comment of this variant, followed by the GENUS reference.
 */
static const kdft_desc desc = {
     8,
     "n1_8",
     { 52, 4, 0, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_8 kernel and its descriptor with planner `p`. */
void X(codelet_n1_8) (planner *p)
{
     X(kdft_register) (p, n1_8, &desc);
}
#endif
+360
View File
@@ -0,0 +1,360 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
/*
* This function contains 80 FP additions, 56 FP multiplications,
* (or, 24 additions, 0 multiplications, 56 fused multiply/add),
* 41 stack variables, 10 constants, and 36 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_9, FMA-preferring variant (generated with gen_notw -fma -n 9, see the
 * comment above).
 *
 * Each loop iteration reads nine values from ri and nine from ii at offsets
 * WS(is, 0..8) and writes nine values to ro and nine to io at offsets
 * WS(os, 0..8).  The loop runs v times; after every iteration ri/ii advance
 * by ivs and ro/io by ovs.
 *
 * The inputs are consumed in three index groups, (0,3,6), (1,4,7) and
 * (2,5,8); the outputs are produced in the same three groups.
 *
 * NOTE(review): ri/ro presumably carry real parts and ii/io imaginary parts;
 * FMA(a, b, c) and FNMS(a, b, c) presumably evaluate a*b + c and c - a*b.
 * Both come from headers not visible here -- confirm.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One batch element per iteration; all loads below precede the first store. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
E T5, TL, Tm, Tl, T1f, TM, Ta, T1c, TF, TW, TI, TX, Tf, T1d, Ts;
E TZ, Tx, T10;
/* Group (0, 3, 6) of ri: full sum T5, half-weighted combination TL, difference Tm. */
{
E T1, T2, T3, T4;
T1 = ri[0];
T2 = ri[WS(is, 3)];
T3 = ri[WS(is, 6)];
T4 = T2 + T3;
T5 = T1 + T4;
TL = FNMS(KP500000000, T4, T1);
Tm = T3 - T2;
}
/* Group (0, 3, 6) of ii: full sum T1f, half-weighted combination Tl, difference TM. */
{
E Th, Ti, Tj, Tk;
Th = ii[0];
Ti = ii[WS(is, 3)];
Tj = ii[WS(is, 6)];
Tk = Ti + Tj;
Tl = FNMS(KP500000000, Tk, Th);
T1f = Th + Tk;
TM = Ti - Tj;
}
/* Group (1, 4, 7): full sums Ta/T1c and the four combined terms TF, TW, TI, TX. */
{
E T6, Tz, T9, TE, TC, TH, TD, TG;
T6 = ri[WS(is, 1)];
Tz = ii[WS(is, 1)];
{
E T7, T8, TA, TB;
T7 = ri[WS(is, 4)];
T8 = ri[WS(is, 7)];
T9 = T7 + T8;
TE = T7 - T8;
TA = ii[WS(is, 4)];
TB = ii[WS(is, 7)];
TC = TA + TB;
TH = TB - TA;
}
Ta = T6 + T9;
T1c = Tz + TC;
TD = FNMS(KP500000000, TC, Tz);
TF = FNMS(KP866025403, TE, TD);
TW = FMA(KP866025403, TE, TD);
TG = FNMS(KP500000000, T9, T6);
TI = FNMS(KP866025403, TH, TG);
TX = FMA(KP866025403, TH, TG);
}
/* Group (2, 5, 8): full sums Tf/T1d and the four combined terms Ts, TZ, Tx, T10. */
{
E Tb, Tt, Te, Tw, Tr, Tu, To, Tv;
Tb = ri[WS(is, 2)];
Tt = ii[WS(is, 2)];
{
E Tc, Td, Tp, Tq;
Tc = ri[WS(is, 5)];
Td = ri[WS(is, 8)];
Te = Tc + Td;
Tw = Td - Tc;
Tp = ii[WS(is, 5)];
Tq = ii[WS(is, 8)];
Tr = Tp - Tq;
Tu = Tp + Tq;
}
Tf = Tb + Te;
T1d = Tt + Tu;
To = FNMS(KP500000000, Te, Tb);
Ts = FMA(KP866025403, Tr, To);
TZ = FNMS(KP866025403, Tr, To);
Tv = FNMS(KP500000000, Tu, Tt);
Tx = FMA(KP866025403, Tw, Tv);
T10 = FNMS(KP866025403, Tw, Tv);
}
/* Outputs 0, 3 and 6: built from the three full group sums. */
{
E T1e, Tg, T1b, T1i, T1g, T1h;
T1e = T1c - T1d;
Tg = Ta + Tf;
T1b = FNMS(KP500000000, Tg, T5);
ro[0] = T5 + Tg;
ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b);
ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b);
T1i = Tf - Ta;
T1g = T1c + T1d;
T1h = FNMS(KP500000000, T1g, T1f);
io[WS(os, 3)] = FMA(KP866025403, T1i, T1h);
io[0] = T1f + T1g;
io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h);
}
/* Outputs 1, 4 and 7: built from Tn/TN and the terms TF, TI, Ts, Tx. */
{
E Tn, TN, TK, TS, TQ, TU, TR, TT;
Tn = FMA(KP866025403, Tm, Tl);
TN = FMA(KP866025403, TM, TL);
{
E Ty, TJ, TO, TP;
Ty = FNMS(KP176326980, Tx, Ts);
TJ = FNMS(KP839099631, TI, TF);
TK = FNMS(KP777861913, TJ, Ty);
TS = FMA(KP777861913, TJ, Ty);
TO = FMA(KP176326980, Ts, Tx);
TP = FMA(KP839099631, TF, TI);
TQ = FMA(KP777861913, TP, TO);
TU = FNMS(KP777861913, TP, TO);
}
io[WS(os, 1)] = FNMS(KP984807753, TK, Tn);
ro[WS(os, 1)] = FMA(KP984807753, TQ, TN);
TR = FNMS(KP492403876, TQ, TN);
ro[WS(os, 4)] = FMA(KP852868531, TS, TR);
ro[WS(os, 7)] = FNMS(KP852868531, TS, TR);
TT = FMA(KP492403876, TK, Tn);
io[WS(os, 7)] = FNMS(KP852868531, TU, TT);
io[WS(os, 4)] = FMA(KP852868531, TU, TT);
}
/* Outputs 2, 5 and 8: built from TV/T17 and the terms TW, TX, TZ, T10. */
{
E TV, T17, T12, T1a, T16, T18, T13, T19;
TV = FNMS(KP866025403, TM, TL);
T17 = FNMS(KP866025403, Tm, Tl);
{
E TY, T11, T14, T15;
TY = FMA(KP176326980, TX, TW);
T11 = FNMS(KP363970234, T10, TZ);
T12 = FNMS(KP954188894, T11, TY);
T1a = FMA(KP954188894, T11, TY);
T14 = FNMS(KP176326980, TW, TX);
T15 = FMA(KP363970234, TZ, T10);
T16 = FNMS(KP954188894, T15, T14);
T18 = FMA(KP954188894, T15, T14);
}
ro[WS(os, 2)] = FMA(KP984807753, T12, TV);
io[WS(os, 2)] = FNMS(KP984807753, T18, T17);
T13 = FNMS(KP492403876, T12, TV);
ro[WS(os, 5)] = FNMS(KP852868531, T16, T13);
ro[WS(os, 8)] = FMA(KP852868531, T16, T13);
T19 = FMA(KP492403876, T18, T17);
io[WS(os, 5)] = FNMS(KP852868531, T1a, T19);
io[WS(os, 8)] = FMA(KP852868531, T1a, T19);
}
}
}
}
/*
 * Descriptor handed to the planner together with the n1_9 kernel: size 9,
 * name "n1_9", then a four-entry count group whose first three values match
 * the "24 additions, 0 multiplications, 56 fused multiply/add" figures in the
 * header comment of this variant, followed by the GENUS reference.
 */
static const kdft_desc desc = {
     9,
     "n1_9",
     { 24, 0, 56, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_9 kernel and its descriptor with planner `p`. */
void X(codelet_n1_9) (planner *p)
{
     X(kdft_register) (p, n1_9, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
/*
* This function contains 80 FP additions, 40 FP multiplications,
* (or, 60 additions, 20 multiplications, 20 fused multiply/add),
* 39 stack variables, 8 constants, and 36 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_9, non-FMA variant (generated with gen_notw -n 9, see the comment above).
 *
 * Each loop iteration reads nine values from ri and nine from ii at offsets
 * WS(is, 0..8) and writes nine values to ro and nine to io at offsets
 * WS(os, 0..8).  The loop runs v times; after every iteration ri/ii advance
 * by ivs and ro/io by ovs.
 *
 * The inputs are consumed in three index groups, (0,3,6), (1,4,7) and
 * (2,5,8); the outputs are produced in the same three groups.
 *
 * NOTE(review): ri/ro presumably carry real parts and ii/io imaginary parts;
 * FMA(a, b, c) and FNMS(a, b, c) presumably evaluate a*b + c and c - a*b.
 * Both come from headers not visible here -- confirm.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT i;
/* One batch element per iteration; all loads below precede the first store. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB;
E T10, TG, TZ;
/* Group (0, 3, 6) of ri: full sum T5, scaled difference TO, half-weighted combination Th. */
{
E T1, T2, T3, T4;
T1 = ri[0];
T2 = ri[WS(is, 3)];
T3 = ri[WS(is, 6)];
T4 = T2 + T3;
T5 = T1 + T4;
TO = KP866025403 * (T3 - T2);
Th = FNMS(KP500000000, T4, T1);
}
/* Group (0, 3, 6) of ii: full sum T1g, scaled difference Tk, half-weighted combination TR. */
{
E TP, Ti, Tj, TQ;
TP = ii[0];
Ti = ii[WS(is, 3)];
Tj = ii[WS(is, 6)];
TQ = Ti + Tj;
Tk = KP866025403 * (Ti - Tj);
T1g = TP + TQ;
TR = FNMS(KP500000000, TQ, TP);
}
/* Group (1, 4, 7): full sums Ta/T1c and the four combined terms Tq, TW, Tv, TX. */
{
E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu;
T6 = ri[WS(is, 1)];
Ts = ii[WS(is, 1)];
{
E T7, T8, Tn, To;
T7 = ri[WS(is, 4)];
T8 = ri[WS(is, 7)];
T9 = T7 + T8;
Tr = KP866025403 * (T8 - T7);
Tn = ii[WS(is, 4)];
To = ii[WS(is, 7)];
Tp = KP866025403 * (Tn - To);
Tt = Tn + To;
}
Ta = T6 + T9;
T1c = Ts + Tt;
Tm = FNMS(KP500000000, T9, T6);
Tq = Tm + Tp;
TW = Tm - Tp;
Tu = FNMS(KP500000000, Tt, Ts);
Tv = Tr + Tu;
TX = Tu - Tr;
}
/* Group (2, 5, 8): full sums Tf/T1d and the four combined terms TB, T10, TG, TZ. */
{
E Tb, TD, Te, TC, TA, TE, Tx, TF;
Tb = ri[WS(is, 2)];
TD = ii[WS(is, 2)];
{
E Tc, Td, Ty, Tz;
Tc = ri[WS(is, 5)];
Td = ri[WS(is, 8)];
Te = Tc + Td;
TC = KP866025403 * (Td - Tc);
Ty = ii[WS(is, 5)];
Tz = ii[WS(is, 8)];
TA = KP866025403 * (Ty - Tz);
TE = Ty + Tz;
}
Tf = Tb + Te;
T1d = TD + TE;
Tx = FNMS(KP500000000, Te, Tb);
TB = Tx + TA;
T10 = Tx - TA;
TF = FNMS(KP500000000, TE, TD);
TG = TC + TF;
TZ = TF - TC;
}
/* Outputs 0, 3 and 6: built from the three full group sums. */
{
E T1e, Tg, T1b, T1f, T1h, T1i;
T1e = KP866025403 * (T1c - T1d);
Tg = Ta + Tf;
T1b = FNMS(KP500000000, Tg, T5);
ro[0] = T5 + Tg;
ro[WS(os, 3)] = T1b + T1e;
ro[WS(os, 6)] = T1b - T1e;
T1f = KP866025403 * (Tf - Ta);
T1h = T1c + T1d;
T1i = FNMS(KP500000000, T1h, T1g);
io[WS(os, 3)] = T1f + T1i;
io[0] = T1g + T1h;
io[WS(os, 6)] = T1i - T1f;
}
/* Outputs 1, 4 and 7: built from Tl/TS and the terms Tq, Tv, TB, TG. */
{
E Tl, TS, TI, TN, TM, TT, TJ, TU;
Tl = Th + Tk;
TS = TO + TR;
{
E Tw, TH, TK, TL;
Tw = FMA(KP766044443, Tq, KP642787609 * Tv);
TH = FMA(KP173648177, TB, KP984807753 * TG);
TI = Tw + TH;
TN = KP866025403 * (TH - Tw);
TK = FNMS(KP642787609, Tq, KP766044443 * Tv);
TL = FNMS(KP984807753, TB, KP173648177 * TG);
TM = KP866025403 * (TK - TL);
TT = TK + TL;
}
ro[WS(os, 1)] = Tl + TI;
io[WS(os, 1)] = TS + TT;
TJ = FNMS(KP500000000, TI, Tl);
ro[WS(os, 7)] = TJ - TM;
ro[WS(os, 4)] = TJ + TM;
TU = FNMS(KP500000000, TT, TS);
io[WS(os, 4)] = TN + TU;
io[WS(os, 7)] = TU - TN;
}
/* Outputs 2, 5 and 8: built from TV/T14 and the terms TW, TX, T10, TZ. */
{
E TV, T14, T12, T13, T17, T1a, T18, T19;
TV = Th - Tk;
T14 = TR - TO;
{
E TY, T11, T15, T16;
TY = FMA(KP173648177, TW, KP984807753 * TX);
T11 = FNMS(KP939692620, T10, KP342020143 * TZ);
T12 = TY + T11;
T13 = KP866025403 * (T11 - TY);
T15 = FNMS(KP984807753, TW, KP173648177 * TX);
T16 = FMA(KP342020143, T10, KP939692620 * TZ);
T17 = T15 - T16;
T1a = KP866025403 * (T15 + T16);
}
ro[WS(os, 2)] = TV + T12;
io[WS(os, 2)] = T14 + T17;
T18 = FNMS(KP500000000, T17, T14);
io[WS(os, 5)] = T13 + T18;
io[WS(os, 8)] = T18 - T13;
T19 = FNMS(KP500000000, T12, TV);
ro[WS(os, 8)] = T19 - T1a;
ro[WS(os, 5)] = T19 + T1a;
}
}
}
}
/*
 * Descriptor handed to the planner together with the n1_9 kernel: size 9,
 * name "n1_9", then a four-entry count group whose first three values match
 * the "60 additions, 20 multiplications, 20 fused multiply/add" figures in
 * the header comment of this variant, followed by the GENUS reference.
 */
static const kdft_desc desc = {
     9,
     "n1_9",
     { 60, 20, 20, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_9 kernel and its descriptor with planner `p`. */
void X(codelet_n1_9) (planner *p)
{
     X(kdft_register) (p, n1_9, &desc);
}
#endif
+149
View File
@@ -0,0 +1,149 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include dft/scalar/q.h */
/*
* This function contains 12 FP additions, 8 FP multiplications,
* (or, 8 additions, 4 multiplications, 4 fused multiply/add),
* 17 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_2, FMA-preferring variant (generated with gen_twidsq -fma -dif -n 2
 * -reload-twiddle, see the comment above).
 *
 * Works in place on a 2x2 tile addressed through rio/iio with the strides rs
 * and vs.  For each m in [mb, me):
 *   - the sums of the two tile rows are stored at offsets 0 and WS(rs, 1);
 *   - the differences of the two tile rows are combined with W[0]/W[1] and
 *     stored at offsets WS(vs, 1) and WS(vs, 1) + WS(rs, 1).
 * W is first advanced by mb * 2 and then by 2 per iteration; rio/iio advance
 * by ms per iteration.
 *
 * NOTE(review): rio/iio presumably hold real/imaginary parts, and
 * FMA(a, b, c) / FNMS(a, b, c) presumably evaluate a*b + c and c - a*b
 * (headers not visible here) -- confirm.
 *
 * All tile loads happen before the first store, and W[0]/W[1] are re-read
 * for each twiddled output pair, exactly as in the generated code.
 */
static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W += mb * 2; m < me; ++m, rio += ms, iio += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
          /* Tile row at vs offset 0. */
          E re_a0 = rio[0];
          E re_a1 = rio[WS(rs, 1)];
          E re_a_diff = re_a0 - re_a1;
          E im_a0 = iio[0];
          E im_a1 = iio[WS(rs, 1)];
          E im_a_diff = im_a0 - im_a1;
          /* Tile row at vs offset 1. */
          E re_b0 = rio[WS(vs, 1)];
          E re_b1 = rio[WS(vs, 1) + WS(rs, 1)];
          E re_b_diff = re_b0 - re_b1;
          E im_b0 = iio[WS(vs, 1)];
          E im_b1 = iio[WS(vs, 1) + WS(rs, 1)];
          E im_b_diff = im_b0 - im_b1;

          /* Row sums go to the first vs row. */
          rio[0] = re_a0 + re_a1;
          iio[0] = im_a0 + im_a1;
          rio[WS(rs, 1)] = re_b0 + re_b1;
          iio[WS(rs, 1)] = im_b0 + im_b1;

          /* Twiddled difference of the second row. */
          {
               E w_re = W[0];
               E part_re = w_re * re_b_diff;
               E part_im = w_re * im_b_diff;
               E w_im = W[1];
               rio[WS(vs, 1) + WS(rs, 1)] = FMA(w_im, im_b_diff, part_re);
               iio[WS(vs, 1) + WS(rs, 1)] = FNMS(w_im, re_b_diff, part_im);
          }

          /* Twiddled difference of the first row. */
          {
               E w_re = W[0];
               E part_re = w_re * re_a_diff;
               E part_im = w_re * im_a_diff;
               E w_im = W[1];
               rio[WS(vs, 1)] = FMA(w_im, im_a_diff, part_re);
               iio[WS(vs, 1)] = FNMS(w_im, re_a_diff, part_im);
          }
     }
}
/* Twiddle instruction table referenced by the q1_2 descriptor below. */
static const tw_instr twinstr[] = {
     { TW_FULL, 0, 2 }, { TW_NEXT, 1, 0 }
};

/*
 * Descriptor handed to the planner together with the q1_2 kernel: size 2,
 * name "q1_2", the twiddle table, the GENUS reference, and a four-entry count
 * group whose first three values match the "8 additions, 4 multiplications,
 * 4 fused multiply/add" figures in the header comment of this variant.
 */
static const ct_desc desc = {
     2,
     "q1_2",
     twinstr,
     &GENUS,
     { 8, 4, 4, 0 },
     0, 0, 0
};

/* Registers the q1_2 kernel and its descriptor with planner `p`. */
void X(codelet_q1_2) (planner *p)
{
     X(kdft_difsq_register) (p, q1_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include dft/scalar/q.h */
/*
* This function contains 12 FP additions, 8 FP multiplications,
* (or, 8 additions, 4 multiplications, 4 fused multiply/add),
* 17 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_2, non-FMA variant (generated with gen_twidsq -dif -n 2 -reload-twiddle,
 * see the comment above).
 *
 * Works in place on a 2x2 tile addressed through rio/iio with the strides rs
 * and vs.  For each m in [mb, me):
 *   - the sums of the two tile rows are stored at offsets 0 and WS(rs, 1);
 *   - the differences of the two tile rows are combined with W[0]/W[1] and
 *     stored at offsets WS(vs, 1) and WS(vs, 1) + WS(rs, 1).
 * W is first advanced by mb * 2 and then by 2 per iteration; rio/iio advance
 * by ms per iteration.
 *
 * NOTE(review): rio/iio presumably hold real/imaginary parts, and
 * FMA(a, b, c) / FNMS(a, b, c) presumably evaluate a*b + c and c - a*b
 * (headers not visible here) -- confirm.
 *
 * All tile loads happen before the first store, and W[0]/W[1] are re-read
 * for each twiddled output pair, exactly as in the generated code.
 */
static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W += mb * 2; m < me; ++m, rio += ms, iio += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
          /* Tile row at vs offset 0. */
          E re_a0 = rio[0];
          E re_a1 = rio[WS(rs, 1)];
          E re_a_diff = re_a0 - re_a1;
          E im_a0 = iio[0];
          E im_a1 = iio[WS(rs, 1)];
          E im_a_diff = im_a0 - im_a1;
          /* Tile row at vs offset 1. */
          E re_b0 = rio[WS(vs, 1)];
          E re_b1 = rio[WS(vs, 1) + WS(rs, 1)];
          E re_b_diff = re_b0 - re_b1;
          E im_b0 = iio[WS(vs, 1)];
          E im_b1 = iio[WS(vs, 1) + WS(rs, 1)];
          E im_b_diff = im_b0 - im_b1;

          /* Row sums go to the first vs row. */
          rio[0] = re_a0 + re_a1;
          iio[0] = im_a0 + im_a1;
          rio[WS(rs, 1)] = re_b0 + re_b1;
          iio[WS(rs, 1)] = im_b0 + im_b1;

          /* Twiddled difference of the second row. */
          {
               E w_re = W[0];
               E w_im = W[1];
               rio[WS(vs, 1) + WS(rs, 1)] = FMA(w_re, re_b_diff, w_im * im_b_diff);
               iio[WS(vs, 1) + WS(rs, 1)] = FNMS(w_im, re_b_diff, w_re * im_b_diff);
          }

          /* Twiddled difference of the first row (twiddles re-read after the stores above). */
          {
               E w_re = W[0];
               E w_im = W[1];
               rio[WS(vs, 1)] = FMA(w_re, re_a_diff, w_im * im_a_diff);
               iio[WS(vs, 1)] = FNMS(w_im, re_a_diff, w_re * im_a_diff);
          }
     }
}
/* Twiddle instruction table referenced by the q1_2 descriptor below. */
static const tw_instr twinstr[] = {
     { TW_FULL, 0, 2 }, { TW_NEXT, 1, 0 }
};

/*
 * Descriptor handed to the planner together with the q1_2 kernel: size 2,
 * name "q1_2", the twiddle table, the GENUS reference, and a four-entry count
 * group whose first three values match the "8 additions, 4 multiplications,
 * 4 fused multiply/add" figures in the header comment of this variant.
 */
static const ct_desc desc = {
     2,
     "q1_2",
     twinstr,
     &GENUS,
     { 8, 4, 4, 0 },
     0, 0, 0
};

/* Registers the q1_2 kernel and its descriptor with planner `p`. */
void X(codelet_q1_2) (planner *p)
{
     X(kdft_difsq_register) (p, q1_2, &desc);
}
#endif
+316
View File
@@ -0,0 +1,316 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
/*
* This function contains 48 FP additions, 42 FP multiplications,
* (or, 18 additions, 12 multiplications, 30 fused multiply/add),
* 35 stack variables, 2 constants, and 36 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_3 (FMA variant): generated by gen_twidsq with -dif -n 3.
 *
 * Each loop iteration works in place on a 3x3 grid of complex values whose
 * real parts are rio[WS(vs, v) + WS(rs, r)] and whose imaginary parts are
 * iio[WS(vs, v) + WS(rs, r)], for v, r in 0..2.  For every v a 3-point
 * butterfly is taken over r.  Output k of that butterfly is stored at
 * [WS(vs, k) + WS(rs, v)] (the roles of v and r are swapped on output), and
 * outputs k = 1, 2 are first combined with the twiddle pair
 * (W[2k-2], W[2k-1]).
 *
 * rio, iio: real / imaginary data, read and then overwritten.
 * W:        twiddle table; advanced by 4 entries per iteration.
 * rs, vs:   strides, used only through WS().
 * mb, me:   half-open iteration range [mb, me).
 * ms:       step added to rio and iio after each iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* W is first offset by mb * 4 so that iteration m sees its own four twiddle entries. */
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T1, T4, T6, Tg, Td, Te, T9, Tf, Tp, Ts, Tu, TE, TB, TC, Tx;
E TD, TZ, T10, TV, T11, TN, TQ, TS, T12;
/* Load all nine inputs before any store, and form the per-v partial terms. */
{
E T2, T3, Tv, Tw;
/* v = 0, real parts. */
T1 = rio[0];
T2 = rio[WS(rs, 1)];
T3 = rio[WS(rs, 2)];
T4 = T2 + T3;
T6 = FNMS(KP500000000, T4, T1);
Tg = T3 - T2;
{
E T7, T8, Tq, Tr;
/* v = 0, imaginary parts. */
Td = iio[0];
T7 = iio[WS(rs, 1)];
T8 = iio[WS(rs, 2)];
Te = T7 + T8;
T9 = T7 - T8;
Tf = FNMS(KP500000000, Te, Td);
/* v = 1, real parts. */
Tp = rio[WS(vs, 1)];
Tq = rio[WS(vs, 1) + WS(rs, 1)];
Tr = rio[WS(vs, 1) + WS(rs, 2)];
Ts = Tq + Tr;
Tu = FNMS(KP500000000, Ts, Tp);
TE = Tr - Tq;
}
/* v = 1, imaginary parts. */
TB = iio[WS(vs, 1)];
Tv = iio[WS(vs, 1) + WS(rs, 1)];
Tw = iio[WS(vs, 1) + WS(rs, 2)];
TC = Tv + Tw;
Tx = Tv - Tw;
TD = FNMS(KP500000000, TC, TB);
{
E TT, TU, TO, TP;
/* v = 2, imaginary parts followed by real parts. */
TZ = iio[WS(vs, 2)];
TT = iio[WS(vs, 2) + WS(rs, 1)];
TU = iio[WS(vs, 2) + WS(rs, 2)];
T10 = TT + TU;
TV = TT - TU;
T11 = FNMS(KP500000000, T10, TZ);
TN = rio[WS(vs, 2)];
TO = rio[WS(vs, 2) + WS(rs, 1)];
TP = rio[WS(vs, 2) + WS(rs, 2)];
TQ = TO + TP;
TS = FNMS(KP500000000, TQ, TN);
T12 = TP - TO;
}
}
/* Output 0 of each v: the plain sum of its three inputs, stored without a twiddle at WS(rs, v). */
rio[0] = T1 + T4;
iio[0] = Td + Te;
rio[WS(rs, 1)] = Tp + Ts;
iio[WS(rs, 1)] = TB + TC;
iio[WS(rs, 2)] = TZ + T10;
rio[WS(rs, 2)] = TN + TQ;
/* v = 0, output 1: combined with (W[0], W[1]). */
{
E Ta, Th, Tb, Ti, T5, Tc;
Ta = FMA(KP866025403, T9, T6);
Th = FMA(KP866025403, Tg, Tf);
T5 = W[0];
Tb = T5 * Ta;
Ti = T5 * Th;
Tc = W[1];
rio[WS(vs, 1)] = FMA(Tc, Th, Tb);
iio[WS(vs, 1)] = FNMS(Tc, Ta, Ti);
}
/* v = 2, output 2: combined with (W[2], W[3]). */
{
E T16, T19, T17, T1a, T15, T18;
T16 = FNMS(KP866025403, TV, TS);
T19 = FNMS(KP866025403, T12, T11);
T15 = W[2];
T17 = T15 * T16;
T1a = T15 * T19;
T18 = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T18, T19, T17);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T18, T16, T1a);
}
/* v = 1, output 2: combined with (W[2], W[3]). */
{
E TI, TL, TJ, TM, TH, TK;
TI = FNMS(KP866025403, Tx, Tu);
TL = FNMS(KP866025403, TE, TD);
TH = W[2];
TJ = TH * TI;
TM = TH * TL;
TK = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(TK, TL, TJ);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TK, TI, TM);
}
/* v = 1, output 1: combined with (W[0], W[1]). */
{
E Ty, TF, Tz, TG, Tt, TA;
Ty = FMA(KP866025403, Tx, Tu);
TF = FMA(KP866025403, TE, TD);
Tt = W[0];
Tz = Tt * Ty;
TG = Tt * TF;
TA = W[1];
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TA, TF, Tz);
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TA, Ty, TG);
}
/* v = 2, output 1: combined with (W[0], W[1]). */
{
E TW, T13, TX, T14, TR, TY;
TW = FMA(KP866025403, TV, TS);
T13 = FMA(KP866025403, T12, T11);
TR = W[0];
TX = TR * TW;
T14 = TR * T13;
TY = W[1];
rio[WS(vs, 1) + WS(rs, 2)] = FMA(TY, T13, TX);
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TY, TW, T14);
}
/* v = 0, output 2: combined with (W[2], W[3]). */
{
E Tk, Tn, Tl, To, Tj, Tm;
Tk = FNMS(KP866025403, T9, T6);
Tn = FNMS(KP866025403, Tg, Tf);
Tj = W[2];
Tl = Tj * Tk;
To = Tj * Tn;
Tm = W[3];
rio[WS(vs, 2)] = FMA(Tm, Tn, Tl);
iio[WS(vs, 2)] = FNMS(Tm, Tk, To);
}
}
}
}
/*
 * Registration data for the FMA variant of q1_3.  The first three numbers in
 * the braced group match the "18 additions, 12 multiplications, 30 fused
 * multiply/add" count stated in this variant's generator comment.
 * NOTE(review): the meaning of the remaining tw_instr / ct_desc fields is
 * declared outside this file -- confirm against those declarations.
 */
static const tw_instr twinstr[] = {
     { TW_FULL, 0, 3 },
     { TW_NEXT, 1, 0 }
};

static const ct_desc desc = {
     3,
     "q1_3",
     twinstr,
     &GENUS,
     { 18, 12, 30, 0 },
     0,
     0,
     0
};

/* Passes q1_3 and its descriptor to X(kdft_difsq_register) together with p. */
void X(codelet_q1_3) (planner *p)
{
     X(kdft_difsq_register) (p, q1_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
/*
* This function contains 48 FP additions, 36 FP multiplications,
* (or, 30 additions, 18 multiplications, 18 fused multiply/add),
* 35 stack variables, 2 constants, and 36 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_3 (non-FMA variant): generated by gen_twidsq with -dif -n 3.
 *
 * Each loop iteration works in place on a 3x3 grid of complex values whose
 * real parts are rio[WS(vs, v) + WS(rs, r)] and whose imaginary parts are
 * iio[WS(vs, v) + WS(rs, r)], for v, r in 0..2.  For every v a 3-point
 * butterfly is taken over r.  Output k of that butterfly is stored at
 * [WS(vs, k) + WS(rs, v)] (the roles of v and r are swapped on output), and
 * outputs k = 1, 2 are first combined with the twiddle pair
 * (W[2k-2], W[2k-1]).
 *
 * rio, iio: real / imaginary data, read and then overwritten.
 * W:        twiddle table; advanced by 4 entries per iteration.
 * rs, vs:   strides, used only through WS().
 * mb, me:   half-open iteration range [mb, me).
 * ms:       step added to rio and iio after each iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* W is first offset by mb * 4 so that iteration m sees its own four twiddle entries. */
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T1, T4, T6, Tc, Td, Te, T9, Tf, Tl, To, Tq, Tw, Tx, Ty, Tt;
E Tz, TR, TS, TN, TT, TF, TI, TK, TQ;
/* Load all nine inputs before any store; the difference terms are pre-scaled by KP866025403 here. */
{
E T2, T3, Tr, Ts;
/* v = 0, real parts. */
T1 = rio[0];
T2 = rio[WS(rs, 1)];
T3 = rio[WS(rs, 2)];
T4 = T2 + T3;
T6 = FNMS(KP500000000, T4, T1);
Tc = KP866025403 * (T3 - T2);
{
E T7, T8, Tm, Tn;
/* v = 0, imaginary parts. */
Td = iio[0];
T7 = iio[WS(rs, 1)];
T8 = iio[WS(rs, 2)];
Te = T7 + T8;
T9 = KP866025403 * (T7 - T8);
Tf = FNMS(KP500000000, Te, Td);
/* v = 1, real parts. */
Tl = rio[WS(vs, 1)];
Tm = rio[WS(vs, 1) + WS(rs, 1)];
Tn = rio[WS(vs, 1) + WS(rs, 2)];
To = Tm + Tn;
Tq = FNMS(KP500000000, To, Tl);
Tw = KP866025403 * (Tn - Tm);
}
/* v = 1, imaginary parts. */
Tx = iio[WS(vs, 1)];
Tr = iio[WS(vs, 1) + WS(rs, 1)];
Ts = iio[WS(vs, 1) + WS(rs, 2)];
Ty = Tr + Ts;
Tt = KP866025403 * (Tr - Ts);
Tz = FNMS(KP500000000, Ty, Tx);
{
E TL, TM, TG, TH;
/* v = 2, imaginary parts followed by real parts. */
TR = iio[WS(vs, 2)];
TL = iio[WS(vs, 2) + WS(rs, 1)];
TM = iio[WS(vs, 2) + WS(rs, 2)];
TS = TL + TM;
TN = KP866025403 * (TL - TM);
TT = FNMS(KP500000000, TS, TR);
TF = rio[WS(vs, 2)];
TG = rio[WS(vs, 2) + WS(rs, 1)];
TH = rio[WS(vs, 2) + WS(rs, 2)];
TI = TG + TH;
TK = FNMS(KP500000000, TI, TF);
TQ = KP866025403 * (TH - TG);
}
}
/* Output 0 of each v: the plain sum of its three inputs, stored without a twiddle at WS(rs, v). */
rio[0] = T1 + T4;
iio[0] = Td + Te;
rio[WS(rs, 1)] = Tl + To;
iio[WS(rs, 1)] = Tx + Ty;
iio[WS(rs, 2)] = TR + TS;
rio[WS(rs, 2)] = TF + TI;
/* v = 0, output 1: combined with (W[0], W[1]). */
{
E Ta, Tg, T5, Tb;
Ta = T6 + T9;
Tg = Tc + Tf;
T5 = W[0];
Tb = W[1];
rio[WS(vs, 1)] = FMA(T5, Ta, Tb * Tg);
iio[WS(vs, 1)] = FNMS(Tb, Ta, T5 * Tg);
}
/* v = 2, output 2: combined with (W[2], W[3]). */
{
E TW, TY, TV, TX;
TW = TK - TN;
TY = TT - TQ;
TV = W[2];
TX = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(TV, TW, TX * TY);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(TX, TW, TV * TY);
}
/* v = 1, output 2: combined with (W[2], W[3]). */
{
E TC, TE, TB, TD;
TC = Tq - Tt;
TE = Tz - Tw;
TB = W[2];
TD = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(TB, TC, TD * TE);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TD, TC, TB * TE);
}
/* v = 1, output 1: combined with (W[0], W[1]). */
{
E Tu, TA, Tp, Tv;
Tu = Tq + Tt;
TA = Tw + Tz;
Tp = W[0];
Tv = W[1];
rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tp, Tu, Tv * TA);
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tv, Tu, Tp * TA);
}
/* v = 2, output 1: combined with (W[0], W[1]). */
{
E TO, TU, TJ, TP;
TO = TK + TN;
TU = TQ + TT;
TJ = W[0];
TP = W[1];
rio[WS(vs, 1) + WS(rs, 2)] = FMA(TJ, TO, TP * TU);
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TP, TO, TJ * TU);
}
/* v = 0, output 2: combined with (W[2], W[3]). */
{
E Ti, Tk, Th, Tj;
Ti = T6 - T9;
Tk = Tf - Tc;
Th = W[2];
Tj = W[3];
rio[WS(vs, 2)] = FMA(Th, Ti, Tj * Tk);
iio[WS(vs, 2)] = FNMS(Tj, Ti, Th * Tk);
}
}
}
}
/*
 * Registration data for the non-FMA variant of q1_3.  The first three numbers
 * in the braced group match the "30 additions, 18 multiplications, 18 fused
 * multiply/add" count stated in this variant's generator comment.
 * NOTE(review): the meaning of the remaining tw_instr / ct_desc fields is
 * declared outside this file -- confirm against those declarations.
 */
static const tw_instr twinstr[] = {
     { TW_FULL, 0, 3 },
     { TW_NEXT, 1, 0 }
};

static const ct_desc desc = {
     3,
     "q1_3",
     twinstr,
     &GENUS,
     { 30, 18, 18, 0 },
     0,
     0,
     0
};

/* Passes q1_3 and its descriptor to X(kdft_difsq_register) together with p. */
void X(codelet_q1_3) (planner *p)
{
     X(kdft_difsq_register) (p, q1_3, &desc);
}
#endif
+524
View File
@@ -0,0 +1,524 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
/*
* This function contains 88 FP additions, 48 FP multiplications,
* (or, 64 additions, 24 multiplications, 24 fused multiply/add),
* 51 stack variables, 0 constants, and 64 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_4 (FMA variant): generated by gen_twidsq with -dif -n 4.
 *
 * Each loop iteration works in place on a 4x4 grid of complex values whose
 * real parts are rio[WS(vs, v) + WS(rs, r)] and whose imaginary parts are
 * iio[WS(vs, v) + WS(rs, r)], for v, r in 0..3.  For every v a 4-point
 * butterfly (sums and differences only, no constants) is taken over r.
 * Output k of that butterfly is stored at [WS(vs, k) + WS(rs, v)] (the roles
 * of v and r are swapped on output), and outputs k = 1..3 are first combined
 * with the twiddle pair (W[2k-2], W[2k-1]).
 *
 * rio, iio: real / imaginary data, read and then overwritten.
 * W:        twiddle table; advanced by 6 entries per iteration.
 * rs, vs:   strides, used only through WS().
 * mb, me:   half-open iteration range [mb, me).
 * ms:       step added to rio and iio after each iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
{
INT m;
/* W is first offset by mb * 6 so that iteration m sees its own six twiddle entries. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T3, Tv, Tw, T6, Tc, Tf, Tx, Ts, Tm, Ti, T1H, T29, T2a, T1K, T1Q;
E T1T, T2b, T26, T20, T1W, TB, T13, T14, TE, TK, TN, T15, T10, TU, TQ;
E T19, T1B, T1C, T1c, T1i, T1l, T1D, T1y, T1s, T1o;
/* All sixteen inputs are loaded (v = 0, 3, 1, 2 in that order) before any store. */
/* v = 0: loads and partial sums / differences. */
{
E T1, T2, Tb, Tg, Th, T8;
{
E T9, Ta, T4, T5;
T1 = rio[0];
T2 = rio[WS(rs, 2)];
T3 = T1 + T2;
T9 = iio[0];
Ta = iio[WS(rs, 2)];
Tb = T9 - Ta;
Tv = T9 + Ta;
Tg = iio[WS(rs, 1)];
Th = iio[WS(rs, 3)];
Tw = Tg + Th;
T4 = rio[WS(rs, 1)];
T5 = rio[WS(rs, 3)];
T6 = T4 + T5;
T8 = T4 - T5;
}
Tc = T8 + Tb;
Tf = T1 - T2;
Tx = Tv - Tw;
Ts = T3 - T6;
Tm = Tb - T8;
Ti = Tg - Th;
}
/* v = 3: loads and partial sums / differences. */
{
E T1F, T1G, T1P, T1U, T1V, T1M;
{
E T1N, T1O, T1I, T1J;
T1F = rio[WS(vs, 3)];
T1G = rio[WS(vs, 3) + WS(rs, 2)];
T1H = T1F + T1G;
T1N = iio[WS(vs, 3)];
T1O = iio[WS(vs, 3) + WS(rs, 2)];
T1P = T1N - T1O;
T29 = T1N + T1O;
T1U = iio[WS(vs, 3) + WS(rs, 1)];
T1V = iio[WS(vs, 3) + WS(rs, 3)];
T2a = T1U + T1V;
T1I = rio[WS(vs, 3) + WS(rs, 1)];
T1J = rio[WS(vs, 3) + WS(rs, 3)];
T1K = T1I + T1J;
T1M = T1I - T1J;
}
T1Q = T1M + T1P;
T1T = T1F - T1G;
T2b = T29 - T2a;
T26 = T1H - T1K;
T20 = T1P - T1M;
T1W = T1U - T1V;
}
/* v = 1: loads and partial sums / differences. */
{
E Tz, TA, TJ, TO, TP, TG;
{
E TH, TI, TC, TD;
Tz = rio[WS(vs, 1)];
TA = rio[WS(vs, 1) + WS(rs, 2)];
TB = Tz + TA;
TH = iio[WS(vs, 1)];
TI = iio[WS(vs, 1) + WS(rs, 2)];
TJ = TH - TI;
T13 = TH + TI;
TO = iio[WS(vs, 1) + WS(rs, 1)];
TP = iio[WS(vs, 1) + WS(rs, 3)];
T14 = TO + TP;
TC = rio[WS(vs, 1) + WS(rs, 1)];
TD = rio[WS(vs, 1) + WS(rs, 3)];
TE = TC + TD;
TG = TC - TD;
}
TK = TG + TJ;
TN = Tz - TA;
T15 = T13 - T14;
T10 = TB - TE;
TU = TJ - TG;
TQ = TO - TP;
}
/* v = 2: loads and partial sums / differences. */
{
E T17, T18, T1h, T1m, T1n, T1e;
{
E T1f, T1g, T1a, T1b;
T17 = rio[WS(vs, 2)];
T18 = rio[WS(vs, 2) + WS(rs, 2)];
T19 = T17 + T18;
T1f = iio[WS(vs, 2)];
T1g = iio[WS(vs, 2) + WS(rs, 2)];
T1h = T1f - T1g;
T1B = T1f + T1g;
T1m = iio[WS(vs, 2) + WS(rs, 1)];
T1n = iio[WS(vs, 2) + WS(rs, 3)];
T1C = T1m + T1n;
T1a = rio[WS(vs, 2) + WS(rs, 1)];
T1b = rio[WS(vs, 2) + WS(rs, 3)];
T1c = T1a + T1b;
T1e = T1a - T1b;
}
T1i = T1e + T1h;
T1l = T17 - T18;
T1D = T1B - T1C;
T1y = T19 - T1c;
T1s = T1h - T1e;
T1o = T1m - T1n;
}
/* Output 0 of each v: the plain sum of its four inputs, stored without a twiddle at WS(rs, v). */
rio[0] = T3 + T6;
iio[0] = Tv + Tw;
rio[WS(rs, 1)] = TB + TE;
iio[WS(rs, 1)] = T13 + T14;
rio[WS(rs, 2)] = T19 + T1c;
iio[WS(rs, 2)] = T1B + T1C;
iio[WS(rs, 3)] = T29 + T2a;
rio[WS(rs, 3)] = T1H + T1K;
/* v = 0, output 2: combined with (W[2], W[3]). */
{
E Tt, Ty, Tr, Tu;
Tr = W[2];
Tt = Tr * Ts;
Ty = Tr * Tx;
Tu = W[3];
rio[WS(vs, 2)] = FMA(Tu, Tx, Tt);
iio[WS(vs, 2)] = FNMS(Tu, Ts, Ty);
}
/* v = 3, output 2: combined with (W[2], W[3]). */
{
E T27, T2c, T25, T28;
T25 = W[2];
T27 = T25 * T26;
T2c = T25 * T2b;
T28 = W[3];
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T28, T2b, T27);
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T28, T26, T2c);
}
/* v = 1, output 2: combined with (W[2], W[3]). */
{
E T11, T16, TZ, T12;
TZ = W[2];
T11 = TZ * T10;
T16 = TZ * T15;
T12 = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(T12, T15, T11);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T12, T10, T16);
}
/* v = 2, output 2: combined with (W[2], W[3]). */
{
E T1z, T1E, T1x, T1A;
T1x = W[2];
T1z = T1x * T1y;
T1E = T1x * T1D;
T1A = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1A, T1D, T1z);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1A, T1y, T1E);
}
/* v = 0, output 3: combined with (W[4], W[5]). */
{
E Tj, Te, Tk, T7, Td;
Tj = Tf - Ti;
Te = W[5];
Tk = Te * Tc;
T7 = W[4];
Td = T7 * Tc;
iio[WS(vs, 3)] = FNMS(Te, Tj, Td);
rio[WS(vs, 3)] = FMA(T7, Tj, Tk);
}
/* v = 2, output 3: combined with (W[4], W[5]). */
{
E T1p, T1k, T1q, T1d, T1j;
T1p = T1l - T1o;
T1k = W[5];
T1q = T1k * T1i;
T1d = W[4];
T1j = T1d * T1i;
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T1k, T1p, T1j);
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T1d, T1p, T1q);
}
/* v = 3, output 1: combined with (W[0], W[1]). */
{
E T23, T22, T24, T1Z, T21;
T23 = T1T + T1W;
T22 = W[1];
T24 = T22 * T20;
T1Z = W[0];
T21 = T1Z * T20;
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T22, T23, T21);
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1Z, T23, T24);
}
/* v = 1, output 1: combined with (W[0], W[1]). */
{
E TX, TW, TY, TT, TV;
TX = TN + TQ;
TW = W[1];
TY = TW * TU;
TT = W[0];
TV = TT * TU;
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TW, TX, TV);
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TT, TX, TY);
}
/* v = 1, output 3: combined with (W[4], W[5]). */
{
E TR, TM, TS, TF, TL;
TR = TN - TQ;
TM = W[5];
TS = TM * TK;
TF = W[4];
TL = TF * TK;
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TM, TR, TL);
rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TR, TS);
}
/* v = 0, output 1: combined with (W[0], W[1]). */
{
E Tp, To, Tq, Tl, Tn;
Tp = Tf + Ti;
To = W[1];
Tq = To * Tm;
Tl = W[0];
Tn = Tl * Tm;
iio[WS(vs, 1)] = FNMS(To, Tp, Tn);
rio[WS(vs, 1)] = FMA(Tl, Tp, Tq);
}
/* v = 2, output 1: combined with (W[0], W[1]). */
{
E T1v, T1u, T1w, T1r, T1t;
T1v = T1l + T1o;
T1u = W[1];
T1w = T1u * T1s;
T1r = W[0];
T1t = T1r * T1s;
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1u, T1v, T1t);
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1r, T1v, T1w);
}
/* v = 3, output 3: combined with (W[4], W[5]). */
{
E T1X, T1S, T1Y, T1L, T1R;
T1X = T1T - T1W;
T1S = W[5];
T1Y = T1S * T1Q;
T1L = W[4];
T1R = T1L * T1Q;
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1S, T1X, T1R);
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1L, T1X, T1Y);
}
}
}
}
/*
 * Registration data for the FMA variant of q1_4.  The first three numbers in
 * the braced group match the "64 additions, 24 multiplications, 24 fused
 * multiply/add" count stated in this variant's generator comment.
 * NOTE(review): the meaning of the remaining tw_instr / ct_desc fields is
 * declared outside this file -- confirm against those declarations.
 */
static const tw_instr twinstr[] = {
     { TW_FULL, 0, 4 },
     { TW_NEXT, 1, 0 }
};

static const ct_desc desc = {
     4,
     "q1_4",
     twinstr,
     &GENUS,
     { 64, 24, 24, 0 },
     0,
     0,
     0
};

/* Passes q1_4 and its descriptor to X(kdft_difsq_register) together with p. */
void X(codelet_q1_4) (planner *p)
{
     X(kdft_difsq_register) (p, q1_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
/*
* This function contains 88 FP additions, 48 FP multiplications,
* (or, 64 additions, 24 multiplications, 24 fused multiply/add),
* 37 stack variables, 0 constants, and 64 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_4 (non-FMA variant): generated by gen_twidsq with -dif -n 4.
 *
 * Each loop iteration works in place on a 4x4 grid of complex values whose
 * real parts are rio[WS(vs, v) + WS(rs, r)] and whose imaginary parts are
 * iio[WS(vs, v) + WS(rs, r)], for v, r in 0..3.  For every v a 4-point
 * butterfly (sums and differences only, no constants) is taken over r.
 * Output k of that butterfly is stored at [WS(vs, k) + WS(rs, v)] (the roles
 * of v and r are swapped on output), and outputs k = 1..3 are first combined
 * with the twiddle pair (W[2k-2], W[2k-1]).
 *
 * rio, iio: real / imaginary data, read and then overwritten.
 * W:        twiddle table; advanced by 6 entries per iteration.
 * rs, vs:   strides, used only through WS().
 * mb, me:   half-open iteration range [mb, me).
 * ms:       step added to rio and iio after each iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
{
INT m;
/* W is first offset by mb * 6 so that iteration m sees its own six twiddle entries. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T3, Te, Tb, Tq, T6, T8, Th, Tr, Tv, TG, TD, TS, Ty, TA, TJ;
E TT, TX, T18, T15, T1k, T10, T12, T1b, T1l, T1p, T1A, T1x, T1M, T1s, T1u;
E T1D, T1N;
/* All sixteen inputs are loaded before any store; each group forms a sum and a difference per pair. */
/* v = 0, elements r = 0 and r = 2. */
{
E T1, T2, T9, Ta;
T1 = rio[0];
T2 = rio[WS(rs, 2)];
T3 = T1 + T2;
Te = T1 - T2;
T9 = iio[0];
Ta = iio[WS(rs, 2)];
Tb = T9 - Ta;
Tq = T9 + Ta;
}
/* v = 0, elements r = 1 and r = 3. */
{
E T4, T5, Tf, Tg;
T4 = rio[WS(rs, 1)];
T5 = rio[WS(rs, 3)];
T6 = T4 + T5;
T8 = T4 - T5;
Tf = iio[WS(rs, 1)];
Tg = iio[WS(rs, 3)];
Th = Tf - Tg;
Tr = Tf + Tg;
}
/* v = 1, elements r = 0 and r = 2. */
{
E Tt, Tu, TB, TC;
Tt = rio[WS(vs, 1)];
Tu = rio[WS(vs, 1) + WS(rs, 2)];
Tv = Tt + Tu;
TG = Tt - Tu;
TB = iio[WS(vs, 1)];
TC = iio[WS(vs, 1) + WS(rs, 2)];
TD = TB - TC;
TS = TB + TC;
}
/* v = 1, elements r = 1 and r = 3. */
{
E Tw, Tx, TH, TI;
Tw = rio[WS(vs, 1) + WS(rs, 1)];
Tx = rio[WS(vs, 1) + WS(rs, 3)];
Ty = Tw + Tx;
TA = Tw - Tx;
TH = iio[WS(vs, 1) + WS(rs, 1)];
TI = iio[WS(vs, 1) + WS(rs, 3)];
TJ = TH - TI;
TT = TH + TI;
}
/* v = 2, elements r = 0 and r = 2. */
{
E TV, TW, T13, T14;
TV = rio[WS(vs, 2)];
TW = rio[WS(vs, 2) + WS(rs, 2)];
TX = TV + TW;
T18 = TV - TW;
T13 = iio[WS(vs, 2)];
T14 = iio[WS(vs, 2) + WS(rs, 2)];
T15 = T13 - T14;
T1k = T13 + T14;
}
/* v = 2, elements r = 1 and r = 3. */
{
E TY, TZ, T19, T1a;
TY = rio[WS(vs, 2) + WS(rs, 1)];
TZ = rio[WS(vs, 2) + WS(rs, 3)];
T10 = TY + TZ;
T12 = TY - TZ;
T19 = iio[WS(vs, 2) + WS(rs, 1)];
T1a = iio[WS(vs, 2) + WS(rs, 3)];
T1b = T19 - T1a;
T1l = T19 + T1a;
}
/* v = 3, elements r = 0 and r = 2. */
{
E T1n, T1o, T1v, T1w;
T1n = rio[WS(vs, 3)];
T1o = rio[WS(vs, 3) + WS(rs, 2)];
T1p = T1n + T1o;
T1A = T1n - T1o;
T1v = iio[WS(vs, 3)];
T1w = iio[WS(vs, 3) + WS(rs, 2)];
T1x = T1v - T1w;
T1M = T1v + T1w;
}
/* v = 3, elements r = 1 and r = 3. */
{
E T1q, T1r, T1B, T1C;
T1q = rio[WS(vs, 3) + WS(rs, 1)];
T1r = rio[WS(vs, 3) + WS(rs, 3)];
T1s = T1q + T1r;
T1u = T1q - T1r;
T1B = iio[WS(vs, 3) + WS(rs, 1)];
T1C = iio[WS(vs, 3) + WS(rs, 3)];
T1D = T1B - T1C;
T1N = T1B + T1C;
}
/* Output 0 of each v: the plain sum of its four inputs, stored without a twiddle at WS(rs, v). */
rio[0] = T3 + T6;
iio[0] = Tq + Tr;
rio[WS(rs, 1)] = Tv + Ty;
iio[WS(rs, 1)] = TS + TT;
rio[WS(rs, 2)] = TX + T10;
iio[WS(rs, 2)] = T1k + T1l;
iio[WS(rs, 3)] = T1M + T1N;
rio[WS(rs, 3)] = T1p + T1s;
/* v = 0, output 3: combined with (W[4], W[5]). */
{
E Tc, Ti, T7, Td;
Tc = T8 + Tb;
Ti = Te - Th;
T7 = W[4];
Td = W[5];
iio[WS(vs, 3)] = FNMS(Td, Ti, T7 * Tc);
rio[WS(vs, 3)] = FMA(Td, Tc, T7 * Ti);
}
/* v = 3, output 2: combined with (W[2], W[3]). */
{
E T1K, T1O, T1J, T1L;
T1K = T1p - T1s;
T1O = T1M - T1N;
T1J = W[2];
T1L = W[3];
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T1J, T1K, T1L * T1O);
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T1L, T1K, T1J * T1O);
}
/* v = 0, output 1: combined with (W[0], W[1]). */
{
E Tk, Tm, Tj, Tl;
Tk = Tb - T8;
Tm = Te + Th;
Tj = W[0];
Tl = W[1];
iio[WS(vs, 1)] = FNMS(Tl, Tm, Tj * Tk);
rio[WS(vs, 1)] = FMA(Tl, Tk, Tj * Tm);
}
/* v = 0, output 2: combined with (W[2], W[3]). */
{
E To, Ts, Tn, Tp;
To = T3 - T6;
Ts = Tq - Tr;
Tn = W[2];
Tp = W[3];
rio[WS(vs, 2)] = FMA(Tn, To, Tp * Ts);
iio[WS(vs, 2)] = FNMS(Tp, To, Tn * Ts);
}
/* v = 2, output 3: combined with (W[4], W[5]). */
{
E T16, T1c, T11, T17;
T16 = T12 + T15;
T1c = T18 - T1b;
T11 = W[4];
T17 = W[5];
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T17, T1c, T11 * T16);
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T17, T16, T11 * T1c);
}
/* v = 3, output 1: combined with (W[0], W[1]). */
{
E T1G, T1I, T1F, T1H;
T1G = T1x - T1u;
T1I = T1A + T1D;
T1F = W[0];
T1H = W[1];
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T1H, T1I, T1F * T1G);
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1H, T1G, T1F * T1I);
}
/* v = 1, output 2: combined with (W[2], W[3]). */
{
E TQ, TU, TP, TR;
TQ = Tv - Ty;
TU = TS - TT;
TP = W[2];
TR = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(TP, TQ, TR * TU);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TR, TQ, TP * TU);
}
/* v = 2, output 1: combined with (W[0], W[1]). */
{
E T1e, T1g, T1d, T1f;
T1e = T15 - T12;
T1g = T18 + T1b;
T1d = W[0];
T1f = W[1];
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
}
/* v = 2, output 2: combined with (W[2], W[3]). */
{
E T1i, T1m, T1h, T1j;
T1i = TX - T10;
T1m = T1k - T1l;
T1h = W[2];
T1j = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1h, T1i, T1j * T1m);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1j, T1i, T1h * T1m);
}
/* v = 3, output 3: combined with (W[4], W[5]). */
{
E T1y, T1E, T1t, T1z;
T1y = T1u + T1x;
T1E = T1A - T1D;
T1t = W[4];
T1z = W[5];
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1z, T1E, T1t * T1y);
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1z, T1y, T1t * T1E);
}
/* v = 1, output 1: combined with (W[0], W[1]). */
{
E TM, TO, TL, TN;
TM = TD - TA;
TO = TG + TJ;
TL = W[0];
TN = W[1];
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TN, TO, TL * TM);
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TN, TM, TL * TO);
}
/* v = 1, output 3: combined with (W[4], W[5]). */
{
E TE, TK, Tz, TF;
TE = TA + TD;
TK = TG - TJ;
Tz = W[4];
TF = W[5];
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TF, TK, Tz * TE);
rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TE, Tz * TK);
}
}
}
}
/*
 * Registration data for the non-FMA variant of q1_4.  The first three numbers
 * in the braced group match the "64 additions, 24 multiplications, 24 fused
 * multiply/add" count stated in this variant's generator comment.
 * NOTE(review): the meaning of the remaining tw_instr / ct_desc fields is
 * declared outside this file -- confirm against those declarations.
 */
static const tw_instr twinstr[] = {
     { TW_FULL, 0, 4 },
     { TW_NEXT, 1, 0 }
};

static const ct_desc desc = {
     4,
     "q1_4",
     twinstr,
     &GENUS,
     { 64, 24, 24, 0 },
     0,
     0,
     0
};

/* Passes q1_4 and its descriptor to X(kdft_difsq_register) together with p. */
void X(codelet_q1_4) (planner *p)
{
     X(kdft_difsq_register) (p, q1_4, &desc);
}
#endif
+992
View File
@@ -0,0 +1,992 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
/*
* This function contains 200 FP additions, 170 FP multiplications,
* (or, 70 additions, 40 multiplications, 130 fused multiply/add),
* 75 stack variables, 4 constants, and 100 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_5 (FMA variant): generated by gen_twidsq with -dif -n 5.
 *
 * Each loop iteration works in place on a 5x5 grid of complex values whose
 * real parts are rio[WS(vs, v) + WS(rs, r)] and whose imaginary parts are
 * iio[WS(vs, v) + WS(rs, r)], for v, r in 0..4.  For every v a 5-point
 * butterfly is taken over r.  Output k of that butterfly is stored at
 * [WS(vs, k) + WS(rs, v)] (the roles of v and r are swapped on output), and
 * outputs k = 1..4 are first combined with the twiddle pair
 * (W[2k-2], W[2k-1]).
 *
 * rio, iio: real / imaginary data, read and then overwritten.
 * W:        twiddle table; advanced by 8 entries per iteration.
 * rs, vs:   strides, used only through WS().
 * mb, me:   half-open iteration range [mb, me).
 * ms:       step added to rio and iio after each iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT m;
/* W is first offset by mb * 8 so that iteration m sees its own eight twiddle entries. */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T1, Tb, TM, Tw, T8, Ta, Tn, Tj, TH, Ts, Tq, Tr, TV, T15, T1G;
E T1q, T12, T14, T1h, T1d, T1B, T1m, T1k, T1l, T1P, T1Z, T2A, T2k, T1W, T1Y;
E T2b, T27, T2v, T2g, T2e, T2f, T3Z, T3V, T4j, T44, T42, T43, T3D, T3N, T4o;
E T48, T3K, T3M, T2J, T2T, T3u, T3e, T2Q, T2S, T35, T31, T3p, T3a, T38, T39;
/* All twenty-five inputs are loaded before any store; each group below handles the real or imaginary parts of one v. */
/* v = 0, real parts. */
{
E T7, Tv, T4, Tu;
T1 = rio[0];
{
E T5, T6, T2, T3;
T5 = rio[WS(rs, 2)];
T6 = rio[WS(rs, 3)];
T7 = T5 + T6;
Tv = T5 - T6;
T2 = rio[WS(rs, 1)];
T3 = rio[WS(rs, 4)];
T4 = T2 + T3;
Tu = T2 - T3;
}
Tb = T4 - T7;
TM = FNMS(KP618033988, Tu, Tv);
Tw = FMA(KP618033988, Tv, Tu);
T8 = T4 + T7;
Ta = FNMS(KP250000000, T8, T1);
}
/* v = 0, imaginary parts. */
{
E Ti, Tp, Tf, To;
Tn = iio[0];
{
E Tg, Th, Td, Te;
Tg = iio[WS(rs, 2)];
Th = iio[WS(rs, 3)];
Ti = Tg - Th;
Tp = Tg + Th;
Td = iio[WS(rs, 1)];
Te = iio[WS(rs, 4)];
Tf = Td - Te;
To = Td + Te;
}
Tj = FMA(KP618033988, Ti, Tf);
TH = FNMS(KP618033988, Tf, Ti);
Ts = To - Tp;
Tq = To + Tp;
Tr = FNMS(KP250000000, Tq, Tn);
}
/* v = 1, real parts. */
{
E T11, T1p, TY, T1o;
TV = rio[WS(vs, 1)];
{
E TZ, T10, TW, TX;
TZ = rio[WS(vs, 1) + WS(rs, 2)];
T10 = rio[WS(vs, 1) + WS(rs, 3)];
T11 = TZ + T10;
T1p = TZ - T10;
TW = rio[WS(vs, 1) + WS(rs, 1)];
TX = rio[WS(vs, 1) + WS(rs, 4)];
TY = TW + TX;
T1o = TW - TX;
}
T15 = TY - T11;
T1G = FNMS(KP618033988, T1o, T1p);
T1q = FMA(KP618033988, T1p, T1o);
T12 = TY + T11;
T14 = FNMS(KP250000000, T12, TV);
}
/* v = 1, imaginary parts. */
{
E T1c, T1j, T19, T1i;
T1h = iio[WS(vs, 1)];
{
E T1a, T1b, T17, T18;
T1a = iio[WS(vs, 1) + WS(rs, 2)];
T1b = iio[WS(vs, 1) + WS(rs, 3)];
T1c = T1a - T1b;
T1j = T1a + T1b;
T17 = iio[WS(vs, 1) + WS(rs, 1)];
T18 = iio[WS(vs, 1) + WS(rs, 4)];
T19 = T17 - T18;
T1i = T17 + T18;
}
T1d = FMA(KP618033988, T1c, T19);
T1B = FNMS(KP618033988, T19, T1c);
T1m = T1i - T1j;
T1k = T1i + T1j;
T1l = FNMS(KP250000000, T1k, T1h);
}
/* v = 2, real parts. */
{
E T1V, T2j, T1S, T2i;
T1P = rio[WS(vs, 2)];
{
E T1T, T1U, T1Q, T1R;
T1T = rio[WS(vs, 2) + WS(rs, 2)];
T1U = rio[WS(vs, 2) + WS(rs, 3)];
T1V = T1T + T1U;
T2j = T1T - T1U;
T1Q = rio[WS(vs, 2) + WS(rs, 1)];
T1R = rio[WS(vs, 2) + WS(rs, 4)];
T1S = T1Q + T1R;
T2i = T1Q - T1R;
}
T1Z = T1S - T1V;
T2A = FNMS(KP618033988, T2i, T2j);
T2k = FMA(KP618033988, T2j, T2i);
T1W = T1S + T1V;
T1Y = FNMS(KP250000000, T1W, T1P);
}
/* v = 2, imaginary parts. */
{
E T26, T2d, T23, T2c;
T2b = iio[WS(vs, 2)];
{
E T24, T25, T21, T22;
T24 = iio[WS(vs, 2) + WS(rs, 2)];
T25 = iio[WS(vs, 2) + WS(rs, 3)];
T26 = T24 - T25;
T2d = T24 + T25;
T21 = iio[WS(vs, 2) + WS(rs, 1)];
T22 = iio[WS(vs, 2) + WS(rs, 4)];
T23 = T21 - T22;
T2c = T21 + T22;
}
T27 = FMA(KP618033988, T26, T23);
T2v = FNMS(KP618033988, T23, T26);
T2g = T2c - T2d;
T2e = T2c + T2d;
T2f = FNMS(KP250000000, T2e, T2b);
}
/* v = 4, imaginary parts. */
{
E T3U, T41, T3R, T40;
T3Z = iio[WS(vs, 4)];
{
E T3S, T3T, T3P, T3Q;
T3S = iio[WS(vs, 4) + WS(rs, 2)];
T3T = iio[WS(vs, 4) + WS(rs, 3)];
T3U = T3S - T3T;
T41 = T3S + T3T;
T3P = iio[WS(vs, 4) + WS(rs, 1)];
T3Q = iio[WS(vs, 4) + WS(rs, 4)];
T3R = T3P - T3Q;
T40 = T3P + T3Q;
}
T3V = FMA(KP618033988, T3U, T3R);
T4j = FNMS(KP618033988, T3R, T3U);
T44 = T40 - T41;
T42 = T40 + T41;
T43 = FNMS(KP250000000, T42, T3Z);
}
/* v = 4, real parts. */
{
E T3J, T47, T3G, T46;
T3D = rio[WS(vs, 4)];
{
E T3H, T3I, T3E, T3F;
T3H = rio[WS(vs, 4) + WS(rs, 2)];
T3I = rio[WS(vs, 4) + WS(rs, 3)];
T3J = T3H + T3I;
T47 = T3H - T3I;
T3E = rio[WS(vs, 4) + WS(rs, 1)];
T3F = rio[WS(vs, 4) + WS(rs, 4)];
T3G = T3E + T3F;
T46 = T3E - T3F;
}
T3N = T3G - T3J;
T4o = FNMS(KP618033988, T46, T47);
T48 = FMA(KP618033988, T47, T46);
T3K = T3G + T3J;
T3M = FNMS(KP250000000, T3K, T3D);
}
/* v = 3, real parts. */
{
E T2P, T3d, T2M, T3c;
T2J = rio[WS(vs, 3)];
{
E T2N, T2O, T2K, T2L;
T2N = rio[WS(vs, 3) + WS(rs, 2)];
T2O = rio[WS(vs, 3) + WS(rs, 3)];
T2P = T2N + T2O;
T3d = T2N - T2O;
T2K = rio[WS(vs, 3) + WS(rs, 1)];
T2L = rio[WS(vs, 3) + WS(rs, 4)];
T2M = T2K + T2L;
T3c = T2K - T2L;
}
T2T = T2M - T2P;
T3u = FNMS(KP618033988, T3c, T3d);
T3e = FMA(KP618033988, T3d, T3c);
T2Q = T2M + T2P;
T2S = FNMS(KP250000000, T2Q, T2J);
}
/* v = 3, imaginary parts. */
{
E T30, T37, T2X, T36;
T35 = iio[WS(vs, 3)];
{
E T2Y, T2Z, T2V, T2W;
T2Y = iio[WS(vs, 3) + WS(rs, 2)];
T2Z = iio[WS(vs, 3) + WS(rs, 3)];
T30 = T2Y - T2Z;
T37 = T2Y + T2Z;
T2V = iio[WS(vs, 3) + WS(rs, 1)];
T2W = iio[WS(vs, 3) + WS(rs, 4)];
T2X = T2V - T2W;
T36 = T2V + T2W;
}
T31 = FMA(KP618033988, T30, T2X);
T3p = FNMS(KP618033988, T2X, T30);
T3a = T36 - T37;
T38 = T36 + T37;
T39 = FNMS(KP250000000, T38, T35);
}
/* Output 0 of each v: the plain sum of its five inputs, stored without a twiddle at WS(rs, v). */
rio[0] = T1 + T8;
iio[0] = Tn + Tq;
rio[WS(rs, 1)] = TV + T12;
iio[WS(rs, 1)] = T1h + T1k;
rio[WS(rs, 2)] = T1P + T1W;
iio[WS(rs, 2)] = T2b + T2e;
iio[WS(rs, 4)] = T3Z + T42;
rio[WS(rs, 4)] = T3D + T3K;
rio[WS(rs, 3)] = T2J + T2Q;
iio[WS(rs, 3)] = T35 + T38;
/* v = 0, outputs 1 and 4: combined with (W[0], W[1]) and (W[6], W[7]). */
{
E Tk, TA, Tx, TD, Tc, Tt;
Tc = FMA(KP559016994, Tb, Ta);
Tk = FMA(KP951056516, Tj, Tc);
TA = FNMS(KP951056516, Tj, Tc);
Tt = FMA(KP559016994, Ts, Tr);
Tx = FNMS(KP951056516, Tw, Tt);
TD = FMA(KP951056516, Tw, Tt);
{
E Tl, Ty, T9, Tm;
T9 = W[0];
Tl = T9 * Tk;
Ty = T9 * Tx;
Tm = W[1];
rio[WS(vs, 1)] = FMA(Tm, Tx, Tl);
iio[WS(vs, 1)] = FNMS(Tm, Tk, Ty);
}
{
E TB, TE, Tz, TC;
Tz = W[6];
TB = Tz * TA;
TE = Tz * TD;
TC = W[7];
rio[WS(vs, 4)] = FMA(TC, TD, TB);
iio[WS(vs, 4)] = FNMS(TC, TA, TE);
}
}
/* v = 0, outputs 2 and 3: combined with (W[2], W[3]) and (W[4], W[5]). */
{
E TI, TQ, TN, TT, TG, TL;
TG = FNMS(KP559016994, Tb, Ta);
TI = FNMS(KP951056516, TH, TG);
TQ = FMA(KP951056516, TH, TG);
TL = FNMS(KP559016994, Ts, Tr);
TN = FMA(KP951056516, TM, TL);
TT = FNMS(KP951056516, TM, TL);
{
E TJ, TO, TF, TK;
TF = W[2];
TJ = TF * TI;
TO = TF * TN;
TK = W[3];
rio[WS(vs, 2)] = FMA(TK, TN, TJ);
iio[WS(vs, 2)] = FNMS(TK, TI, TO);
}
{
E TR, TU, TP, TS;
TP = W[4];
TR = TP * TQ;
TU = TP * TT;
TS = W[5];
rio[WS(vs, 3)] = FMA(TS, TT, TR);
iio[WS(vs, 3)] = FNMS(TS, TQ, TU);
}
}
/* v = 2, outputs 2 and 3: combined with (W[2], W[3]) and (W[4], W[5]). */
{
E T2w, T2E, T2B, T2H, T2u, T2z;
T2u = FNMS(KP559016994, T1Z, T1Y);
T2w = FNMS(KP951056516, T2v, T2u);
T2E = FMA(KP951056516, T2v, T2u);
T2z = FNMS(KP559016994, T2g, T2f);
T2B = FMA(KP951056516, T2A, T2z);
T2H = FNMS(KP951056516, T2A, T2z);
{
E T2x, T2C, T2t, T2y;
T2t = W[2];
T2x = T2t * T2w;
T2C = T2t * T2B;
T2y = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2y, T2B, T2x);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2y, T2w, T2C);
}
{
E T2F, T2I, T2D, T2G;
T2D = W[4];
T2F = T2D * T2E;
T2I = T2D * T2H;
T2G = W[5];
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2G, T2H, T2F);
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2G, T2E, T2I);
}
}
/* v = 4, outputs 2 and 3: combined with (W[2], W[3]) and (W[4], W[5]). */
{
E T4k, T4s, T4p, T4v, T4i, T4n;
T4i = FNMS(KP559016994, T3N, T3M);
T4k = FNMS(KP951056516, T4j, T4i);
T4s = FMA(KP951056516, T4j, T4i);
T4n = FNMS(KP559016994, T44, T43);
T4p = FMA(KP951056516, T4o, T4n);
T4v = FNMS(KP951056516, T4o, T4n);
{
E T4l, T4q, T4h, T4m;
T4h = W[2];
T4l = T4h * T4k;
T4q = T4h * T4p;
T4m = W[3];
rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4m, T4p, T4l);
iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4m, T4k, T4q);
}
{
E T4t, T4w, T4r, T4u;
T4r = W[4];
T4t = T4r * T4s;
T4w = T4r * T4v;
T4u = W[5];
rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4u, T4v, T4t);
iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4u, T4s, T4w);
}
}
/* v = 2, outputs 1 and 4: combined with (W[0], W[1]) and (W[6], W[7]). */
{
E T28, T2o, T2l, T2r, T20, T2h;
T20 = FMA(KP559016994, T1Z, T1Y);
T28 = FMA(KP951056516, T27, T20);
T2o = FNMS(KP951056516, T27, T20);
T2h = FMA(KP559016994, T2g, T2f);
T2l = FNMS(KP951056516, T2k, T2h);
T2r = FMA(KP951056516, T2k, T2h);
{
E T29, T2m, T1X, T2a;
T1X = W[0];
T29 = T1X * T28;
T2m = T1X * T2l;
T2a = W[1];
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2a, T2l, T29);
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2a, T28, T2m);
}
{
E T2p, T2s, T2n, T2q;
T2n = W[6];
T2p = T2n * T2o;
T2s = T2n * T2r;
T2q = W[7];
rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2q, T2r, T2p);
iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2q, T2o, T2s);
}
}
/* v = 3, outputs 1 and 4: combined with (W[0], W[1]) and (W[6], W[7]). */
{
E T32, T3i, T3f, T3l, T2U, T3b;
T2U = FMA(KP559016994, T2T, T2S);
T32 = FMA(KP951056516, T31, T2U);
T3i = FNMS(KP951056516, T31, T2U);
T3b = FMA(KP559016994, T3a, T39);
T3f = FNMS(KP951056516, T3e, T3b);
T3l = FMA(KP951056516, T3e, T3b);
{
E T33, T3g, T2R, T34;
T2R = W[0];
T33 = T2R * T32;
T3g = T2R * T3f;
T34 = W[1];
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T34, T3f, T33);
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T34, T32, T3g);
}
{
E T3j, T3m, T3h, T3k;
T3h = W[6];
T3j = T3h * T3i;
T3m = T3h * T3l;
T3k = W[7];
rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3k, T3l, T3j);
iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3k, T3i, T3m);
}
}
/* v = 3, outputs 2 and 3: combined with (W[2], W[3]) and (W[4], W[5]). */
{
E T3q, T3y, T3v, T3B, T3o, T3t;
T3o = FNMS(KP559016994, T2T, T2S);
T3q = FNMS(KP951056516, T3p, T3o);
T3y = FMA(KP951056516, T3p, T3o);
T3t = FNMS(KP559016994, T3a, T39);
T3v = FMA(KP951056516, T3u, T3t);
T3B = FNMS(KP951056516, T3u, T3t);
{
E T3r, T3w, T3n, T3s;
T3n = W[2];
T3r = T3n * T3q;
T3w = T3n * T3v;
T3s = W[3];
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3s, T3v, T3r);
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3s, T3q, T3w);
}
{
E T3z, T3C, T3x, T3A;
T3x = W[4];
T3z = T3x * T3y;
T3C = T3x * T3B;
T3A = W[5];
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3A, T3B, T3z);
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3A, T3y, T3C);
}
}
/* v = 4, outputs 1 and 4: combined with (W[0], W[1]) and (W[6], W[7]). */
{
E T3W, T4c, T49, T4f, T3O, T45;
T3O = FMA(KP559016994, T3N, T3M);
T3W = FMA(KP951056516, T3V, T3O);
T4c = FNMS(KP951056516, T3V, T3O);
T45 = FMA(KP559016994, T44, T43);
T49 = FNMS(KP951056516, T48, T45);
T4f = FMA(KP951056516, T48, T45);
{
E T3X, T4a, T3L, T3Y;
T3L = W[0];
T3X = T3L * T3W;
T4a = T3L * T49;
T3Y = W[1];
rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3Y, T49, T3X);
iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3Y, T3W, T4a);
}
{
E T4d, T4g, T4b, T4e;
T4b = W[6];
T4d = T4b * T4c;
T4g = T4b * T4f;
T4e = W[7];
rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4e, T4f, T4d);
iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4e, T4c, T4g);
}
}
/* v = 1, outputs 2 and 3: combined with (W[2], W[3]) and (W[4], W[5]). */
{
E T1C, T1K, T1H, T1N, T1A, T1F;
T1A = FNMS(KP559016994, T15, T14);
T1C = FNMS(KP951056516, T1B, T1A);
T1K = FMA(KP951056516, T1B, T1A);
T1F = FNMS(KP559016994, T1m, T1l);
T1H = FMA(KP951056516, T1G, T1F);
T1N = FNMS(KP951056516, T1G, T1F);
{
E T1D, T1I, T1z, T1E;
T1z = W[2];
T1D = T1z * T1C;
T1I = T1z * T1H;
T1E = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1E, T1H, T1D);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1E, T1C, T1I);
}
{
E T1L, T1O, T1J, T1M;
T1J = W[4];
T1L = T1J * T1K;
T1O = T1J * T1N;
T1M = W[5];
rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
}
}
/* v = 1, outputs 1 and 4: combined with (W[0], W[1]) and (W[6], W[7]). */
{
E T1e, T1u, T1r, T1x, T16, T1n;
T16 = FMA(KP559016994, T15, T14);
T1e = FMA(KP951056516, T1d, T16);
T1u = FNMS(KP951056516, T1d, T16);
T1n = FMA(KP559016994, T1m, T1l);
T1r = FNMS(KP951056516, T1q, T1n);
T1x = FMA(KP951056516, T1q, T1n);
{
E T1f, T1s, T13, T1g;
T13 = W[0];
T1f = T13 * T1e;
T1s = T13 * T1r;
T1g = W[1];
rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1g, T1r, T1f);
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1g, T1e, T1s);
}
{
E T1v, T1y, T1t, T1w;
T1t = W[6];
T1v = T1t * T1u;
T1y = T1t * T1x;
T1w = W[7];
rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1w, T1x, T1v);
iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1w, T1u, T1y);
}
}
}
}
}
/*
 * Registration data for the FMA variant of q1_5.  The first three numbers in
 * the braced group match the "70 additions, 40 multiplications, 130 fused
 * multiply/add" count stated in this variant's generator comment.
 * NOTE(review): the meaning of the remaining tw_instr / ct_desc fields is
 * declared outside this file -- confirm against those declarations.
 */
static const tw_instr twinstr[] = {
     { TW_FULL, 0, 5 },
     { TW_NEXT, 1, 0 }
};

static const ct_desc desc = {
     5,
     "q1_5",
     twinstr,
     &GENUS,
     { 70, 40, 130, 0 },
     0,
     0,
     0
};

/* Passes q1_5 and its descriptor to X(kdft_difsq_register) together with p. */
void X(codelet_q1_5) (planner *p)
{
     X(kdft_difsq_register) (p, q1_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
/*
* This function contains 200 FP additions, 140 FP multiplications,
* (or, 130 additions, 70 multiplications, 70 fused multiply/add),
* 75 stack variables, 4 constants, and 100 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_5: radix-5 "twiddle square" kernel, non-FMA variant (generated by
 * gen_twidsq with -dif -n 5; do not edit by hand).
 *
 * For every m in [mb, me) the kernel works in place on a 5x5 tile of complex
 * values. Real parts are in rio, imaginary parts in iio, and element (v, r)
 * of the tile is addressed as [WS(vs, v) + WS(rs, r)], v and r in 0..4.
 *   1. For each v, the five inputs r = 0..4 are combined into five outputs
 *      k = 0..4 (size-5 butterfly built from the constants below).
 *   2. Output k of row v is stored transposed, at [WS(vs, k) + WS(rs, v)].
 *   3. Outputs with k >= 1 are combined with the twiddle pair
 *      (W[2k-2], W[2k-1]) before being stored; k = 0 outputs are plain sums.
 * All 50 inputs are loaded before the first store, so the in-place
 * transposition does not read overwritten data.
 *
 * Parameters:
 *   rio, iio  real / imaginary data, read and overwritten; both advance by
 *             ms after each iteration
 *   W         twiddle table; starts at offset mb * 8 and advances by 8
 *             (four pairs) per iteration
 *   rs, vs    the two strides of the tile, applied through WS()
 *   mb, me    half-open iteration range
 *   ms        step of rio / iio between iterations
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R and MAKE_VOLATILE_STRIDE come from
 * project headers not visible here. The code reads as FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
/* Numeric constants, in order: 1/4, sin(pi/5), sin(2*pi/5), sqrt(5)/4. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
/* Intermediates that must survive from the load phase to the store phase. */
E T1, Ta, TG, Tv, T8, Tb, Tp, Tj, TD, To, Tq, Tr, TN, TW, T1s;
E T1h, TU, TX, T1b, T15, T1p, T1a, T1c, T1d, T1z, T1I, T2e, T23, T1G, T1J;
E T1X, T1R, T2b, T1W, T1Y, T1Z, T3v, T3p, T3J, T3u, T3w, T3x, T37, T3g, T3M;
E T3B, T3e, T3h, T2l, T2u, T30, T2P, T2s, T2v, T2J, T2D, T2X, T2I, T2K, T2L;
/* Row v = 0, real parts: sums and differences of the pairs (1,4) and (2,3). */
{
E T7, Tu, T4, Tt;
T1 = rio[0];
{
E T5, T6, T2, T3;
T5 = rio[WS(rs, 2)];
T6 = rio[WS(rs, 3)];
T7 = T5 + T6;
Tu = T5 - T6;
T2 = rio[WS(rs, 1)];
T3 = rio[WS(rs, 4)];
T4 = T2 + T3;
Tt = T2 - T3;
}
Ta = KP559016994 * (T4 - T7);
TG = FNMS(KP587785252, Tt, KP951056516 * Tu);
Tv = FMA(KP951056516, Tt, KP587785252 * Tu);
T8 = T4 + T7;
Tb = FNMS(KP250000000, T8, T1);
}
/* Row v = 0, imaginary parts. */
{
E Ti, Tn, Tf, Tm;
Tp = iio[0];
{
E Tg, Th, Td, Te;
Tg = iio[WS(rs, 2)];
Th = iio[WS(rs, 3)];
Ti = Tg - Th;
Tn = Tg + Th;
Td = iio[WS(rs, 1)];
Te = iio[WS(rs, 4)];
Tf = Td - Te;
Tm = Td + Te;
}
Tj = FMA(KP951056516, Tf, KP587785252 * Ti);
TD = FNMS(KP587785252, Tf, KP951056516 * Ti);
To = KP559016994 * (Tm - Tn);
Tq = Tm + Tn;
Tr = FNMS(KP250000000, Tq, Tp);
}
/* Row v = 1, real parts. */
{
E TT, T1g, TQ, T1f;
TN = rio[WS(vs, 1)];
{
E TR, TS, TO, TP;
TR = rio[WS(vs, 1) + WS(rs, 2)];
TS = rio[WS(vs, 1) + WS(rs, 3)];
TT = TR + TS;
T1g = TR - TS;
TO = rio[WS(vs, 1) + WS(rs, 1)];
TP = rio[WS(vs, 1) + WS(rs, 4)];
TQ = TO + TP;
T1f = TO - TP;
}
TW = KP559016994 * (TQ - TT);
T1s = FNMS(KP587785252, T1f, KP951056516 * T1g);
T1h = FMA(KP951056516, T1f, KP587785252 * T1g);
TU = TQ + TT;
TX = FNMS(KP250000000, TU, TN);
}
/* Row v = 1, imaginary parts. */
{
E T14, T19, T11, T18;
T1b = iio[WS(vs, 1)];
{
E T12, T13, TZ, T10;
T12 = iio[WS(vs, 1) + WS(rs, 2)];
T13 = iio[WS(vs, 1) + WS(rs, 3)];
T14 = T12 - T13;
T19 = T12 + T13;
TZ = iio[WS(vs, 1) + WS(rs, 1)];
T10 = iio[WS(vs, 1) + WS(rs, 4)];
T11 = TZ - T10;
T18 = TZ + T10;
}
T15 = FMA(KP951056516, T11, KP587785252 * T14);
T1p = FNMS(KP587785252, T11, KP951056516 * T14);
T1a = KP559016994 * (T18 - T19);
T1c = T18 + T19;
T1d = FNMS(KP250000000, T1c, T1b);
}
/* Row v = 2, real parts. */
{
E T1F, T22, T1C, T21;
T1z = rio[WS(vs, 2)];
{
E T1D, T1E, T1A, T1B;
T1D = rio[WS(vs, 2) + WS(rs, 2)];
T1E = rio[WS(vs, 2) + WS(rs, 3)];
T1F = T1D + T1E;
T22 = T1D - T1E;
T1A = rio[WS(vs, 2) + WS(rs, 1)];
T1B = rio[WS(vs, 2) + WS(rs, 4)];
T1C = T1A + T1B;
T21 = T1A - T1B;
}
T1I = KP559016994 * (T1C - T1F);
T2e = FNMS(KP587785252, T21, KP951056516 * T22);
T23 = FMA(KP951056516, T21, KP587785252 * T22);
T1G = T1C + T1F;
T1J = FNMS(KP250000000, T1G, T1z);
}
/* Row v = 2, imaginary parts. */
{
E T1Q, T1V, T1N, T1U;
T1X = iio[WS(vs, 2)];
{
E T1O, T1P, T1L, T1M;
T1O = iio[WS(vs, 2) + WS(rs, 2)];
T1P = iio[WS(vs, 2) + WS(rs, 3)];
T1Q = T1O - T1P;
T1V = T1O + T1P;
T1L = iio[WS(vs, 2) + WS(rs, 1)];
T1M = iio[WS(vs, 2) + WS(rs, 4)];
T1N = T1L - T1M;
T1U = T1L + T1M;
}
T1R = FMA(KP951056516, T1N, KP587785252 * T1Q);
T2b = FNMS(KP587785252, T1N, KP951056516 * T1Q);
T1W = KP559016994 * (T1U - T1V);
T1Y = T1U + T1V;
T1Z = FNMS(KP250000000, T1Y, T1X);
}
/* Row v = 4, imaginary parts. */
{
E T3o, T3t, T3l, T3s;
T3v = iio[WS(vs, 4)];
{
E T3m, T3n, T3j, T3k;
T3m = iio[WS(vs, 4) + WS(rs, 2)];
T3n = iio[WS(vs, 4) + WS(rs, 3)];
T3o = T3m - T3n;
T3t = T3m + T3n;
T3j = iio[WS(vs, 4) + WS(rs, 1)];
T3k = iio[WS(vs, 4) + WS(rs, 4)];
T3l = T3j - T3k;
T3s = T3j + T3k;
}
T3p = FMA(KP951056516, T3l, KP587785252 * T3o);
T3J = FNMS(KP587785252, T3l, KP951056516 * T3o);
T3u = KP559016994 * (T3s - T3t);
T3w = T3s + T3t;
T3x = FNMS(KP250000000, T3w, T3v);
}
/* Row v = 4, real parts. */
{
E T3d, T3A, T3a, T3z;
T37 = rio[WS(vs, 4)];
{
E T3b, T3c, T38, T39;
T3b = rio[WS(vs, 4) + WS(rs, 2)];
T3c = rio[WS(vs, 4) + WS(rs, 3)];
T3d = T3b + T3c;
T3A = T3b - T3c;
T38 = rio[WS(vs, 4) + WS(rs, 1)];
T39 = rio[WS(vs, 4) + WS(rs, 4)];
T3a = T38 + T39;
T3z = T38 - T39;
}
T3g = KP559016994 * (T3a - T3d);
T3M = FNMS(KP587785252, T3z, KP951056516 * T3A);
T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
T3e = T3a + T3d;
T3h = FNMS(KP250000000, T3e, T37);
}
/* Row v = 3, real parts. */
{
E T2r, T2O, T2o, T2N;
T2l = rio[WS(vs, 3)];
{
E T2p, T2q, T2m, T2n;
T2p = rio[WS(vs, 3) + WS(rs, 2)];
T2q = rio[WS(vs, 3) + WS(rs, 3)];
T2r = T2p + T2q;
T2O = T2p - T2q;
T2m = rio[WS(vs, 3) + WS(rs, 1)];
T2n = rio[WS(vs, 3) + WS(rs, 4)];
T2o = T2m + T2n;
T2N = T2m - T2n;
}
T2u = KP559016994 * (T2o - T2r);
T30 = FNMS(KP587785252, T2N, KP951056516 * T2O);
T2P = FMA(KP951056516, T2N, KP587785252 * T2O);
T2s = T2o + T2r;
T2v = FNMS(KP250000000, T2s, T2l);
}
/* Row v = 3, imaginary parts. */
{
E T2C, T2H, T2z, T2G;
T2J = iio[WS(vs, 3)];
{
E T2A, T2B, T2x, T2y;
T2A = iio[WS(vs, 3) + WS(rs, 2)];
T2B = iio[WS(vs, 3) + WS(rs, 3)];
T2C = T2A - T2B;
T2H = T2A + T2B;
T2x = iio[WS(vs, 3) + WS(rs, 1)];
T2y = iio[WS(vs, 3) + WS(rs, 4)];
T2z = T2x - T2y;
T2G = T2x + T2y;
}
T2D = FMA(KP951056516, T2z, KP587785252 * T2C);
T2X = FNMS(KP587785252, T2z, KP951056516 * T2C);
T2I = KP559016994 * (T2G - T2H);
T2K = T2G + T2H;
T2L = FNMS(KP250000000, T2K, T2J);
}
/*
 * k = 0 outputs: the plain sum of each row v, stored transposed at
 * [WS(rs, v)] with no twiddle factor applied.
 */
rio[0] = T1 + T8;
iio[0] = Tp + Tq;
rio[WS(rs, 1)] = TN + TU;
iio[WS(rs, 1)] = T1b + T1c;
rio[WS(rs, 2)] = T1z + T1G;
iio[WS(rs, 2)] = T1X + T1Y;
iio[WS(rs, 4)] = T3v + T3w;
rio[WS(rs, 4)] = T37 + T3e;
rio[WS(rs, 3)] = T2l + T2s;
iio[WS(rs, 3)] = T2J + T2K;
/* Row v = 0, outputs k = 1 (twiddle W[0], W[1]) and k = 4 (W[6], W[7]). */
{
E Tk, Ty, Tw, TA, Tc, Ts;
Tc = Ta + Tb;
Tk = Tc + Tj;
Ty = Tc - Tj;
Ts = To + Tr;
Tw = Ts - Tv;
TA = Tv + Ts;
{
E T9, Tl, Tx, Tz;
T9 = W[0];
Tl = W[1];
rio[WS(vs, 1)] = FMA(T9, Tk, Tl * Tw);
iio[WS(vs, 1)] = FNMS(Tl, Tk, T9 * Tw);
Tx = W[6];
Tz = W[7];
rio[WS(vs, 4)] = FMA(Tx, Ty, Tz * TA);
iio[WS(vs, 4)] = FNMS(Tz, Ty, Tx * TA);
}
}
/* Row v = 0, outputs k = 2 (twiddle W[2], W[3]) and k = 3 (W[4], W[5]). */
{
E TE, TK, TI, TM, TC, TH;
TC = Tb - Ta;
TE = TC - TD;
TK = TC + TD;
TH = Tr - To;
TI = TG + TH;
TM = TH - TG;
{
E TB, TF, TJ, TL;
TB = W[2];
TF = W[3];
rio[WS(vs, 2)] = FMA(TB, TE, TF * TI);
iio[WS(vs, 2)] = FNMS(TF, TE, TB * TI);
TJ = W[4];
TL = W[5];
rio[WS(vs, 3)] = FMA(TJ, TK, TL * TM);
iio[WS(vs, 3)] = FNMS(TL, TK, TJ * TM);
}
}
/* Row v = 2, outputs k = 2 and k = 3. */
{
E T2c, T2i, T2g, T2k, T2a, T2f;
T2a = T1J - T1I;
T2c = T2a - T2b;
T2i = T2a + T2b;
T2f = T1Z - T1W;
T2g = T2e + T2f;
T2k = T2f - T2e;
{
E T29, T2d, T2h, T2j;
T29 = W[2];
T2d = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T29, T2c, T2d * T2g);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2d, T2c, T29 * T2g);
T2h = W[4];
T2j = W[5];
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2h, T2i, T2j * T2k);
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2j, T2i, T2h * T2k);
}
}
/* Row v = 4, outputs k = 2 and k = 3. */
{
E T3K, T3Q, T3O, T3S, T3I, T3N;
T3I = T3h - T3g;
T3K = T3I - T3J;
T3Q = T3I + T3J;
T3N = T3x - T3u;
T3O = T3M + T3N;
T3S = T3N - T3M;
{
E T3H, T3L, T3P, T3R;
T3H = W[2];
T3L = W[3];
rio[WS(vs, 2) + WS(rs, 4)] = FMA(T3H, T3K, T3L * T3O);
iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T3L, T3K, T3H * T3O);
T3P = W[4];
T3R = W[5];
rio[WS(vs, 3) + WS(rs, 4)] = FMA(T3P, T3Q, T3R * T3S);
iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T3R, T3Q, T3P * T3S);
}
}
/* Row v = 2, outputs k = 1 and k = 4. */
{
E T1S, T26, T24, T28, T1K, T20;
T1K = T1I + T1J;
T1S = T1K + T1R;
T26 = T1K - T1R;
T20 = T1W + T1Z;
T24 = T20 - T23;
T28 = T23 + T20;
{
E T1H, T1T, T25, T27;
T1H = W[0];
T1T = W[1];
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1H, T1S, T1T * T24);
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1T, T1S, T1H * T24);
T25 = W[6];
T27 = W[7];
rio[WS(vs, 4) + WS(rs, 2)] = FMA(T25, T26, T27 * T28);
iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T27, T26, T25 * T28);
}
}
/* Row v = 3, outputs k = 1 and k = 4. */
{
E T2E, T2S, T2Q, T2U, T2w, T2M;
T2w = T2u + T2v;
T2E = T2w + T2D;
T2S = T2w - T2D;
T2M = T2I + T2L;
T2Q = T2M - T2P;
T2U = T2P + T2M;
{
E T2t, T2F, T2R, T2T;
T2t = W[0];
T2F = W[1];
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2t, T2E, T2F * T2Q);
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T2F, T2E, T2t * T2Q);
T2R = W[6];
T2T = W[7];
rio[WS(vs, 4) + WS(rs, 3)] = FMA(T2R, T2S, T2T * T2U);
iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T2T, T2S, T2R * T2U);
}
}
/* Row v = 3, outputs k = 2 and k = 3. */
{
E T2Y, T34, T32, T36, T2W, T31;
T2W = T2v - T2u;
T2Y = T2W - T2X;
T34 = T2W + T2X;
T31 = T2L - T2I;
T32 = T30 + T31;
T36 = T31 - T30;
{
E T2V, T2Z, T33, T35;
T2V = W[2];
T2Z = W[3];
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T2V, T2Y, T2Z * T32);
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T2Z, T2Y, T2V * T32);
T33 = W[4];
T35 = W[5];
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T33, T34, T35 * T36);
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T35, T34, T33 * T36);
}
}
/* Row v = 4, outputs k = 1 and k = 4. */
{
E T3q, T3E, T3C, T3G, T3i, T3y;
T3i = T3g + T3h;
T3q = T3i + T3p;
T3E = T3i - T3p;
T3y = T3u + T3x;
T3C = T3y - T3B;
T3G = T3B + T3y;
{
E T3f, T3r, T3D, T3F;
T3f = W[0];
T3r = W[1];
rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3f, T3q, T3r * T3C);
iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3r, T3q, T3f * T3C);
T3D = W[6];
T3F = W[7];
rio[WS(vs, 4) + WS(rs, 4)] = FMA(T3D, T3E, T3F * T3G);
iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T3F, T3E, T3D * T3G);
}
}
/* Row v = 1, outputs k = 2 and k = 3. */
{
E T1q, T1w, T1u, T1y, T1o, T1t;
T1o = TX - TW;
T1q = T1o - T1p;
T1w = T1o + T1p;
T1t = T1d - T1a;
T1u = T1s + T1t;
T1y = T1t - T1s;
{
E T1n, T1r, T1v, T1x;
T1n = W[2];
T1r = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1n, T1q, T1r * T1u);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1r, T1q, T1n * T1u);
T1v = W[4];
T1x = W[5];
rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
}
}
/* Row v = 1, outputs k = 1 and k = 4. */
{
E T16, T1k, T1i, T1m, TY, T1e;
TY = TW + TX;
T16 = TY + T15;
T1k = TY - T15;
T1e = T1a + T1d;
T1i = T1e - T1h;
T1m = T1h + T1e;
{
E TV, T17, T1j, T1l;
TV = W[0];
T17 = W[1];
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TV, T16, T17 * T1i);
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T17, T16, TV * T1i);
T1j = W[6];
T1l = W[7];
rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1j, T1k, T1l * T1m);
iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1l, T1k, T1j * T1m);
}
}
}
}
}
/*
 * Twiddle-factor instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined in project headers that are
 * not visible here; presumably this asks for the full twiddle set of a
 * radix-5 kernel -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 5 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA q1_5 kernel: the value 5, the name "q1_5", the
 * twiddle instruction list, the genus and a count table. The first three
 * counts { 130, 70, 70 } equal the additions, multiplications and fused
 * multiply/adds reported in the generator comment of this variant.
 */
static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, { 130, 70, 70, 0 }, 0, 0, 0 };
/* Registers the q1_5 kernel together with its descriptor on planner `p`. */
void X(codelet_q1_5) (planner *p) {
X(kdft_difsq_register) (p, q1_5, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+489
View File
@@ -0,0 +1,489 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */
/*
* This function contains 102 FP additions, 72 FP multiplications,
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
* 47 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_10: radix-10 twiddle kernel, FMA variant (generated by gen_twiddle
 * with -fma -n 10; do not edit by hand).
 *
 * For every m in [mb, me) the kernel transforms, in place, the ten complex
 * values (ri[WS(rs, k)], ii[WS(rs, k)]), k = 0..9:
 *   - input k >= 1 is first combined with the twiddle pair
 *     (W[2k-2], W[2k-1]); input 0 is used unchanged;
 *   - the ten results are then combined using additions and the constants
 *     below, and stored back into the same ten slots.
 *
 * Parameters:
 *   ri, ii   real / imaginary data, read and overwritten; both advance by ms
 *            after each iteration
 *   W        twiddle table; starts at offset mb * 18 and advances by 18
 *            (nine pairs) per iteration
 *   rs       stride between the ten elements, applied through WS()
 *   mb, me   half-open iteration range
 *   ms       step of ri / ii between iterations
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R and MAKE_VOLATILE_STRIDE come from
 * project headers not visible here. The code reads as FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants, in order: sin(2*pi/5), sqrt(5)/4, (sqrt(5)-1)/2, 1/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
E T8, T23, T12, T1U, TM, TZ, T10, T1F, T1G, T1P, T16, T17, T18, T1s, T1x;
E T25, Tl, Ty, Tz, T1I, T1J, T1O, T13, T14, T15, T1h, T1m, T24;
/*
 * Inputs 0 and 5: twiddle input 5 with (W[8], W[9]), then form the
 * difference (T8, T23) and the sum (T12, T1U) of the pair.
 */
{
E T1, T1T, T3, T6, T4, T1R, T2, T7, T1S, T5;
T1 = ri[0];
T1T = ii[0];
T3 = ri[WS(rs, 5)];
T6 = ii[WS(rs, 5)];
T2 = W[8];
T4 = T2 * T3;
T1R = T2 * T6;
T5 = W[9];
T7 = FMA(T5, T6, T4);
T1S = FNMS(T5, T3, T1R);
T8 = T1 - T7;
T23 = T1T - T1S;
T12 = T1 + T7;
T1U = T1S + T1T;
}
/* Inputs 4, 1, 9 and 6: twiddle each, then pair (4, 9) and (6, 1). */
{
E TF, T1p, TY, T1w, TL, T1r, TS, T1u;
{
E TB, TE, TC, T1o, TA, TD;
TB = ri[WS(rs, 4)];
TE = ii[WS(rs, 4)];
TA = W[6];
TC = TA * TB;
T1o = TA * TE;
TD = W[7];
TF = FMA(TD, TE, TC);
T1p = FNMS(TD, TB, T1o);
}
{
E TU, TX, TV, T1v, TT, TW;
TU = ri[WS(rs, 1)];
TX = ii[WS(rs, 1)];
TT = W[0];
TV = TT * TU;
T1v = TT * TX;
TW = W[1];
TY = FMA(TW, TX, TV);
T1w = FNMS(TW, TU, T1v);
}
{
E TH, TK, TI, T1q, TG, TJ;
TH = ri[WS(rs, 9)];
TK = ii[WS(rs, 9)];
TG = W[16];
TI = TG * TH;
T1q = TG * TK;
TJ = W[17];
TL = FMA(TJ, TK, TI);
T1r = FNMS(TJ, TH, T1q);
}
{
E TO, TR, TP, T1t, TN, TQ;
TO = ri[WS(rs, 6)];
TR = ii[WS(rs, 6)];
TN = W[10];
TP = TN * TO;
T1t = TN * TR;
TQ = W[11];
TS = FMA(TQ, TR, TP);
T1u = FNMS(TQ, TO, T1t);
}
TM = TF - TL;
TZ = TS - TY;
T10 = TM + TZ;
T1F = T1p + T1r;
T1G = T1u + T1w;
T1P = T1F + T1G;
T16 = TF + TL;
T17 = TS + TY;
T18 = T16 + T17;
T1s = T1p - T1r;
T1x = T1u - T1w;
T25 = T1s + T1x;
}
/* Inputs 2, 3, 7 and 8: twiddle each, then pair (2, 7) and (8, 3). */
{
E Te, T1e, Tx, T1l, Tk, T1g, Tr, T1j;
{
E Ta, Td, Tb, T1d, T9, Tc;
Ta = ri[WS(rs, 2)];
Td = ii[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
T1d = T9 * Td;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
T1e = FNMS(Tc, Ta, T1d);
}
{
E Tt, Tw, Tu, T1k, Ts, Tv;
Tt = ri[WS(rs, 3)];
Tw = ii[WS(rs, 3)];
Ts = W[4];
Tu = Ts * Tt;
T1k = Ts * Tw;
Tv = W[5];
Tx = FMA(Tv, Tw, Tu);
T1l = FNMS(Tv, Tt, T1k);
}
{
E Tg, Tj, Th, T1f, Tf, Ti;
Tg = ri[WS(rs, 7)];
Tj = ii[WS(rs, 7)];
Tf = W[12];
Th = Tf * Tg;
T1f = Tf * Tj;
Ti = W[13];
Tk = FMA(Ti, Tj, Th);
T1g = FNMS(Ti, Tg, T1f);
}
{
E Tn, Tq, To, T1i, Tm, Tp;
Tn = ri[WS(rs, 8)];
Tq = ii[WS(rs, 8)];
Tm = W[14];
To = Tm * Tn;
T1i = Tm * Tq;
Tp = W[15];
Tr = FMA(Tp, Tq, To);
T1j = FNMS(Tp, Tn, T1i);
}
Tl = Te - Tk;
Ty = Tr - Tx;
Tz = Tl + Ty;
T1I = T1e + T1g;
T1J = T1j + T1l;
T1O = T1I + T1J;
T13 = Te + Tk;
T14 = Tr + Tx;
T15 = T13 + T14;
T1h = T1e - T1g;
T1m = T1j - T1l;
T24 = T1h + T1m;
}
/* Real parts of the odd-indexed outputs 5, 7, 3, 9 and 1. */
{
E T1b, T11, T1a, T1z, T1B, T1n, T1y, T1A, T1c;
T1b = Tz - T10;
T11 = Tz + T10;
T1a = FNMS(KP250000000, T11, T8);
T1n = T1h - T1m;
T1y = T1s - T1x;
T1z = FMA(KP618033988, T1y, T1n);
T1B = FNMS(KP618033988, T1n, T1y);
ri[WS(rs, 5)] = T8 + T11;
T1A = FNMS(KP559016994, T1b, T1a);
ri[WS(rs, 7)] = FNMS(KP951056516, T1B, T1A);
ri[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
T1c = FMA(KP559016994, T1b, T1a);
ri[WS(rs, 9)] = FNMS(KP951056516, T1z, T1c);
ri[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
}
/* Imaginary parts of the odd-indexed outputs 5, 3, 7, 1 and 9. */
{
E T28, T26, T27, T2c, T2e, T2a, T2b, T2d, T29;
T28 = T24 - T25;
T26 = T24 + T25;
T27 = FNMS(KP250000000, T26, T23);
T2a = Tl - Ty;
T2b = TM - TZ;
T2c = FMA(KP618033988, T2b, T2a);
T2e = FNMS(KP618033988, T2a, T2b);
ii[WS(rs, 5)] = T26 + T23;
T2d = FNMS(KP559016994, T28, T27);
ii[WS(rs, 3)] = FNMS(KP951056516, T2e, T2d);
ii[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
T29 = FMA(KP559016994, T28, T27);
ii[WS(rs, 1)] = FNMS(KP951056516, T2c, T29);
ii[WS(rs, 9)] = FMA(KP951056516, T2c, T29);
}
/* Real parts of the even-indexed outputs 0, 4, 6, 2 and 8. */
{
E T1D, T19, T1C, T1L, T1N, T1H, T1K, T1M, T1E;
T1D = T15 - T18;
T19 = T15 + T18;
T1C = FNMS(KP250000000, T19, T12);
T1H = T1F - T1G;
T1K = T1I - T1J;
T1L = FNMS(KP618033988, T1K, T1H);
T1N = FMA(KP618033988, T1H, T1K);
ri[0] = T12 + T19;
T1M = FMA(KP559016994, T1D, T1C);
ri[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
ri[WS(rs, 6)] = FMA(KP951056516, T1N, T1M);
T1E = FNMS(KP559016994, T1D, T1C);
ri[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
ri[WS(rs, 8)] = FMA(KP951056516, T1L, T1E);
}
/* Imaginary parts of the even-indexed outputs 0, 4, 6, 2 and 8. */
{
E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X;
T1W = T1O - T1P;
T1Q = T1O + T1P;
T1V = FNMS(KP250000000, T1Q, T1U);
T1Y = T16 - T17;
T1Z = T13 - T14;
T20 = FNMS(KP618033988, T1Z, T1Y);
T22 = FMA(KP618033988, T1Y, T1Z);
ii[0] = T1Q + T1U;
T21 = FMA(KP559016994, T1W, T1V);
ii[WS(rs, 4)] = FMA(KP951056516, T22, T21);
ii[WS(rs, 6)] = FNMS(KP951056516, T22, T21);
T1X = FNMS(KP559016994, T1W, T1V);
ii[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
ii[WS(rs, 8)] = FNMS(KP951056516, T20, T1X);
}
}
}
}
/*
 * Twiddle-factor instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined in project headers that are
 * not visible here; presumably this asks for the full twiddle set of a
 * radix-10 kernel -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA t1_10 kernel: the value 10, the name "t1_10", the
 * twiddle instruction list, the genus and a count table. The first three
 * counts { 48, 18, 54 } equal the additions, multiplications and fused
 * multiply/adds reported in the generator comment of this variant.
 */
static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 48, 18, 54, 0 }, 0, 0, 0 };
/* Registers the t1_10 kernel together with its descriptor on planner `p`. */
void X(codelet_t1_10) (planner *p) {
X(kdft_dit_register) (p, t1_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */
/*
* This function contains 102 FP additions, 60 FP multiplications,
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
* 45 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_10: radix-10 twiddle kernel, non-FMA variant (generated by gen_twiddle
 * with -n 10; do not edit by hand).
 *
 * For every m in [mb, me) the kernel transforms, in place, the ten complex
 * values (ri[WS(rs, k)], ii[WS(rs, k)]), k = 0..9:
 *   - input k >= 1 is first combined with the twiddle pair
 *     (W[2k-2], W[2k-1]); input 0 is used unchanged;
 *   - the ten results are then combined using additions and the constants
 *     below, and stored back into the same ten slots.
 *
 * Parameters:
 *   ri, ii   real / imaginary data, read and overwritten; both advance by ms
 *            after each iteration
 *   W        twiddle table; starts at offset mb * 18 and advances by 18
 *            (nine pairs) per iteration
 *   rs       stride between the ten elements, applied through WS()
 *   mb, me   half-open iteration range
 *   ms       step of ri / ii between iterations
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R and MAKE_VOLATILE_STRIDE come from
 * project headers not visible here. The code reads as FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants, in order: sin(pi/5), sin(2*pi/5), 1/4, sqrt(5)/4. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g;
E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L;
/*
 * Inputs 0 and 5: twiddle input 5 with (W[8], W[9]), then form the
 * difference (T7, T1O) and the sum (TT, T1C) of the pair.
 */
{
E T1, T1B, T6, T1A;
T1 = ri[0];
T1B = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 5)];
T5 = ii[WS(rs, 5)];
T2 = W[8];
T4 = W[9];
T6 = FMA(T2, T3, T4 * T5);
T1A = FNMS(T4, T3, T2 * T5);
}
T7 = T1 - T6;
T1O = T1B - T1A;
TT = T1 + T6;
T1C = T1A + T1B;
}
/* Inputs 4, 1, 9 and 6: twiddle each, then pair (4, 9) and (6, 1). */
{
E Tz, T1b, TP, T1f, TE, T1c, TK, T1e;
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 4)];
Ty = ii[WS(rs, 4)];
Tv = W[6];
Tx = W[7];
Tz = FMA(Tv, Tw, Tx * Ty);
T1b = FNMS(Tx, Tw, Tv * Ty);
}
{
E TM, TO, TL, TN;
TM = ri[WS(rs, 1)];
TO = ii[WS(rs, 1)];
TL = W[0];
TN = W[1];
TP = FMA(TL, TM, TN * TO);
T1f = FNMS(TN, TM, TL * TO);
}
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 9)];
TD = ii[WS(rs, 9)];
TA = W[16];
TC = W[17];
TE = FMA(TA, TB, TC * TD);
T1c = FNMS(TC, TB, TA * TD);
}
{
E TH, TJ, TG, TI;
TH = ri[WS(rs, 6)];
TJ = ii[WS(rs, 6)];
TG = W[10];
TI = W[11];
TK = FMA(TG, TH, TI * TJ);
T1e = FNMS(TI, TH, TG * TJ);
}
TF = Tz - TE;
TQ = TK - TP;
TR = TF + TQ;
T1o = T1b + T1c;
T1p = T1e + T1f;
T1y = T1o + T1p;
TX = Tz + TE;
TY = TK + TP;
TZ = TX + TY;
T1d = T1b - T1c;
T1g = T1e - T1f;
T1M = T1d + T1g;
}
/* Inputs 2, 3, 7 and 8: twiddle each, then pair (2, 7) and (8, 3). */
{
E Tc, T14, Ts, T18, Th, T15, Tn, T17;
{
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 2)];
Tb = ii[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
T14 = FNMS(Ta, T9, T8 * Tb);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 3)];
Tr = ii[WS(rs, 3)];
To = W[4];
Tq = W[5];
Ts = FMA(To, Tp, Tq * Tr);
T18 = FNMS(Tq, Tp, To * Tr);
}
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 7)];
Tg = ii[WS(rs, 7)];
Td = W[12];
Tf = W[13];
Th = FMA(Td, Te, Tf * Tg);
T15 = FNMS(Tf, Te, Td * Tg);
}
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 8)];
Tm = ii[WS(rs, 8)];
Tj = W[14];
Tl = W[15];
Tn = FMA(Tj, Tk, Tl * Tm);
T17 = FNMS(Tl, Tk, Tj * Tm);
}
Ti = Tc - Th;
Tt = Tn - Ts;
Tu = Ti + Tt;
T1r = T14 + T15;
T1s = T17 + T18;
T1x = T1r + T1s;
TU = Tc + Th;
TV = Tn + Ts;
TW = TU + TV;
T16 = T14 - T15;
T19 = T17 - T18;
T1L = T16 + T19;
}
/* Real parts of the odd-indexed outputs 5, 7, 3, 9 and 1. */
{
E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
T11 = KP559016994 * (Tu - TR);
TS = Tu + TR;
T12 = FNMS(KP250000000, TS, T7);
T1a = T16 - T19;
T1h = T1d - T1g;
T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
ri[WS(rs, 5)] = T7 + TS;
T1j = T12 - T11;
ri[WS(rs, 7)] = T1j - T1k;
ri[WS(rs, 3)] = T1j + T1k;
T13 = T11 + T12;
ri[WS(rs, 9)] = T13 - T1i;
ri[WS(rs, 1)] = T13 + T1i;
}
/* Imaginary parts of the odd-indexed outputs 5, 3, 7, 1 and 9. */
{
E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R;
T1N = KP559016994 * (T1L - T1M);
T1P = T1L + T1M;
T1Q = FNMS(KP250000000, T1P, T1O);
T1S = Ti - Tt;
T1T = TF - TQ;
T1U = FMA(KP951056516, T1S, KP587785252 * T1T);
T1W = FNMS(KP587785252, T1S, KP951056516 * T1T);
ii[WS(rs, 5)] = T1P + T1O;
T1V = T1Q - T1N;
ii[WS(rs, 3)] = T1V - T1W;
ii[WS(rs, 7)] = T1W + T1V;
T1R = T1N + T1Q;
ii[WS(rs, 1)] = T1R - T1U;
ii[WS(rs, 9)] = T1U + T1R;
}
/* Real parts of the even-indexed outputs 0, 4, 6, 2 and 8. */
{
E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
T1m = KP559016994 * (TW - TZ);
T10 = TW + TZ;
T1l = FNMS(KP250000000, T10, TT);
T1q = T1o - T1p;
T1t = T1r - T1s;
T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
ri[0] = TT + T10;
T1v = T1m + T1l;
ri[WS(rs, 4)] = T1v - T1w;
ri[WS(rs, 6)] = T1v + T1w;
T1n = T1l - T1m;
ri[WS(rs, 2)] = T1n - T1u;
ri[WS(rs, 8)] = T1n + T1u;
}
/* Imaginary parts of the even-indexed outputs 0, 4, 6, 2 and 8. */
{
E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
T1H = KP559016994 * (T1x - T1y);
T1z = T1x + T1y;
T1G = FNMS(KP250000000, T1z, T1C);
T1D = TX - TY;
T1E = TU - TV;
T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
ii[0] = T1z + T1C;
T1K = T1H + T1G;
ii[WS(rs, 4)] = T1J + T1K;
ii[WS(rs, 6)] = T1K - T1J;
T1I = T1G - T1H;
ii[WS(rs, 2)] = T1F + T1I;
ii[WS(rs, 8)] = T1I - T1F;
}
}
}
}
/*
 * Twiddle-factor instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined in project headers that are
 * not visible here; presumably this asks for the full twiddle set of a
 * radix-10 kernel -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA t1_10 kernel: the value 10, the name "t1_10",
 * the twiddle instruction list, the genus and a count table. The first three
 * counts { 72, 30, 30 } equal the additions, multiplications and fused
 * multiply/adds reported in the generator comment of this variant.
 */
static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 72, 30, 30, 0 }, 0, 0, 0 };
/* Registers the t1_10 kernel together with its descriptor on planner `p`. */
void X(codelet_t1_10) (planner *p) {
X(kdft_dit_register) (p, t1_10, &desc);
}
#endif
+581
View File
@@ -0,0 +1,581 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */
/*
* This function contains 118 FP additions, 68 FP multiplications,
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_12: radix-12 twiddle kernel, FMA variant (generated by gen_twiddle
 * with -fma -n 12; do not edit by hand).
 *
 * For every m in [mb, me) the kernel transforms, in place, the twelve
 * complex values (ri[WS(rs, k)], ii[WS(rs, k)]), k = 0..11:
 *   - input k >= 1 is first combined with the twiddle pair
 *     (W[2k-2], W[2k-1]); input 0 is used unchanged;
 *   - the twelve results are then combined using additions and the two
 *     constants below, and stored back into the same twelve slots.
 *
 * Parameters:
 *   ri, ii   real / imaginary data, read and overwritten; both advance by ms
 *            after each iteration
 *   W        twiddle table; starts at offset mb * 22 and advances by 22
 *            (eleven pairs) per iteration
 *   rs       stride between the twelve elements, applied through WS()
 *   mb, me   half-open iteration range
 *   ms       step of ri / ii between iterations
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R and MAKE_VOLATILE_STRIDE come from
 * project headers not visible here. The code reads as FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants, in order: sqrt(3)/2, 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2r, T1s, T2f, T1d, T21, T1H;
E T1Z, Te, T2o, T1l, T2h, TT, T1V, T1A, T1T;
/* Input 0 is used without a twiddle factor. */
T1 = ri[0];
T2i = ii[0];
/* Input 6, twiddled with (W[10], W[11]). */
{
E Th, Tk, Ti, T2d, Tg, Tj;
Th = ri[WS(rs, 6)];
Tk = ii[WS(rs, 6)];
Tg = W[10];
Ti = Tg * Th;
T2d = Tg * Tk;
Tj = W[11];
Tl = FMA(Tj, Tk, Ti);
T2e = FNMS(Tj, Th, T2d);
}
/* Input 9, twiddled with (W[16], W[17]). */
{
E TW, TZ, TX, T1X, TV, TY;
TW = ri[WS(rs, 9)];
TZ = ii[WS(rs, 9)];
TV = W[16];
TX = TV * TW;
T1X = TV * TZ;
TY = W[17];
T10 = FMA(TY, TZ, TX);
T1Y = FNMS(TY, TW, T1X);
}
/* Input 3, twiddled with (W[4], W[5]). */
{
E TC, TF, TD, T1R, TB, TE;
TC = ri[WS(rs, 3)];
TF = ii[WS(rs, 3)];
TB = W[4];
TD = TB * TC;
T1R = TB * TF;
TE = W[5];
TG = FMA(TE, TF, TD);
T1S = FNMS(TE, TC, T1R);
}
/* Inputs 10 and 2: twiddle both, then form their sums and differences. */
{
E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
Tn = ri[WS(rs, 10)];
Tq = ii[WS(rs, 10)];
Tm = W[18];
To = Tm * Tn;
T1o = Tm * Tq;
Tt = ri[WS(rs, 2)];
Tw = ii[WS(rs, 2)];
Ts = W[2];
Tu = Ts * Tt;
T1q = Ts * Tw;
{
E Tr, T1p, Tx, T1r, Tp, Tv;
Tp = W[19];
Tr = FMA(Tp, Tq, To);
T1p = FNMS(Tp, Tn, T1o);
Tv = W[3];
Tx = FMA(Tv, Tw, Tu);
T1r = FNMS(Tv, Tt, T1q);
Ty = Tr + Tx;
T2r = Tx - Tr;
T1s = T1p - T1r;
T2f = T1p + T1r;
}
}
/* Inputs 1 and 5: twiddle both, then form their sums and differences. */
{
E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
T12 = ri[WS(rs, 1)];
T15 = ii[WS(rs, 1)];
T11 = W[0];
T13 = T11 * T12;
T1D = T11 * T15;
T18 = ri[WS(rs, 5)];
T1b = ii[WS(rs, 5)];
T17 = W[8];
T19 = T17 * T18;
T1F = T17 * T1b;
{
E T16, T1E, T1c, T1G, T14, T1a;
T14 = W[1];
T16 = FMA(T14, T15, T13);
T1E = FNMS(T14, T12, T1D);
T1a = W[9];
T1c = FMA(T1a, T1b, T19);
T1G = FNMS(T1a, T18, T1F);
T1d = T16 + T1c;
T21 = T1c - T16;
T1H = T1E - T1G;
T1Z = T1E + T1G;
}
}
/* Inputs 4 and 8: twiddle both, then form their sums and differences. */
{
E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
T3 = ri[WS(rs, 4)];
T6 = ii[WS(rs, 4)];
T2 = W[6];
T4 = T2 * T3;
T1h = T2 * T6;
T9 = ri[WS(rs, 8)];
Tc = ii[WS(rs, 8)];
T8 = W[14];
Ta = T8 * T9;
T1j = T8 * Tc;
{
E T7, T1i, Td, T1k, T5, Tb;
T5 = W[7];
T7 = FMA(T5, T6, T4);
T1i = FNMS(T5, T3, T1h);
Tb = W[15];
Td = FMA(Tb, Tc, Ta);
T1k = FNMS(Tb, T9, T1j);
Te = T7 + Td;
T2o = Td - T7;
T1l = T1i - T1k;
T2h = T1i + T1k;
}
}
/* Inputs 7 and 11: twiddle both, then form their sums and differences. */
{
E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
TI = ri[WS(rs, 7)];
TL = ii[WS(rs, 7)];
TH = W[12];
TJ = TH * TI;
T1w = TH * TL;
TO = ri[WS(rs, 11)];
TR = ii[WS(rs, 11)];
TN = W[20];
TP = TN * TO;
T1y = TN * TR;
{
E TM, T1x, TS, T1z, TK, TQ;
TK = W[13];
TM = FMA(TK, TL, TJ);
T1x = FNMS(TK, TI, T1w);
TQ = W[21];
TS = FMA(TQ, TR, TP);
T1z = FNMS(TQ, TO, T1y);
TT = TM + TS;
T1V = TS - TM;
T1A = T1x - T1z;
T1T = T1x + T1z;
}
}
/* Outputs 0, 3, 6 and 9: built from additions and subtractions only. */
{
E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
{
E Tf, Tz, T2g, T2j;
Tf = T1 + Te;
Tz = Tl + Ty;
TA = Tf + Tz;
T28 = Tf - Tz;
T2g = T2e + T2f;
T2j = T2h + T2i;
T2k = T2g + T2j;
T2m = T2j - T2g;
}
{
E TU, T1e, T29, T2a;
TU = TG + TT;
T1e = T10 + T1d;
T1f = TU + T1e;
T2l = TU - T1e;
T29 = T1S + T1T;
T2a = T1Y + T1Z;
T2b = T29 - T2a;
T2c = T29 + T2a;
}
ri[WS(rs, 6)] = TA - T1f;
ii[WS(rs, 6)] = T2k - T2c;
ri[0] = TA + T1f;
ii[0] = T2c + T2k;
ri[WS(rs, 3)] = T28 - T2b;
ii[WS(rs, 3)] = T2l + T2m;
ri[WS(rs, 9)] = T28 + T2b;
ii[WS(rs, 9)] = T2m - T2l;
}
/* Remaining outputs 1, 2, 4, 5, 7, 8, 10 and 11, using both constants. */
{
E T1m, T1K, T2p, T2y, T2s, T2x, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
E T1O;
{
E T1g, T2n, T2q, T1n;
T1g = FNMS(KP500000000, Te, T1);
T1m = FNMS(KP866025403, T1l, T1g);
T1K = FMA(KP866025403, T1l, T1g);
T2n = FNMS(KP500000000, T2h, T2i);
T2p = FMA(KP866025403, T2o, T2n);
T2y = FNMS(KP866025403, T2o, T2n);
T2q = FNMS(KP500000000, T2f, T2e);
T2s = FMA(KP866025403, T2r, T2q);
T2x = FNMS(KP866025403, T2r, T2q);
T1n = FNMS(KP500000000, Ty, Tl);
T1t = FNMS(KP866025403, T1s, T1n);
T1L = FMA(KP866025403, T1s, T1n);
}
{
E T1v, T1U, T20, T1C;
T1v = FNMS(KP500000000, TT, TG);
T1B = FNMS(KP866025403, T1A, T1v);
T1N = FMA(KP866025403, T1A, T1v);
T1U = FNMS(KP500000000, T1T, T1S);
T1W = FMA(KP866025403, T1V, T1U);
T25 = FNMS(KP866025403, T1V, T1U);
T20 = FNMS(KP500000000, T1Z, T1Y);
T22 = FMA(KP866025403, T21, T20);
T26 = FNMS(KP866025403, T21, T20);
T1C = FNMS(KP500000000, T1d, T10);
T1I = FNMS(KP866025403, T1H, T1C);
T1O = FMA(KP866025403, T1H, T1C);
}
/* Outputs 2 and 8. */
{
E T1u, T1J, T2z, T2A;
T1u = T1m + T1t;
T1J = T1B + T1I;
ri[WS(rs, 2)] = T1u - T1J;
ri[WS(rs, 8)] = T1u + T1J;
T2z = T2x + T2y;
T2A = T25 + T26;
ii[WS(rs, 2)] = T2z - T2A;
ii[WS(rs, 8)] = T2A + T2z;
}
/* Outputs 10 and 4. */
{
E T1M, T1P, T2v, T2w;
T1M = T1K + T1L;
T1P = T1N + T1O;
ri[WS(rs, 10)] = T1M - T1P;
ri[WS(rs, 4)] = T1M + T1P;
T2v = T1W + T22;
T2w = T2s + T2p;
ii[WS(rs, 4)] = T2v + T2w;
ii[WS(rs, 10)] = T2w - T2v;
}
/* Outputs 7 and 1. */
{
E T1Q, T23, T2t, T2u;
T1Q = T1K - T1L;
T23 = T1W - T22;
ri[WS(rs, 7)] = T1Q - T23;
ri[WS(rs, 1)] = T1Q + T23;
T2t = T2p - T2s;
T2u = T1N - T1O;
ii[WS(rs, 1)] = T2t - T2u;
ii[WS(rs, 7)] = T2u + T2t;
}
/* Outputs 11 and 5. */
{
E T24, T27, T2B, T2C;
T24 = T1m - T1t;
T27 = T25 - T26;
ri[WS(rs, 11)] = T24 - T27;
ri[WS(rs, 5)] = T24 + T27;
T2B = T2y - T2x;
T2C = T1B - T1I;
ii[WS(rs, 5)] = T2B - T2C;
ii[WS(rs, 11)] = T2C + T2B;
}
}
}
}
}
/*
 * Twiddle-factor instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined in project headers that are
 * not visible here; presumably this asks for the full twiddle set of a
 * radix-12 kernel -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA t1_12 kernel: the value 12, the name "t1_12", the
 * twiddle instruction list, the genus and a count table. The first three
 * counts { 72, 22, 46 } equal the additions, multiplications and fused
 * multiply/adds reported in the generator comment of this variant.
 */
static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, { 72, 22, 46, 0 }, 0, 0, 0 };
/* Registers the t1_12 kernel together with its descriptor on planner `p`. */
void X(codelet_t1_12) (planner *p) {
X(kdft_dit_register) (p, t1_12, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */
/*
* This function contains 118 FP additions, 60 FP multiplications,
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_12: radix-12 twiddle kernel, non-FMA variant (generated by gen_twiddle
 * with -n 12; do not edit by hand).
 *
 * For every m in [mb, me) the kernel transforms, in place, the twelve
 * complex values (ri[WS(rs, k)], ii[WS(rs, k)]), k = 0..11:
 *   - input k >= 1 is first combined with the twiddle pair
 *     (W[2k-2], W[2k-1]); input 0 is used unchanged;
 *   - the inputs are processed in four groups of three, (0, 4, 8),
 *     (9, 1, 5), (6, 10, 2) and (3, 7, 11), using the two constants below;
 *   - the group results are combined with additions and subtractions and
 *     stored back into the same twelve slots.
 *
 * Parameters:
 *   ri, ii   real / imaginary data, read and overwritten; both advance by ms
 *            after each iteration
 *   W        twiddle table; starts at offset mb * 22 and advances by 22
 *            (eleven pairs) per iteration
 *   rs       stride between the twelve elements, applied through WS()
 *   mb, me   half-open iteration range
 *   ms       step of ri / ii between iterations
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R and MAKE_VOLATILE_STRIDE come from
 * project headers not visible here. The code reads as FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants, in order: 1/2, sqrt(3)/2. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
E T1, T1W, T18, T21, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
E T1G, Ti, T1S, T1d, T24, Tt, T1a, T1T, T25, TA, T1z, T1j, T1y, TL, T1g;
E T1A, T1B;
/* Group (0, 4, 8): input 0 untwiddled, inputs 4 and 8 twiddled. */
{
E T6, T16, Tb, T17;
T1 = ri[0];
T1W = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 4)];
T5 = ii[WS(rs, 4)];
T2 = W[6];
T4 = W[7];
T6 = FMA(T2, T3, T4 * T5);
T16 = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 8)];
Ta = ii[WS(rs, 8)];
T7 = W[14];
T9 = W[15];
Tb = FMA(T7, T8, T9 * Ta);
T17 = FNMS(T9, T8, T7 * Ta);
}
T18 = KP866025403 * (T16 - T17);
T21 = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
T15 = FNMS(KP500000000, Tc, T1);
T1V = T16 + T17;
T22 = FNMS(KP500000000, T1V, T1W);
}
/* Group (9, 1, 5): all three inputs twiddled. */
{
E T11, T1n, TW, T1m;
{
E TO, TQ, TN, TP;
TO = ri[WS(rs, 9)];
TQ = ii[WS(rs, 9)];
TN = W[16];
TP = W[17];
TR = FMA(TN, TO, TP * TQ);
T1E = FNMS(TP, TO, TN * TQ);
}
{
E TY, T10, TX, TZ;
TY = ri[WS(rs, 5)];
T10 = ii[WS(rs, 5)];
TX = W[8];
TZ = W[9];
T11 = FMA(TX, TY, TZ * T10);
T1n = FNMS(TZ, TY, TX * T10);
}
{
E TT, TV, TS, TU;
TT = ri[WS(rs, 1)];
TV = ii[WS(rs, 1)];
TS = W[0];
TU = W[1];
TW = FMA(TS, TT, TU * TV);
T1m = FNMS(TU, TT, TS * TV);
}
T1o = KP866025403 * (T1m - T1n);
T1D = KP866025403 * (T11 - TW);
T12 = TW + T11;
T1l = FNMS(KP500000000, T12, TR);
T1F = T1m + T1n;
T1G = FNMS(KP500000000, T1F, T1E);
}
/* Group (6, 10, 2): all three inputs twiddled. */
{
E Ts, T1c, Tn, T1b;
{
E Tf, Th, Te, Tg;
Tf = ri[WS(rs, 6)];
Th = ii[WS(rs, 6)];
Te = W[10];
Tg = W[11];
Ti = FMA(Te, Tf, Tg * Th);
T1S = FNMS(Tg, Tf, Te * Th);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 2)];
Tr = ii[WS(rs, 2)];
To = W[2];
Tq = W[3];
Ts = FMA(To, Tp, Tq * Tr);
T1c = FNMS(Tq, Tp, To * Tr);
}
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 10)];
Tm = ii[WS(rs, 10)];
Tj = W[18];
Tl = W[19];
Tn = FMA(Tj, Tk, Tl * Tm);
T1b = FNMS(Tl, Tk, Tj * Tm);
}
T1d = KP866025403 * (T1b - T1c);
T24 = KP866025403 * (Ts - Tn);
Tt = Tn + Ts;
T1a = FNMS(KP500000000, Tt, Ti);
T1T = T1b + T1c;
T25 = FNMS(KP500000000, T1T, T1S);
}
/* Group (3, 7, 11): all three inputs twiddled. */
{
E TK, T1i, TF, T1h;
{
E Tx, Tz, Tw, Ty;
Tx = ri[WS(rs, 3)];
Tz = ii[WS(rs, 3)];
Tw = W[4];
Ty = W[5];
TA = FMA(Tw, Tx, Ty * Tz);
T1z = FNMS(Ty, Tx, Tw * Tz);
}
{
E TH, TJ, TG, TI;
TH = ri[WS(rs, 11)];
TJ = ii[WS(rs, 11)];
TG = W[20];
TI = W[21];
TK = FMA(TG, TH, TI * TJ);
T1i = FNMS(TI, TH, TG * TJ);
}
{
E TC, TE, TB, TD;
TC = ri[WS(rs, 7)];
TE = ii[WS(rs, 7)];
TB = W[12];
TD = W[13];
TF = FMA(TB, TC, TD * TE);
T1h = FNMS(TD, TC, TB * TE);
}
T1j = KP866025403 * (T1h - T1i);
T1y = KP866025403 * (TK - TF);
TL = TF + TK;
T1g = FNMS(KP500000000, TL, TA);
T1A = T1h + T1i;
T1B = FNMS(KP500000000, T1A, T1z);
}
/* Outputs 6, 0, 3 and 9, built from the plain sums of the four groups. */
{
E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
{
E Td, Tu, T1U, T1X;
Td = T1 + Tc;
Tu = Ti + Tt;
Tv = Td + Tu;
T1N = Td - Tu;
T1U = T1S + T1T;
T1X = T1V + T1W;
T1Y = T1U + T1X;
T20 = T1X - T1U;
}
{
E TM, T13, T1O, T1P;
TM = TA + TL;
T13 = TR + T12;
T14 = TM + T13;
T1Z = TM - T13;
T1O = T1z + T1A;
T1P = T1E + T1F;
T1Q = T1O - T1P;
T1R = T1O + T1P;
}
ri[WS(rs, 6)] = Tv - T14;
ii[WS(rs, 6)] = T1Y - T1R;
ri[0] = Tv + T14;
ii[0] = T1R + T1Y;
ri[WS(rs, 3)] = T1N - T1Q;
ii[WS(rs, 3)] = T1Z + T20;
ri[WS(rs, 9)] = T1N + T1Q;
ii[WS(rs, 9)] = T20 - T1Z;
}
/* Outputs 10, 4, 7 and 1. */
{
E T1t, T1x, T27, T2a, T1w, T28, T1I, T29;
{
E T1r, T1s, T23, T26;
T1r = T15 + T18;
T1s = T1a + T1d;
T1t = T1r + T1s;
T1x = T1r - T1s;
T23 = T21 + T22;
T26 = T24 + T25;
T27 = T23 - T26;
T2a = T26 + T23;
}
{
E T1u, T1v, T1C, T1H;
T1u = T1g + T1j;
T1v = T1l + T1o;
T1w = T1u + T1v;
T28 = T1u - T1v;
T1C = T1y + T1B;
T1H = T1D + T1G;
T1I = T1C - T1H;
T29 = T1C + T1H;
}
ri[WS(rs, 10)] = T1t - T1w;
ii[WS(rs, 10)] = T2a - T29;
ri[WS(rs, 4)] = T1t + T1w;
ii[WS(rs, 4)] = T29 + T2a;
ri[WS(rs, 7)] = T1x - T1I;
ii[WS(rs, 7)] = T28 + T27;
ri[WS(rs, 1)] = T1x + T1I;
ii[WS(rs, 1)] = T27 - T28;
}
/* Outputs 2, 8, 11 and 5. */
{
E T1f, T1J, T2d, T2f, T1q, T2g, T1M, T2e;
{
E T19, T1e, T2b, T2c;
T19 = T15 - T18;
T1e = T1a - T1d;
T1f = T19 + T1e;
T1J = T19 - T1e;
T2b = T25 - T24;
T2c = T22 - T21;
T2d = T2b + T2c;
T2f = T2c - T2b;
}
{
E T1k, T1p, T1K, T1L;
T1k = T1g - T1j;
T1p = T1l - T1o;
T1q = T1k + T1p;
T2g = T1k - T1p;
T1K = T1B - T1y;
T1L = T1G - T1D;
T1M = T1K - T1L;
T2e = T1K + T1L;
}
ri[WS(rs, 2)] = T1f - T1q;
ii[WS(rs, 2)] = T2d - T2e;
ri[WS(rs, 8)] = T1f + T1q;
ii[WS(rs, 8)] = T2e + T2d;
ri[WS(rs, 11)] = T1J - T1M;
ii[WS(rs, 11)] = T2g + T2f;
ri[WS(rs, 5)] = T1J + T1M;
ii[WS(rs, 5)] = T2f - T2g;
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this file;
 * presumably the first entry requests the full twiddle set for size 12 and the
 * second entry terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 12 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: first field 12, name "t1_12", the table above, &GENUS and
 * a four-number tuple.
 * NOTE(review): the tuple presumably holds the operation counts (additions,
 * multiplications, fused multiply/adds, other) of this variant; the meaning of
 * the trailing zeros is not visible here -- confirm against ct_desc. */
static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, { 88, 30, 30, 0 }, 0, 0, 0 };
/* Registration entry point: hands t1_12 and its descriptor to
 * X(kdft_dit_register) for planner p. */
void X(codelet_t1_12) (planner *p) {
X(kdft_dit_register) (p, t1_12, &desc);
}
#endif
+816
View File
@@ -0,0 +1,816 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
/*
* This function contains 184 FP additions, 140 FP multiplications,
* (or, 72 additions, 28 multiplications, 112 fused multiply/add),
* 51 stack variables, 6 constants, and 60 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_15 (FMA variant): generated twiddle codelet for n = 15 (see the generator
 * command line in the comment above).
 *
 * For each m in [mb, me) it reads 15 complex values from ri[]/ii[] at offsets
 * WS(rs, 0) .. WS(rs, 14), scales inputs 1..14 by per-input factors taken from
 * W, combines them, and writes the 15 results back in place to the same
 * offsets.
 *
 *   ri, ii  real and imaginary arrays, read and overwritten; both advance by
 *           ms after every iteration
 *   W       factor table; input k uses the pair W[2k-2], W[2k-1].  The pointer
 *           is first moved forward by mb * 28 and then by 28 per iteration
 *   rs      stride object used through WS(rs, k) to locate element k
 *   mb, me  half-open iteration range
 *   ms      element distance between consecutive iterations
 *
 * Assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (macros from the included headers, not visible
 * here -- confirm), each scaled input is
 *   re = Wr * x_re + Wi * x_im,   im = Wr * x_im - Wi * x_re.
 */
static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the combinations below. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* One pass per m; MAKE_VOLATILE_STRIDE is an opaque macro from the headers. */
for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
/* Intermediates that survive from the input stage to the output stage. */
E T1, T3j, T1G, T3u, Te, T1B, T3i, T3t, T1y, T2i, T2a, T2M, T37, T2V, Tz;
E T2e, T1O, T2t, T39, T2X, TT, T2f, T1V, T2z, T3a, T2Y, T1e, T2h, T23, T2G;
E T36, T2U;
/* Inputs 0, 5 and 10: input 0 is used unscaled, 5 and 10 are scaled by W. */
{
E T7, T1D, Td, T1F;
T1 = ri[0];
T3j = ii[0];
{
E T3, T6, T4, T1C, T2, T5;
T3 = ri[WS(rs, 5)];
T6 = ii[WS(rs, 5)];
T2 = W[8];
T4 = T2 * T3;
T1C = T2 * T6;
T5 = W[9];
T7 = FMA(T5, T6, T4);
T1D = FNMS(T5, T3, T1C);
}
{
E T9, Tc, Ta, T1E, T8, Tb;
T9 = ri[WS(rs, 10)];
Tc = ii[WS(rs, 10)];
T8 = W[18];
Ta = T8 * T9;
T1E = T8 * Tc;
Tb = W[19];
Td = FMA(Tb, Tc, Ta);
T1F = FNMS(Tb, T9, T1E);
}
/* Sums and differences of the two scaled inputs, kept for the output stage. */
T1G = T1D - T1F;
T3u = Td - T7;
Te = T7 + Td;
T1B = FNMS(KP500000000, Te, T1);
T3i = T1D + T1F;
T3t = FNMS(KP500000000, T3i, T3j);
}
/* Inputs 9, 4 and 14: scale by W, then combine the three values. */
{
E T1k, T2I, T1w, T28, T1q, T26;
{
E T1g, T1j, T1h, T2H, T1f, T1i;
T1g = ri[WS(rs, 9)];
T1j = ii[WS(rs, 9)];
T1f = W[16];
T1h = T1f * T1g;
T2H = T1f * T1j;
T1i = W[17];
T1k = FMA(T1i, T1j, T1h);
T2I = FNMS(T1i, T1g, T2H);
}
{
E T1s, T1v, T1t, T27, T1r, T1u;
T1s = ri[WS(rs, 4)];
T1v = ii[WS(rs, 4)];
T1r = W[6];
T1t = T1r * T1s;
T27 = T1r * T1v;
T1u = W[7];
T1w = FMA(T1u, T1v, T1t);
T28 = FNMS(T1u, T1s, T27);
}
{
E T1m, T1p, T1n, T25, T1l, T1o;
T1m = ri[WS(rs, 14)];
T1p = ii[WS(rs, 14)];
T1l = W[26];
T1n = T1l * T1m;
T25 = T1l * T1p;
T1o = W[27];
T1q = FMA(T1o, T1p, T1n);
T26 = FNMS(T1o, T1m, T25);
}
/* Three-input combination using the constants 1/2 and 0.866... */
{
E T29, T1x, T24, T2L, T2J, T2K;
T29 = T26 - T28;
T1x = T1q + T1w;
T24 = FNMS(KP500000000, T1x, T1k);
T1y = T1k + T1x;
T2i = FMA(KP866025403, T29, T24);
T2a = FNMS(KP866025403, T29, T24);
T2L = T1w - T1q;
T2J = T26 + T28;
T2K = FNMS(KP500000000, T2J, T2I);
T2M = FMA(KP866025403, T2L, T2K);
T37 = T2I + T2J;
T2V = FNMS(KP866025403, T2L, T2K);
}
}
/* Inputs 3, 13 and 8: scale by W, then combine the three values. */
{
E Tl, T2p, Tx, T1M, Tr, T1K;
{
E Th, Tk, Ti, T2o, Tg, Tj;
Th = ri[WS(rs, 3)];
Tk = ii[WS(rs, 3)];
Tg = W[4];
Ti = Tg * Th;
T2o = Tg * Tk;
Tj = W[5];
Tl = FMA(Tj, Tk, Ti);
T2p = FNMS(Tj, Th, T2o);
}
{
E Tt, Tw, Tu, T1L, Ts, Tv;
Tt = ri[WS(rs, 13)];
Tw = ii[WS(rs, 13)];
Ts = W[24];
Tu = Ts * Tt;
T1L = Ts * Tw;
Tv = W[25];
Tx = FMA(Tv, Tw, Tu);
T1M = FNMS(Tv, Tt, T1L);
}
{
E Tn, Tq, To, T1J, Tm, Tp;
Tn = ri[WS(rs, 8)];
Tq = ii[WS(rs, 8)];
Tm = W[14];
To = Tm * Tn;
T1J = Tm * Tq;
Tp = W[15];
Tr = FMA(Tp, Tq, To);
T1K = FNMS(Tp, Tn, T1J);
}
{
E T1N, Ty, T1I, T2s, T2q, T2r;
T1N = T1K - T1M;
Ty = Tr + Tx;
T1I = FNMS(KP500000000, Ty, Tl);
Tz = Tl + Ty;
T2e = FMA(KP866025403, T1N, T1I);
T1O = FNMS(KP866025403, T1N, T1I);
T2s = Tx - Tr;
T2q = T1K + T1M;
T2r = FNMS(KP500000000, T2q, T2p);
T2t = FMA(KP866025403, T2s, T2r);
T39 = T2p + T2q;
T2X = FNMS(KP866025403, T2s, T2r);
}
}
/* Inputs 12, 7 and 2: scale by W, then combine the three values. */
{
E TF, T2v, TR, T1T, TL, T1R;
{
E TB, TE, TC, T2u, TA, TD;
TB = ri[WS(rs, 12)];
TE = ii[WS(rs, 12)];
TA = W[22];
TC = TA * TB;
T2u = TA * TE;
TD = W[23];
TF = FMA(TD, TE, TC);
T2v = FNMS(TD, TB, T2u);
}
{
E TN, TQ, TO, T1S, TM, TP;
TN = ri[WS(rs, 7)];
TQ = ii[WS(rs, 7)];
TM = W[12];
TO = TM * TN;
T1S = TM * TQ;
TP = W[13];
TR = FMA(TP, TQ, TO);
T1T = FNMS(TP, TN, T1S);
}
{
E TH, TK, TI, T1Q, TG, TJ;
TH = ri[WS(rs, 2)];
TK = ii[WS(rs, 2)];
TG = W[2];
TI = TG * TH;
T1Q = TG * TK;
TJ = W[3];
TL = FMA(TJ, TK, TI);
T1R = FNMS(TJ, TH, T1Q);
}
{
E T1U, TS, T1P, T2y, T2w, T2x;
T1U = T1R - T1T;
TS = TL + TR;
T1P = FNMS(KP500000000, TS, TF);
TT = TF + TS;
T2f = FMA(KP866025403, T1U, T1P);
T1V = FNMS(KP866025403, T1U, T1P);
T2y = TR - TL;
T2w = T1R + T1T;
T2x = FNMS(KP500000000, T2w, T2v);
T2z = FMA(KP866025403, T2y, T2x);
T3a = T2v + T2w;
T2Y = FNMS(KP866025403, T2y, T2x);
}
}
/* Inputs 6, 1 and 11: scale by W, then combine the three values. */
{
E T10, T2C, T1c, T21, T16, T1Z;
{
E TW, TZ, TX, T2B, TV, TY;
TW = ri[WS(rs, 6)];
TZ = ii[WS(rs, 6)];
TV = W[10];
TX = TV * TW;
T2B = TV * TZ;
TY = W[11];
T10 = FMA(TY, TZ, TX);
T2C = FNMS(TY, TW, T2B);
}
{
E T18, T1b, T19, T20, T17, T1a;
T18 = ri[WS(rs, 1)];
T1b = ii[WS(rs, 1)];
T17 = W[0];
T19 = T17 * T18;
T20 = T17 * T1b;
T1a = W[1];
T1c = FMA(T1a, T1b, T19);
T21 = FNMS(T1a, T18, T20);
}
{
E T12, T15, T13, T1Y, T11, T14;
T12 = ri[WS(rs, 11)];
T15 = ii[WS(rs, 11)];
T11 = W[20];
T13 = T11 * T12;
T1Y = T11 * T15;
T14 = W[21];
T16 = FMA(T14, T15, T13);
T1Z = FNMS(T14, T12, T1Y);
}
{
E T22, T1d, T1X, T2F, T2D, T2E;
T22 = T1Z - T21;
T1d = T16 + T1c;
T1X = FNMS(KP500000000, T1d, T10);
T1e = T10 + T1d;
T2h = FMA(KP866025403, T22, T1X);
T23 = FNMS(KP866025403, T22, T1X);
T2F = T1c - T16;
T2D = T1Z + T21;
T2E = FNMS(KP500000000, T2D, T2C);
T2G = FMA(KP866025403, T2F, T2E);
T36 = T2C + T2D;
T2U = FNMS(KP866025403, T2F, T2E);
}
}
/* Output stage: real parts of outputs 0, 9, 6, 12 and 3. */
{
E T3c, T3e, Tf, T1A, T33, T34, T3d, T35;
{
E T38, T3b, TU, T1z;
T38 = T36 - T37;
T3b = T39 - T3a;
T3c = FNMS(KP618033988, T3b, T38);
T3e = FMA(KP618033988, T38, T3b);
Tf = T1 + Te;
TU = Tz + TT;
T1z = T1e + T1y;
T1A = TU + T1z;
T33 = FNMS(KP250000000, T1A, Tf);
T34 = TU - T1z;
}
ri[0] = Tf + T1A;
T3d = FMA(KP559016994, T34, T33);
ri[WS(rs, 9)] = FNMS(KP951056516, T3e, T3d);
ri[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
T35 = FNMS(KP559016994, T34, T33);
ri[WS(rs, 12)] = FNMS(KP951056516, T3c, T35);
ri[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
}
/* Imaginary parts of outputs 0, 6, 9, 3 and 12. */
{
E T3q, T3s, T3k, T3h, T3l, T3m, T3r, T3n;
{
E T3o, T3p, T3f, T3g;
T3o = T1e - T1y;
T3p = Tz - TT;
T3q = FNMS(KP618033988, T3p, T3o);
T3s = FMA(KP618033988, T3o, T3p);
T3k = T3i + T3j;
T3f = T39 + T3a;
T3g = T36 + T37;
T3h = T3f + T3g;
T3l = FNMS(KP250000000, T3h, T3k);
T3m = T3f - T3g;
}
ii[0] = T3h + T3k;
T3r = FMA(KP559016994, T3m, T3l);
ii[WS(rs, 6)] = FNMS(KP951056516, T3s, T3r);
ii[WS(rs, 9)] = FMA(KP951056516, T3s, T3r);
T3n = FNMS(KP559016994, T3m, T3l);
ii[WS(rs, 3)] = FNMS(KP951056516, T3q, T3n);
ii[WS(rs, 12)] = FMA(KP951056516, T3q, T3n);
}
/* Real parts of outputs 5, 14, 11, 2 and 8. */
{
E T30, T32, T1H, T2c, T2R, T2S, T31, T2T;
{
E T2W, T2Z, T1W, T2b;
T2W = T2U - T2V;
T2Z = T2X - T2Y;
T30 = FNMS(KP618033988, T2Z, T2W);
T32 = FMA(KP618033988, T2W, T2Z);
T1H = FNMS(KP866025403, T1G, T1B);
T1W = T1O + T1V;
T2b = T23 + T2a;
T2c = T1W + T2b;
T2R = FNMS(KP250000000, T2c, T1H);
T2S = T1W - T2b;
}
ri[WS(rs, 5)] = T1H + T2c;
T31 = FMA(KP559016994, T2S, T2R);
ri[WS(rs, 14)] = FNMS(KP951056516, T32, T31);
ri[WS(rs, 11)] = FMA(KP951056516, T32, T31);
T2T = FNMS(KP559016994, T2S, T2R);
ri[WS(rs, 2)] = FNMS(KP951056516, T30, T2T);
ri[WS(rs, 8)] = FMA(KP951056516, T30, T2T);
}
/* Imaginary parts of outputs 5, 11, 14, 2 and 8. */
{
E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N;
{
E T3O, T3P, T3I, T3J;
T3O = T23 - T2a;
T3P = T1O - T1V;
T3Q = FNMS(KP618033988, T3P, T3O);
T3S = FMA(KP618033988, T3O, T3P);
T3H = FNMS(KP866025403, T3u, T3t);
T3I = T2X + T2Y;
T3J = T2U + T2V;
T3K = T3I + T3J;
T3L = FNMS(KP250000000, T3K, T3H);
T3M = T3I - T3J;
}
ii[WS(rs, 5)] = T3K + T3H;
T3R = FMA(KP559016994, T3M, T3L);
ii[WS(rs, 11)] = FNMS(KP951056516, T3S, T3R);
ii[WS(rs, 14)] = FMA(KP951056516, T3S, T3R);
T3N = FNMS(KP559016994, T3M, T3L);
ii[WS(rs, 2)] = FMA(KP951056516, T3Q, T3N);
ii[WS(rs, 8)] = FNMS(KP951056516, T3Q, T3N);
}
/* Imaginary parts of outputs 10, 7, 13, 1 and 4. */
{
E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B;
{
E T3C, T3D, T3w, T3x;
T3C = T2e - T2f;
T3D = T2h - T2i;
T3E = FMA(KP618033988, T3D, T3C);
T3G = FNMS(KP618033988, T3C, T3D);
T3v = FMA(KP866025403, T3u, T3t);
T3w = T2t + T2z;
T3x = T2G + T2M;
T3y = T3w + T3x;
T3z = FNMS(KP250000000, T3y, T3v);
T3A = T3w - T3x;
}
ii[WS(rs, 10)] = T3y + T3v;
T3F = FNMS(KP559016994, T3A, T3z);
ii[WS(rs, 7)] = FMA(KP951056516, T3G, T3F);
ii[WS(rs, 13)] = FNMS(KP951056516, T3G, T3F);
T3B = FMA(KP559016994, T3A, T3z);
ii[WS(rs, 1)] = FNMS(KP951056516, T3E, T3B);
ii[WS(rs, 4)] = FMA(KP951056516, T3E, T3B);
}
/* Real parts of outputs 10, 7, 13, 4 and 1. */
{
E T2O, T2Q, T2d, T2k, T2l, T2m, T2P, T2n;
{
E T2A, T2N, T2g, T2j;
T2A = T2t - T2z;
T2N = T2G - T2M;
T2O = FMA(KP618033988, T2N, T2A);
T2Q = FNMS(KP618033988, T2A, T2N);
T2d = FMA(KP866025403, T1G, T1B);
T2g = T2e + T2f;
T2j = T2h + T2i;
T2k = T2g + T2j;
T2l = FNMS(KP250000000, T2k, T2d);
T2m = T2g - T2j;
}
ri[WS(rs, 10)] = T2d + T2k;
T2P = FNMS(KP559016994, T2m, T2l);
ri[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
ri[WS(rs, 13)] = FMA(KP951056516, T2Q, T2P);
T2n = FMA(KP559016994, T2m, T2l);
ri[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
ri[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this file;
 * presumably the first entry requests the full twiddle set for size 15 and the
 * second entry terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 15 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: first field 15, name "t1_15", the table above, &GENUS and
 * the tuple { 72, 28, 112, 0 }.  The first three numbers equal the additions,
 * multiplications and fused multiply/adds listed in the generator comment of
 * this FMA variant.
 * NOTE(review): the fourth tuple entry and the trailing zeros are not
 * explained by anything visible here -- confirm against ct_desc. */
static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 72, 28, 112, 0 }, 0, 0, 0 };
/* Registration entry point: hands t1_15 and its descriptor to
 * X(kdft_dit_register) for planner p. */
void X(codelet_t1_15) (planner *p) {
X(kdft_dit_register) (p, t1_15, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
/*
* This function contains 184 FP additions, 112 FP multiplications,
* (or, 128 additions, 56 multiplications, 56 fused multiply/add),
* 65 stack variables, 6 constants, and 60 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_15 (non-FMA variant): generated twiddle codelet for n = 15 (see the
 * generator command line in the comment above).
 *
 * For each m in [mb, me) it reads 15 complex values from ri[]/ii[] at offsets
 * WS(rs, 0) .. WS(rs, 14), scales inputs 1..14 by per-input factors taken from
 * W, combines them, and writes the 15 results back in place to the same
 * offsets.
 *
 *   ri, ii  real and imaginary arrays, read and overwritten; both advance by
 *           ms after every iteration
 *   W       factor table; input k uses the pair W[2k-2], W[2k-1].  The pointer
 *           is first moved forward by mb * 28 and then by 28 per iteration
 *   rs      stride object used through WS(rs, k) to locate element k
 *   mb, me  half-open iteration range
 *   ms      element distance between consecutive iterations
 *
 * Assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (macros from the included headers, not visible
 * here -- confirm), each scaled input is
 *   re = Wr * x_re + Wi * x_im,   im = Wr * x_im - Wi * x_re.
 */
static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the combinations below. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* One pass per m; MAKE_VOLATILE_STRIDE is an opaque macro from the headers. */
for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
/* Intermediates that survive from the input stage to the output stage. */
E T1q, T34, Td, T1n, T2S, T35, T13, T1k, T1l, T2E, T2F, T2O, T1H, T1T, T2k;
E T2t, T2f, T2s, T1M, T1U, Tu, TL, TM, T2H, T2I, T2N, T1w, T1Q, T29, T2w;
E T24, T2v, T1B, T1R;
/* Inputs 0, 5 and 10: input 0 is used unscaled, 5 and 10 are scaled by W. */
{
E T1, T2R, T6, T1o, Tb, T1p, Tc, T2Q;
T1 = ri[0];
T2R = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 5)];
T5 = ii[WS(rs, 5)];
T2 = W[8];
T4 = W[9];
T6 = FMA(T2, T3, T4 * T5);
T1o = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 10)];
Ta = ii[WS(rs, 10)];
T7 = W[18];
T9 = W[19];
Tb = FMA(T7, T8, T9 * Ta);
T1p = FNMS(T9, T8, T7 * Ta);
}
/* Three-input combination using the constants 1/2 and 0.866... */
T1q = KP866025403 * (T1o - T1p);
T34 = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
Td = T1 + Tc;
T1n = FNMS(KP500000000, Tc, T1);
T2Q = T1o + T1p;
T2S = T2Q + T2R;
T35 = FNMS(KP500000000, T2Q, T2R);
}
/* Inputs 6, 11, 1 and 9, 14, 4: scale by W, then combine each triple. */
{
E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
E T2i;
{
E TO, TQ, TN, TP;
TO = ri[WS(rs, 6)];
TQ = ii[WS(rs, 6)];
TN = W[10];
TP = W[11];
TR = FMA(TN, TO, TP * TQ);
T2c = FNMS(TP, TO, TN * TQ);
}
{
E T15, T17, T14, T16;
T15 = ri[WS(rs, 9)];
T17 = ii[WS(rs, 9)];
T14 = W[16];
T16 = W[17];
T18 = FMA(T14, T15, T16 * T17);
T2h = FNMS(T16, T15, T14 * T17);
}
{
E TT, TV, TS, TU;
TT = ri[WS(rs, 11)];
TV = ii[WS(rs, 11)];
TS = W[20];
TU = W[21];
TW = FMA(TS, TT, TU * TV);
T1E = FNMS(TU, TT, TS * TV);
}
{
E TY, T10, TX, TZ;
TY = ri[WS(rs, 1)];
T10 = ii[WS(rs, 1)];
TX = W[0];
TZ = W[1];
T11 = FMA(TX, TY, TZ * T10);
T1F = FNMS(TZ, TY, TX * T10);
}
T12 = TW + T11;
T2d = T1E + T1F;
{
E T1a, T1c, T19, T1b;
T1a = ri[WS(rs, 14)];
T1c = ii[WS(rs, 14)];
T19 = W[26];
T1b = W[27];
T1d = FMA(T19, T1a, T1b * T1c);
T1J = FNMS(T1b, T1a, T19 * T1c);
}
{
E T1f, T1h, T1e, T1g;
T1f = ri[WS(rs, 4)];
T1h = ii[WS(rs, 4)];
T1e = W[6];
T1g = W[7];
T1i = FMA(T1e, T1f, T1g * T1h);
T1K = FNMS(T1g, T1f, T1e * T1h);
}
T1j = T1d + T1i;
T2i = T1J + T1K;
/* Sums and differences of the two triples, kept for the output stage. */
{
E T1D, T1G, T2g, T2j;
T13 = TR + T12;
T1k = T18 + T1j;
T1l = T13 + T1k;
T2E = T2c + T2d;
T2F = T2h + T2i;
T2O = T2E + T2F;
T1D = FNMS(KP500000000, T12, TR);
T1G = KP866025403 * (T1E - T1F);
T1H = T1D - T1G;
T1T = T1D + T1G;
T2g = KP866025403 * (T1i - T1d);
T2j = FNMS(KP500000000, T2i, T2h);
T2k = T2g + T2j;
T2t = T2j - T2g;
{
E T2b, T2e, T1I, T1L;
T2b = KP866025403 * (T11 - TW);
T2e = FNMS(KP500000000, T2d, T2c);
T2f = T2b + T2e;
T2s = T2e - T2b;
T1I = FNMS(KP500000000, T1j, T18);
T1L = KP866025403 * (T1J - T1K);
T1M = T1I - T1L;
T1U = T1I + T1L;
}
}
}
/* Inputs 3, 8, 13 and 12, 2, 7: scale by W, then combine each triple. */
{
E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
E T27;
{
E Tf, Th, Te, Tg;
Tf = ri[WS(rs, 3)];
Th = ii[WS(rs, 3)];
Te = W[4];
Tg = W[5];
Ti = FMA(Te, Tf, Tg * Th);
T21 = FNMS(Tg, Tf, Te * Th);
}
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 12)];
Ty = ii[WS(rs, 12)];
Tv = W[22];
Tx = W[23];
Tz = FMA(Tv, Tw, Tx * Ty);
T26 = FNMS(Tx, Tw, Tv * Ty);
}
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 8)];
Tm = ii[WS(rs, 8)];
Tj = W[14];
Tl = W[15];
Tn = FMA(Tj, Tk, Tl * Tm);
T1t = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 13)];
Tr = ii[WS(rs, 13)];
To = W[24];
Tq = W[25];
Ts = FMA(To, Tp, Tq * Tr);
T1u = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn + Ts;
T22 = T1t + T1u;
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 2)];
TD = ii[WS(rs, 2)];
TA = W[2];
TC = W[3];
TE = FMA(TA, TB, TC * TD);
T1y = FNMS(TC, TB, TA * TD);
}
{
E TG, TI, TF, TH;
TG = ri[WS(rs, 7)];
TI = ii[WS(rs, 7)];
TF = W[12];
TH = W[13];
TJ = FMA(TF, TG, TH * TI);
T1z = FNMS(TH, TG, TF * TI);
}
TK = TE + TJ;
T27 = T1y + T1z;
/* Sums and differences of the two triples, kept for the output stage. */
{
E T1s, T1v, T25, T28;
Tu = Ti + Tt;
TL = Tz + TK;
TM = Tu + TL;
T2H = T21 + T22;
T2I = T26 + T27;
T2N = T2H + T2I;
T1s = FNMS(KP500000000, Tt, Ti);
T1v = KP866025403 * (T1t - T1u);
T1w = T1s - T1v;
T1Q = T1s + T1v;
T25 = KP866025403 * (TJ - TE);
T28 = FNMS(KP500000000, T27, T26);
T29 = T25 + T28;
T2w = T28 - T25;
{
E T20, T23, T1x, T1A;
T20 = KP866025403 * (Ts - Tn);
T23 = FNMS(KP500000000, T22, T21);
T24 = T20 + T23;
T2v = T23 - T20;
T1x = FNMS(KP500000000, TK, Tz);
T1A = KP866025403 * (T1y - T1z);
T1B = T1x - T1A;
T1R = T1x + T1A;
}
}
}
/* Output stage: real parts of outputs 0, 9, 6, 12 and 3. */
{
E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
T2C = KP559016994 * (TM - T1l);
T1m = TM + T1l;
T2B = FNMS(KP250000000, T1m, Td);
T2G = T2E - T2F;
T2J = T2H - T2I;
T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
ri[0] = Td + T1m;
T2L = T2C + T2B;
ri[WS(rs, 9)] = T2L - T2M;
ri[WS(rs, 6)] = T2L + T2M;
T2D = T2B - T2C;
ri[WS(rs, 12)] = T2D - T2K;
ri[WS(rs, 3)] = T2D + T2K;
}
/* Imaginary parts of outputs 0, 6, 9, 3 and 12. */
{
E T2U, T2P, T2T, T2Y, T30, T2W, T2X, T2Z, T2V;
T2U = KP559016994 * (T2N - T2O);
T2P = T2N + T2O;
T2T = FNMS(KP250000000, T2P, T2S);
T2W = T13 - T1k;
T2X = Tu - TL;
T2Y = FNMS(KP587785252, T2X, KP951056516 * T2W);
T30 = FMA(KP951056516, T2X, KP587785252 * T2W);
ii[0] = T2P + T2S;
T2Z = T2U + T2T;
ii[WS(rs, 6)] = T2Z - T30;
ii[WS(rs, 9)] = T30 + T2Z;
T2V = T2T - T2U;
ii[WS(rs, 3)] = T2V - T2Y;
ii[WS(rs, 12)] = T2Y + T2V;
}
/* Real parts of outputs 5, 14, 11, 2 and 8. */
{
E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
{
E T2u, T2x, T1C, T1N;
T2u = T2s - T2t;
T2x = T2v - T2w;
T2y = FNMS(KP587785252, T2x, KP951056516 * T2u);
T2A = FMA(KP951056516, T2x, KP587785252 * T2u);
T1r = T1n - T1q;
T1C = T1w + T1B;
T1N = T1H + T1M;
T1O = T1C + T1N;
T2p = FNMS(KP250000000, T1O, T1r);
T2q = KP559016994 * (T1C - T1N);
}
ri[WS(rs, 5)] = T1r + T1O;
T2z = T2q + T2p;
ri[WS(rs, 14)] = T2z - T2A;
ri[WS(rs, 11)] = T2z + T2A;
T2r = T2p - T2q;
ri[WS(rs, 2)] = T2r - T2y;
ri[WS(rs, 8)] = T2r + T2y;
}
/* Imaginary parts of outputs 5, 11, 14, 2 and 8. */
{
E T3h, T3q, T3i, T3l, T3m, T3n, T3p, T3o;
{
E T3f, T3g, T3j, T3k;
T3f = T1H - T1M;
T3g = T1w - T1B;
T3h = FNMS(KP587785252, T3g, KP951056516 * T3f);
T3q = FMA(KP951056516, T3g, KP587785252 * T3f);
T3i = T35 - T34;
T3j = T2v + T2w;
T3k = T2s + T2t;
T3l = T3j + T3k;
T3m = FNMS(KP250000000, T3l, T3i);
T3n = KP559016994 * (T3j - T3k);
}
ii[WS(rs, 5)] = T3l + T3i;
T3p = T3n + T3m;
ii[WS(rs, 11)] = T3p - T3q;
ii[WS(rs, 14)] = T3q + T3p;
T3o = T3m - T3n;
ii[WS(rs, 2)] = T3h + T3o;
ii[WS(rs, 8)] = T3o - T3h;
}
/* Imaginary parts of outputs 10, 7, 13, 1 and 4. */
{
E T3c, T3d, T36, T37, T33, T38, T3e, T39;
{
E T3a, T3b, T31, T32;
T3a = T1Q - T1R;
T3b = T1T - T1U;
T3c = FMA(KP951056516, T3a, KP587785252 * T3b);
T3d = FNMS(KP587785252, T3a, KP951056516 * T3b);
T36 = T34 + T35;
T31 = T24 + T29;
T32 = T2f + T2k;
T37 = T31 + T32;
T33 = KP559016994 * (T31 - T32);
T38 = FNMS(KP250000000, T37, T36);
}
ii[WS(rs, 10)] = T37 + T36;
T3e = T38 - T33;
ii[WS(rs, 7)] = T3d + T3e;
ii[WS(rs, 13)] = T3e - T3d;
T39 = T33 + T38;
ii[WS(rs, 1)] = T39 - T3c;
ii[WS(rs, 4)] = T3c + T39;
}
/* Real parts of outputs 10, 7, 13, 4 and 1. */
{
E T2m, T2o, T1P, T1W, T1X, T1Y, T2n, T1Z;
{
E T2a, T2l, T1S, T1V;
T2a = T24 - T29;
T2l = T2f - T2k;
T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
T1P = T1n + T1q;
T1S = T1Q + T1R;
T1V = T1T + T1U;
T1W = T1S + T1V;
T1X = KP559016994 * (T1S - T1V);
T1Y = FNMS(KP250000000, T1W, T1P);
}
ri[WS(rs, 10)] = T1P + T1W;
T2n = T1Y - T1X;
ri[WS(rs, 7)] = T2n - T2o;
ri[WS(rs, 13)] = T2n + T2o;
T1Z = T1X + T1Y;
ri[WS(rs, 4)] = T1Z - T2m;
ri[WS(rs, 1)] = T1Z + T2m;
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this file;
 * presumably the first entry requests the full twiddle set for size 15 and the
 * second entry terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 15 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: first field 15, name "t1_15", the table above, &GENUS and
 * the tuple { 128, 56, 56, 0 }.  The first three numbers equal the additions,
 * multiplications and fused multiply/adds listed in the generator comment of
 * this non-FMA variant.
 * NOTE(review): the fourth tuple entry and the trailing zeros are not
 * explained by anything visible here -- confirm against ct_desc. */
static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 128, 56, 56, 0 }, 0, 0, 0 };
/* Registration entry point: hands t1_15 and its descriptor to
 * X(kdft_dit_register) for planner p. */
void X(codelet_t1_15) (planner *p) {
X(kdft_dit_register) (p, t1_15, &desc);
}
#endif
+796
View File
@@ -0,0 +1,796 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
/*
* This function contains 174 FP additions, 100 FP multiplications,
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
* 60 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_16 (FMA variant): generated twiddle codelet for n = 16 (see the generator
 * command line in the comment above).
 *
 * For each m in [mb, me) it reads 16 complex values from ri[]/ii[] at offsets
 * WS(rs, 0) .. WS(rs, 15), scales inputs 1..15 by per-input factors taken from
 * W, combines them, and writes the 16 results back in place to the same
 * offsets.
 *
 *   ri, ii  real and imaginary arrays, read and overwritten; both advance by
 *           ms after every iteration
 *   W       factor table; input k uses the pair W[2k-2], W[2k-1].  The pointer
 *           is first moved forward by mb * 30 and then by 30 per iteration
 *   rs      stride object used through WS(rs, k) to locate element k
 *   mb, me  half-open iteration range
 *   ms      element distance between consecutive iterations
 *
 * Assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (macros from the included headers, not visible
 * here -- confirm), each scaled input is
 *   re = Wr * x_re + Wi * x_im,   im = Wr * x_im - Wi * x_re.
 */
static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the combinations below. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* One pass per m; MAKE_VOLATILE_STRIDE is an opaque macro from the headers. */
for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediates that survive from the input stage to the output stage. */
E T8, T3z, T1I, T3o, T1s, T35, T2o, T2r, T1F, T36, T2p, T2w, Tl, T3A, T1N;
E T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W;
E T1W, T21;
/* Inputs 0 and 8: input 8 is scaled by W, then sum and difference with 0. */
{
E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
T1 = ri[0];
T3n = ii[0];
T3 = ri[WS(rs, 8)];
T6 = ii[WS(rs, 8)];
T2 = W[14];
T4 = T2 * T3;
T3l = T2 * T6;
T5 = W[15];
T7 = FMA(T5, T6, T4);
T3m = FNMS(T5, T3, T3l);
T8 = T1 + T7;
T3z = T3n - T3m;
T1I = T1 - T7;
T3o = T3m + T3n;
}
/* Inputs 15 and 7: scale by W, then sum and difference. */
{
E T1h, T1k, T1i, T2k, T1n, T1q, T1o, T2m, T1g, T1m;
T1h = ri[WS(rs, 15)];
T1k = ii[WS(rs, 15)];
T1g = W[28];
T1i = T1g * T1h;
T2k = T1g * T1k;
T1n = ri[WS(rs, 7)];
T1q = ii[WS(rs, 7)];
T1m = W[12];
T1o = T1m * T1n;
T2m = T1m * T1q;
{
E T1l, T2l, T1r, T2n, T1j, T1p;
T1j = W[29];
T1l = FMA(T1j, T1k, T1i);
T2l = FNMS(T1j, T1h, T2k);
T1p = W[13];
T1r = FMA(T1p, T1q, T1o);
T2n = FNMS(T1p, T1n, T2m);
T1s = T1l + T1r;
T35 = T2l + T2n;
T2o = T2l - T2n;
T2r = T1l - T1r;
}
}
/* Inputs 3 and 11: scale by W, then sum and difference. */
{
E T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z;
T1u = ri[WS(rs, 3)];
T1x = ii[WS(rs, 3)];
T1t = W[4];
T1v = T1t * T1u;
T2s = T1t * T1x;
T1A = ri[WS(rs, 11)];
T1D = ii[WS(rs, 11)];
T1z = W[20];
T1B = T1z * T1A;
T2u = T1z * T1D;
{
E T1y, T2t, T1E, T2v, T1w, T1C;
T1w = W[5];
T1y = FMA(T1w, T1x, T1v);
T2t = FNMS(T1w, T1u, T2s);
T1C = W[21];
T1E = FMA(T1C, T1D, T1B);
T2v = FNMS(T1C, T1A, T2u);
T1F = T1y + T1E;
T36 = T2t + T2v;
T2p = T1y - T1E;
T2w = T2t - T2v;
}
}
/* Inputs 4 and 12: scale by W, then sum and difference. */
{
E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
Ta = ri[WS(rs, 4)];
Td = ii[WS(rs, 4)];
T9 = W[6];
Tb = T9 * Ta;
T1J = T9 * Td;
Tg = ri[WS(rs, 12)];
Tj = ii[WS(rs, 12)];
Tf = W[22];
Th = Tf * Tg;
T1L = Tf * Tj;
{
E Te, T1K, Tk, T1M, Tc, Ti;
Tc = W[7];
Te = FMA(Tc, Td, Tb);
T1K = FNMS(Tc, Ta, T1J);
Ti = W[23];
Tk = FMA(Ti, Tj, Th);
T1M = FNMS(Ti, Tg, T1L);
Tl = Te + Tk;
T3A = Te - Tk;
T1N = T1K - T1M;
T3k = T1K + T1M;
}
}
/* Inputs 2 and 10: scale by W, then sum and difference. */
{
E To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt;
To = ri[WS(rs, 2)];
Tr = ii[WS(rs, 2)];
Tn = W[2];
Tp = Tn * To;
T1P = Tn * Tr;
Tu = ri[WS(rs, 10)];
Tx = ii[WS(rs, 10)];
Tt = W[18];
Tv = Tt * Tu;
T1R = Tt * Tx;
{
E Ts, T1Q, Ty, T1S, Tq, Tw;
Tq = W[3];
Ts = FMA(Tq, Tr, Tp);
T1Q = FNMS(Tq, To, T1P);
Tw = W[19];
Ty = FMA(Tw, Tx, Tv);
T1S = FNMS(Tw, Tu, T1R);
Tz = Ts + Ty;
T2V = T1Q + T1S;
T1T = T1Q - T1S;
T1U = Ts - Ty;
}
}
/* Inputs 1 and 9: scale by W, then sum and difference. */
{
E TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV;
TQ = ri[WS(rs, 1)];
TT = ii[WS(rs, 1)];
TP = W[0];
TR = TP * TQ;
T25 = TP * TT;
TW = ri[WS(rs, 9)];
TZ = ii[WS(rs, 9)];
TV = W[16];
TX = TV * TW;
T27 = TV * TZ;
{
E TU, T26, T10, T28, TS, TY;
TS = W[1];
TU = FMA(TS, TT, TR);
T26 = FNMS(TS, TQ, T25);
TY = W[17];
T10 = FMA(TY, TZ, TX);
T28 = FNMS(TY, TW, T27);
T11 = TU + T10;
T30 = T26 + T28;
T29 = T26 - T28;
T2c = TU - T10;
}
}
/* Inputs 5 and 13: scale by W, then sum and difference. */
{
E T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18;
T13 = ri[WS(rs, 5)];
T16 = ii[WS(rs, 5)];
T12 = W[8];
T14 = T12 * T13;
T2d = T12 * T16;
T19 = ri[WS(rs, 13)];
T1c = ii[WS(rs, 13)];
T18 = W[24];
T1a = T18 * T19;
T2f = T18 * T1c;
{
E T17, T2e, T1d, T2g, T15, T1b;
T15 = W[9];
T17 = FMA(T15, T16, T14);
T2e = FNMS(T15, T13, T2d);
T1b = W[25];
T1d = FMA(T1b, T1c, T1a);
T2g = FNMS(T1b, T19, T2f);
T1e = T17 + T1d;
T31 = T2e + T2g;
T2a = T17 - T1d;
T2h = T2e - T2g;
}
}
/* Inputs 14 and 6: scale by W, then sum and difference. */
{
E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
TB = ri[WS(rs, 14)];
TE = ii[WS(rs, 14)];
TA = W[26];
TC = TA * TB;
T1X = TA * TE;
TH = ri[WS(rs, 6)];
TK = ii[WS(rs, 6)];
TG = W[10];
TI = TG * TH;
T1Z = TG * TK;
{
E TF, T1Y, TL, T20, TD, TJ;
TD = W[27];
TF = FMA(TD, TE, TC);
T1Y = FNMS(TD, TB, T1X);
TJ = W[11];
TL = FMA(TJ, TK, TI);
T20 = FNMS(TJ, TH, T1Z);
TM = TF + TL;
T2W = T1Y + T20;
T1W = TF - TL;
T21 = T1Y - T20;
}
}
/* Output stage: outputs 8, 0, 12 and 4 (real and imaginary parts). */
{
E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
{
E Tm, TN, T3j, T3p;
Tm = T8 + Tl;
TN = Tz + TM;
TO = Tm + TN;
T3e = Tm - TN;
T3j = T2V + T2W;
T3p = T3k + T3o;
T3q = T3j + T3p;
T3s = T3p - T3j;
}
{
E T1f, T1G, T3f, T3g;
T1f = T11 + T1e;
T1G = T1s + T1F;
T1H = T1f + T1G;
T3r = T1G - T1f;
T3f = T30 + T31;
T3g = T35 + T36;
T3h = T3f - T3g;
T3i = T3f + T3g;
}
ri[WS(rs, 8)] = TO - T1H;
ii[WS(rs, 8)] = T3q - T3i;
ri[0] = TO + T1H;
ii[0] = T3i + T3q;
ri[WS(rs, 12)] = T3e - T3h;
ii[WS(rs, 12)] = T3s - T3r;
ri[WS(rs, 4)] = T3e + T3h;
ii[WS(rs, 4)] = T3r + T3s;
}
/* Outputs 10, 2, 14 and 6; the constant 0.7071... enters here. */
{
E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
{
E T2U, T2X, T3t, T3u;
T2U = T8 - Tl;
T2X = T2V - T2W;
T2Y = T2U + T2X;
T3a = T2U - T2X;
T3t = TM - Tz;
T3u = T3o - T3k;
T3v = T3t + T3u;
T3x = T3u - T3t;
}
{
E T2Z, T32, T34, T37;
T2Z = T11 - T1e;
T32 = T30 - T31;
T33 = T2Z + T32;
T3b = T32 - T2Z;
T34 = T1s - T1F;
T37 = T35 - T36;
T38 = T34 - T37;
T3c = T34 + T37;
}
{
E T39, T3w, T3d, T3y;
T39 = T33 + T38;
ri[WS(rs, 10)] = FNMS(KP707106781, T39, T2Y);
ri[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
T3w = T3b + T3c;
ii[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
ii[WS(rs, 10)] = FNMS(KP707106781, T3w, T3v);
T3d = T3b - T3c;
ri[WS(rs, 14)] = FNMS(KP707106781, T3d, T3a);
ri[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
T3y = T38 - T33;
ii[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
ii[WS(rs, 14)] = FNMS(KP707106781, T3y, T3x);
}
}
/* Odd-numbered outputs 1, 3, 5, ..., 15, built from the pair differences. */
{
E T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y;
E T2C;
{
E T1V, T22, T2b, T2i;
T1O = T1I - T1N;
T3B = T3z - T3A;
T3H = T3A + T3z;
T2E = T1I + T1N;
T1V = T1T - T1U;
T22 = T1W + T21;
T23 = T1V - T22;
T3C = T1V + T22;
{
E T2M, T2N, T2F, T2G;
T2M = T2r + T2w;
T2N = T2o - T2p;
T2O = FNMS(KP414213562, T2N, T2M);
T2S = FMA(KP414213562, T2M, T2N);
T2F = T1U + T1T;
T2G = T1W - T21;
T2H = T2F + T2G;
T3I = T2G - T2F;
}
T2b = T29 + T2a;
T2i = T2c - T2h;
T2j = FMA(KP414213562, T2i, T2b);
T2B = FNMS(KP414213562, T2b, T2i);
{
E T2J, T2K, T2q, T2x;
T2J = T2c + T2h;
T2K = T29 - T2a;
T2L = FMA(KP414213562, T2K, T2J);
T2R = FNMS(KP414213562, T2J, T2K);
T2q = T2o + T2p;
T2x = T2r - T2w;
T2y = FNMS(KP414213562, T2x, T2q);
T2C = FMA(KP414213562, T2q, T2x);
}
}
/* Outputs 11 and 3. */
{
E T24, T2z, T3J, T3K;
T24 = FMA(KP707106781, T23, T1O);
T2z = T2j - T2y;
ri[WS(rs, 11)] = FNMS(KP923879532, T2z, T24);
ri[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
T3J = FMA(KP707106781, T3I, T3H);
T3K = T2C - T2B;
ii[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
ii[WS(rs, 11)] = FNMS(KP923879532, T3K, T3J);
}
/* Outputs 7 and 15. */
{
E T2A, T2D, T3L, T3M;
T2A = FNMS(KP707106781, T23, T1O);
T2D = T2B + T2C;
ri[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
ri[WS(rs, 15)] = FMA(KP923879532, T2D, T2A);
T3L = FNMS(KP707106781, T3I, T3H);
T3M = T2j + T2y;
ii[WS(rs, 7)] = FNMS(KP923879532, T3M, T3L);
ii[WS(rs, 15)] = FMA(KP923879532, T3M, T3L);
}
/* Outputs 9 and 1. */
{
E T2I, T2P, T3D, T3E;
T2I = FMA(KP707106781, T2H, T2E);
T2P = T2L + T2O;
ri[WS(rs, 9)] = FNMS(KP923879532, T2P, T2I);
ri[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
T3D = FMA(KP707106781, T3C, T3B);
T3E = T2R + T2S;
ii[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
ii[WS(rs, 9)] = FNMS(KP923879532, T3E, T3D);
}
/* Outputs 13 and 5. */
{
E T2Q, T2T, T3F, T3G;
T2Q = FNMS(KP707106781, T2H, T2E);
T2T = T2R - T2S;
ri[WS(rs, 13)] = FNMS(KP923879532, T2T, T2Q);
ri[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
T3F = FNMS(KP707106781, T3C, T3B);
T3G = T2O - T2L;
ii[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
ii[WS(rs, 13)] = FNMS(KP923879532, T3G, T3F);
}
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this file;
 * presumably the first entry requests the full twiddle set for size 16 and the
 * second entry terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 16 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: first field 16, name "t1_16", the table above, &GENUS and
 * the tuple { 104, 30, 70, 0 }.  The first three numbers equal the additions,
 * multiplications and fused multiply/adds listed in the generator comment of
 * this FMA variant.
 * NOTE(review): the fourth tuple entry and the trailing zeros are not
 * explained by anything visible here -- confirm against ct_desc. */
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 104, 30, 70, 0 }, 0, 0, 0 };
/* Registration entry point: hands t1_16 and its descriptor to
 * X(kdft_dit_register) for planner p. */
void X(codelet_t1_16) (planner *p) {
X(kdft_dit_register) (p, t1_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
/*
* This function contains 174 FP additions, 84 FP multiplications,
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
* 52 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_16 (non-FMA variant): generated twiddle codelet for n = 16 (see the
 * generator command line in the comment above).
 *
 * For each m in [mb, me) it reads 16 complex values from ri[]/ii[] at offsets
 * WS(rs, 0) .. WS(rs, 15), scales inputs 1..15 by per-input factors taken from
 * W, combines them, and writes the 16 results back in place to the same
 * offsets.
 *
 *   ri, ii  real and imaginary arrays, read and overwritten; both advance by
 *           ms after every iteration
 *   W       factor table; input k uses the pair W[2k-2], W[2k-1].  The pointer
 *           is first moved forward by mb * 30 and then by 30 per iteration
 *   rs      stride object used through WS(rs, k) to locate element k
 *   mb, me  half-open iteration range
 *   ms      element distance between consecutive iterations
 *
 * Assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (macros from the included headers, not visible
 * here -- confirm), each scaled input is
 *   re = Wr * x_re + Wi * x_im,   im = Wr * x_im - Wi * x_re.
 */
static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the combinations below. */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* One pass per m; MAKE_VOLATILE_STRIDE is an opaque macro from the headers. */
for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediates that survive from the input stage to the output stage. */
E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
E T2y, T2z, T1O, T2g, T1T, T2h;
/* Inputs 0 and 8: input 8 is scaled by W, then sum and difference with 0. */
{
E T1, T2T, T6, T2S;
T1 = ri[0];
T2T = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 8)];
T5 = ii[WS(rs, 8)];
T2 = W[14];
T4 = W[15];
T6 = FMA(T2, T3, T4 * T5);
T2S = FNMS(T4, T3, T2 * T5);
}
T7 = T1 + T6;
T37 = T2T - T2S;
T1t = T1 - T6;
T2U = T2S + T2T;
}
/* Inputs 4 and 12: scale by W, then sum and difference. */
{
E Tc, T1u, Th, T1v;
{
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 4)];
Tb = ii[WS(rs, 4)];
T8 = W[6];
Ta = W[7];
Tc = FMA(T8, T9, Ta * Tb);
T1u = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 12)];
Tg = ii[WS(rs, 12)];
Td = W[22];
Tf = W[23];
Th = FMA(Td, Te, Tf * Tg);
T1v = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc + Th;
T38 = Tc - Th;
T1w = T1u - T1v;
T2R = T1u + T1v;
}
/* Inputs 2 and 10: scale by W, then sum and difference. */
{
E To, T1y, Tt, T1z, T1A, T1B;
{
E Tl, Tn, Tk, Tm;
Tl = ri[WS(rs, 2)];
Tn = ii[WS(rs, 2)];
Tk = W[2];
Tm = W[3];
To = FMA(Tk, Tl, Tm * Tn);
T1y = FNMS(Tm, Tl, Tk * Tn);
}
{
E Tq, Ts, Tp, Tr;
Tq = ri[WS(rs, 10)];
Ts = ii[WS(rs, 10)];
Tp = W[18];
Tr = W[19];
Tt = FMA(Tp, Tq, Tr * Ts);
T1z = FNMS(Tr, Tq, Tp * Ts);
}
Tu = To + Tt;
T2s = T1y + T1z;
T1A = T1y - T1z;
T1B = To - Tt;
T1C = T1A - T1B;
T2c = T1B + T1A;
}
/* Inputs 14 and 6: scale by W, then sum and difference. */
{
E Tz, T1E, TE, T1F, T1D, T1G;
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 14)];
Ty = ii[WS(rs, 14)];
Tv = W[26];
Tx = W[27];
Tz = FMA(Tv, Tw, Tx * Ty);
T1E = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 6)];
TD = ii[WS(rs, 6)];
TA = W[10];
TC = W[11];
TE = FMA(TA, TB, TC * TD);
T1F = FNMS(TC, TB, TA * TD);
}
TF = Tz + TE;
T2t = T1E + T1F;
T1D = Tz - TE;
T1G = T1E - T1F;
T1H = T1D + T1G;
T2d = T1D - T1G;
}
/* Inputs 15, 11, 7 and 3: scale by W, then combine the four values. */
{
E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
{
E T16, T18, T15, T17;
T16 = ri[WS(rs, 15)];
T18 = ii[WS(rs, 15)];
T15 = W[28];
T17 = W[29];
T19 = FMA(T15, T16, T17 * T18);
T20 = FNMS(T17, T16, T15 * T18);
}
{
E T1m, T1o, T1l, T1n;
T1m = ri[WS(rs, 11)];
T1o = ii[WS(rs, 11)];
T1l = W[20];
T1n = W[21];
T1p = FMA(T1l, T1m, T1n * T1o);
T1X = FNMS(T1n, T1m, T1l * T1o);
}
{
E T1b, T1d, T1a, T1c;
T1b = ri[WS(rs, 7)];
T1d = ii[WS(rs, 7)];
T1a = W[12];
T1c = W[13];
T1e = FMA(T1a, T1b, T1c * T1d);
T21 = FNMS(T1c, T1b, T1a * T1d);
}
{
E T1h, T1j, T1g, T1i;
T1h = ri[WS(rs, 3)];
T1j = ii[WS(rs, 3)];
T1g = W[4];
T1i = W[5];
T1k = FMA(T1g, T1h, T1i * T1j);
T1W = FNMS(T1i, T1h, T1g * T1j);
}
T1f = T19 + T1e;
T1q = T1k + T1p;
T2B = T1f - T1q;
T2C = T20 + T21;
T2D = T1W + T1X;
T2E = T2C - T2D;
{
E T1V, T1Y, T22, T23;
T1V = T19 - T1e;
T1Y = T1W - T1X;
T1Z = T1V - T1Y;
T2j = T1V + T1Y;
T22 = T20 - T21;
T23 = T1k - T1p;
T24 = T22 + T23;
T2k = T22 - T23;
}
}
/* Inputs 1, 13, 9 and 5: scale by W, then combine the four values. */
{
E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
{
E TJ, TL, TI, TK;
TJ = ri[WS(rs, 1)];
TL = ii[WS(rs, 1)];
TI = W[0];
TK = W[1];
TM = FMA(TI, TJ, TK * TL);
T1K = FNMS(TK, TJ, TI * TL);
}
{
E TZ, T11, TY, T10;
TZ = ri[WS(rs, 13)];
T11 = ii[WS(rs, 13)];
TY = W[24];
T10 = W[25];
T12 = FMA(TY, TZ, T10 * T11);
T1R = FNMS(T10, TZ, TY * T11);
}
{
E TO, TQ, TN, TP;
TO = ri[WS(rs, 9)];
TQ = ii[WS(rs, 9)];
TN = W[16];
TP = W[17];
TR = FMA(TN, TO, TP * TQ);
T1L = FNMS(TP, TO, TN * TQ);
}
{
E TU, TW, TT, TV;
TU = ri[WS(rs, 5)];
TW = ii[WS(rs, 5)];
TT = W[8];
TV = W[9];
TX = FMA(TT, TU, TV * TW);
T1Q = FNMS(TV, TU, TT * TW);
}
TS = TM + TR;
T13 = TX + T12;
T2w = TS - T13;
T2x = T1K + T1L;
T2y = T1Q + T1R;
T2z = T2x - T2y;
{
E T1M, T1N, T1P, T1S;
T1M = T1K - T1L;
T1N = TX - T12;
T1O = T1M + T1N;
T2g = T1M - T1N;
T1P = TM - TR;
T1S = T1Q - T1R;
T1T = T1P - T1S;
T2h = T1P + T1S;
}
}
/* Output stage: outputs 11, 3, 15 and 7 (real and imaginary parts). */
{
E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
{
E T1x, T1I, T3e, T3f;
T1x = T1t - T1w;
T1I = KP707106781 * (T1C - T1H);
T1J = T1x + T1I;
T27 = T1x - T1I;
T3e = KP707106781 * (T2d - T2c);
T3f = T38 + T37;
T3g = T3e + T3f;
T3i = T3f - T3e;
}
{
E T1U, T25, T28, T29;
T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
T26 = T1U + T25;
T3h = T25 - T1U;
T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
T2a = T28 - T29;
T3d = T28 + T29;
}
ri[WS(rs, 11)] = T1J - T26;
ii[WS(rs, 11)] = T3g - T3d;
ri[WS(rs, 3)] = T1J + T26;
ii[WS(rs, 3)] = T3d + T3g;
ri[WS(rs, 15)] = T27 - T2a;
ii[WS(rs, 15)] = T3i - T3h;
ri[WS(rs, 7)] = T27 + T2a;
ii[WS(rs, 7)] = T3h + T3i;
}
/* Outputs 10, 2, 14 and 6. */
{
E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
{
E T2r, T2u, T30, T31;
T2r = T7 - Ti;
T2u = T2s - T2t;
T2v = T2r + T2u;
T2H = T2r - T2u;
T30 = TF - Tu;
T31 = T2U - T2R;
T32 = T30 + T31;
T34 = T31 - T30;
}
{
E T2A, T2F, T2I, T2J;
T2A = T2w + T2z;
T2F = T2B - T2E;
T2G = KP707106781 * (T2A + T2F);
T33 = KP707106781 * (T2F - T2A);
T2I = T2z - T2w;
T2J = T2B + T2E;
T2K = KP707106781 * (T2I - T2J);
T2Z = KP707106781 * (T2I + T2J);
}
ri[WS(rs, 10)] = T2v - T2G;
ii[WS(rs, 10)] = T32 - T2Z;
ri[WS(rs, 2)] = T2v + T2G;
ii[WS(rs, 2)] = T2Z + T32;
ri[WS(rs, 14)] = T2H - T2K;
ii[WS(rs, 14)] = T34 - T33;
ri[WS(rs, 6)] = T2H + T2K;
ii[WS(rs, 6)] = T33 + T34;
}
/* Outputs 9, 1, 13 and 5. */
{
E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
{
E T2b, T2e, T36, T39;
T2b = T1t + T1w;
T2e = KP707106781 * (T2c + T2d);
T2f = T2b + T2e;
T2n = T2b - T2e;
T36 = KP707106781 * (T1C + T1H);
T39 = T37 - T38;
T3a = T36 + T39;
T3c = T39 - T36;
}
{
E T2i, T2l, T2o, T2p;
T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
T2m = T2i + T2l;
T3b = T2l - T2i;
T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
T2q = T2o - T2p;
T35 = T2o + T2p;
}
ri[WS(rs, 9)] = T2f - T2m;
ii[WS(rs, 9)] = T3a - T35;
ri[WS(rs, 1)] = T2f + T2m;
ii[WS(rs, 1)] = T35 + T3a;
ri[WS(rs, 13)] = T2n - T2q;
ii[WS(rs, 13)] = T3c - T3b;
ri[WS(rs, 5)] = T2n + T2q;
ii[WS(rs, 5)] = T3b + T3c;
}
/* Outputs 8, 0, 12 and 4. */
{
E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
{
E Tj, TG, T2Q, T2V;
Tj = T7 + Ti;
TG = Tu + TF;
TH = Tj + TG;
T2L = Tj - TG;
T2Q = T2s + T2t;
T2V = T2R + T2U;
T2W = T2Q + T2V;
T2Y = T2V - T2Q;
}
{
E T14, T1r, T2M, T2N;
T14 = TS + T13;
T1r = T1f + T1q;
T1s = T14 + T1r;
T2X = T1r - T14;
T2M = T2x + T2y;
T2N = T2C + T2D;
T2O = T2M - T2N;
T2P = T2M + T2N;
}
ri[WS(rs, 8)] = TH - T1s;
ii[WS(rs, 8)] = T2W - T2P;
ri[0] = TH + T1s;
ii[0] = T2P + T2W;
ri[WS(rs, 12)] = T2L - T2O;
ii[WS(rs, 12)] = T2Y - T2X;
ri[WS(rs, 4)] = T2L + T2O;
ii[WS(rs, 4)] = T2X + T2Y;
}
}
}
}
/*
 * Registration data for the t1_16 codelet.
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 16 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W expected by
 * t1_16 and the TW_NEXT entry closes the list -- confirm against the
 * tw_instr definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 16 (presumably the codelet size), the name "t1_16", the
 * twiddle instructions above, &GENUS, and the count group { 136, 46, 38, 0 }.
 * NOTE(review): the count group looks like {adds, muls, fmas, other}; the
 * three trailing zeros are fields whose meaning is not visible here.
 */
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 136, 46, 38, 0 }, 0, 0, 0 };
/* Passes the t1_16 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_16) (planner *p) {
X(kdft_dit_register) (p, t1_16, &desc);
}
#endif
+117
View File
@@ -0,0 +1,117 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include dft/scalar/t.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 11 stack variables, 0 constants, and 8 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_2 (variant built when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): in-place 2-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads point 0 (ri[0], ii[0]) and
 * point 1 (ri[WS(rs, 1)], ii[WS(rs, 1)]), combines point 1 with the pair
 * (W[0], W[1]) and overwrites both points with the sum and the difference.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 2, then by 2 per iteration
 * rs      stride argument passed to WS() to locate point 1
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined in
 * headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
{
INT m;
for (m = mb, W = W + (mb * 2); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs)) {
E T1, Ta, T3, T6, T4, T8, T2, T7, T9, T5;
/* Point 0 is used unmodified. */
T1 = ri[0];
Ta = ii[0];
/* Point 1 before the twiddle is applied. */
T3 = ri[WS(rs, 1)];
T6 = ii[WS(rs, 1)];
/*
 * Under the assumed macro meaning: T7 = W[0]*T3 + W[1]*T6 and
 * T9 = W[0]*T6 - W[1]*T3, i.e. (T3 + i*T6) * (W[0] - i*W[1]),
 * each formed as one plain product followed by one fused operation.
 */
T2 = W[0];
T4 = T2 * T3;
T8 = T2 * T6;
T5 = W[1];
T7 = FMA(T5, T6, T4);
T9 = FNMS(T5, T3, T8);
/* Point 1 receives the difference, point 0 the sum. */
ri[WS(rs, 1)] = T1 - T7;
ii[WS(rs, 1)] = Ta - T9;
ri[0] = T1 + T7;
ii[0] = T9 + Ta;
}
}
}
/*
 * Registration data for the t1_2 codelet (FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 2 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_2
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 2 (matches "-n 2" on the generator command line), the name
 * "t1_2", the twiddle instructions above, &GENUS, and { 4, 2, 2, 0 }, whose
 * first three entries equal the "4 additions, 2 multiplications, 2 fused
 * multiply/add" figures in this variant's generator comment. The meaning
 * of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, { 4, 2, 2, 0 }, 0, 0, 0 };
/* Passes the t1_2 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_2) (planner *p) {
X(kdft_dit_register) (p, t1_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include dft/scalar/t.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 9 stack variables, 0 constants, and 8 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_2 (variant built when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place 2-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads point 0 (ri[0], ii[0]) and
 * point 1 (ri[WS(rs, 1)], ii[WS(rs, 1)]), combines point 1 with the pair
 * (W[0], W[1]) and overwrites both points with the sum and the difference.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 2, then by 2 per iteration
 * rs      stride argument passed to WS() to locate point 1
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined in
 * headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
{
INT m;
for (m = mb, W = W + (mb * 2); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs)) {
E T1, T8, T6, T7;
/* Point 0 is used unmodified. */
T1 = ri[0];
T8 = ii[0];
{
/*
 * Twiddled point 1. Under the assumed macro meaning:
 * T6 = W[0]*T3 + W[1]*T5 and T7 = W[0]*T5 - W[1]*T3.
 */
E T3, T5, T2, T4;
T3 = ri[WS(rs, 1)];
T5 = ii[WS(rs, 1)];
T2 = W[0];
T4 = W[1];
T6 = FMA(T2, T3, T4 * T5);
T7 = FNMS(T4, T3, T2 * T5);
}
/* Point 1 receives the difference, point 0 the sum. */
ri[WS(rs, 1)] = T1 - T6;
ii[WS(rs, 1)] = T8 - T7;
ri[0] = T1 + T6;
ii[0] = T7 + T8;
}
}
}
/*
 * Registration data for the t1_2 codelet (non-FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 2 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_2
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 2 (matches "-n 2" on the generator command line), the name
 * "t1_2", the twiddle instructions above, &GENUS, and { 4, 2, 2, 0 }, whose
 * first three entries equal the "4 additions, 2 multiplications, 2 fused
 * multiply/add" figures in this variant's generator comment. The meaning
 * of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, { 4, 2, 2, 0 }, 0, 0, 0 };
/* Passes the t1_2 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_2) (planner *p) {
X(kdft_dit_register) (p, t1_2, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+166
View File
@@ -0,0 +1,166 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include dft/scalar/t.h */
/*
* This function contains 16 FP additions, 14 FP multiplications,
* (or, 6 additions, 4 multiplications, 10 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_3 (variant built when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): in-place 3-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..2 at ri/ii[0],
 * ri/ii[WS(rs, 1)] and ri/ii[WS(rs, 2)], combines point 1 with
 * (W[0], W[1]) and point 2 with (W[2], W[3]), and writes the three
 * combined results back to the same locations.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 4, then by 4 per iteration
 * rs      stride argument passed to WS() to locate points 1 and 2
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * in headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Constants used by the butterfly: 0.866025... (sqrt(3)/2) and 0.5. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs)) {
E T1, Tm, T7, Th, Td, Tj;
/* Point 0 is used unmodified. */
T1 = ri[0];
Tm = ii[0];
{
/* Twiddled point 1: T7 = W[0]*T3 + W[1]*T6, Th = W[0]*T6 - W[1]*T3. */
E T3, T6, T4, Tg, T2, T5;
T3 = ri[WS(rs, 1)];
T6 = ii[WS(rs, 1)];
T2 = W[0];
T4 = T2 * T3;
Tg = T2 * T6;
T5 = W[1];
T7 = FMA(T5, T6, T4);
Th = FNMS(T5, T3, Tg);
}
{
/* Twiddled point 2: Td = W[2]*T9 + W[3]*Tc, Tj = W[2]*Tc - W[3]*T9. */
E T9, Tc, Ta, Ti, T8, Tb;
T9 = ri[WS(rs, 2)];
Tc = ii[WS(rs, 2)];
T8 = W[2];
Ta = T8 * T9;
Ti = T8 * Tc;
Tb = W[3];
Td = FMA(Tb, Tc, Ta);
Tj = FNMS(Tb, T9, Ti);
}
{
E Tk, Te, Tf, To, Tl, Tn;
/* ri outputs: output 0 is the plain sum; outputs 1 and 2 are Tf +/- KP866025403 * Tk. */
Tk = Th - Tj;
Te = T7 + Td;
Tf = FNMS(KP500000000, Te, T1);
ri[0] = T1 + Te;
ri[WS(rs, 1)] = FMA(KP866025403, Tk, Tf);
ri[WS(rs, 2)] = FNMS(KP866025403, Tk, Tf);
/* ii outputs: same shape, built from Tn and To. */
To = Td - T7;
Tl = Th + Tj;
Tn = FNMS(KP500000000, Tl, Tm);
ii[0] = Tl + Tm;
ii[WS(rs, 2)] = FNMS(KP866025403, To, Tn);
ii[WS(rs, 1)] = FMA(KP866025403, To, Tn);
}
}
}
}
/*
 * Registration data for the t1_3 codelet (FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 3 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_3
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 3 (matches "-n 3" on the generator command line), the name
 * "t1_3", the twiddle instructions above, &GENUS, and { 6, 4, 10, 0 }, whose
 * first three entries equal the "6 additions, 4 multiplications, 10 fused
 * multiply/add" figures in this variant's generator comment. The meaning
 * of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, { 6, 4, 10, 0 }, 0, 0, 0 };
/* Passes the t1_3 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_3) (planner *p) {
X(kdft_dit_register) (p, t1_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include dft/scalar/t.h */
/*
* This function contains 16 FP additions, 12 FP multiplications,
* (or, 10 additions, 6 multiplications, 6 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_3 (variant built when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place 3-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..2 at ri/ii[0],
 * ri/ii[WS(rs, 1)] and ri/ii[WS(rs, 2)], combines point 1 with
 * (W[0], W[1]) and point 2 with (W[2], W[3]), and writes the three
 * combined results back to the same locations.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 4, then by 4 per iteration
 * rs      stride argument passed to WS() to locate points 1 and 2
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * in headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Constants used by the butterfly: 0.866025... (sqrt(3)/2) and 0.5. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs)) {
E T1, Ti, T6, Te, Tb, Tf, Tc, Th;
/* Point 0 is used unmodified. */
T1 = ri[0];
Ti = ii[0];
{
/* Twiddled point 1: T6 = W[0]*T3 + W[1]*T5, Te = W[0]*T5 - W[1]*T3. */
E T3, T5, T2, T4;
T3 = ri[WS(rs, 1)];
T5 = ii[WS(rs, 1)];
T2 = W[0];
T4 = W[1];
T6 = FMA(T2, T3, T4 * T5);
Te = FNMS(T4, T3, T2 * T5);
}
{
/* Twiddled point 2: Tb = W[2]*T8 + W[3]*Ta, Tf = W[2]*Ta - W[3]*T8. */
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 2)];
Ta = ii[WS(rs, 2)];
T7 = W[2];
T9 = W[3];
Tb = FMA(T7, T8, T9 * Ta);
Tf = FNMS(T9, T8, T7 * Ta);
}
/* Output 0 is the plain sum of the three points. */
Tc = T6 + Tb;
Th = Te + Tf;
ri[0] = T1 + Tc;
ii[0] = Th + Ti;
{
/* Outputs 1 and 2: Td +/- Tg for ri and Tk +/- Tj for ii. */
E Td, Tg, Tj, Tk;
Td = FNMS(KP500000000, Tc, T1);
Tg = KP866025403 * (Te - Tf);
ri[WS(rs, 2)] = Td - Tg;
ri[WS(rs, 1)] = Td + Tg;
Tj = KP866025403 * (Tb - T6);
Tk = FNMS(KP500000000, Th, Ti);
ii[WS(rs, 1)] = Tj + Tk;
ii[WS(rs, 2)] = Tk - Tj;
}
}
}
}
/*
 * Registration data for the t1_3 codelet (non-FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 3 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_3
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 3 (matches "-n 3" on the generator command line), the name
 * "t1_3", the twiddle instructions above, &GENUS, and { 10, 6, 6, 0 }, whose
 * first three entries equal the "10 additions, 6 multiplications, 6 fused
 * multiply/add" figures in this variant's generator comment. The meaning
 * of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, { 10, 6, 6, 0 }, 0, 0, 0 };
/* Passes the t1_3 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_3) (planner *p) {
X(kdft_dit_register) (p, t1_3, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
+196
View File
@@ -0,0 +1,196 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include dft/scalar/t.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 15 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_4 (variant built when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): in-place 4-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..3 at ri/ii[0] and
 * ri/ii[WS(rs, k)], combines point k (k = 1..3) with the pair
 * (W[2k-2], W[2k-1]), and writes the four combined results back to the
 * same locations using additions and subtractions only.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 6, then by 6 per iteration
 * rs      stride argument passed to WS() to locate points 1..3
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined in
 * headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
{
INT m;
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
E T1, Tv, T7, Tu, Te, To, Tk, Tq;
/* Point 0 is used unmodified. */
T1 = ri[0];
Tv = ii[0];
{
/* Twiddled point 2 (uses W[2], W[3]): T7 and Tu. */
E T3, T6, T4, Tt, T2, T5;
T3 = ri[WS(rs, 2)];
T6 = ii[WS(rs, 2)];
T2 = W[2];
T4 = T2 * T3;
Tt = T2 * T6;
T5 = W[3];
T7 = FMA(T5, T6, T4);
Tu = FNMS(T5, T3, Tt);
}
{
/* Twiddled point 1 (uses W[0], W[1]): Te and To. */
E Ta, Td, Tb, Tn, T9, Tc;
Ta = ri[WS(rs, 1)];
Td = ii[WS(rs, 1)];
T9 = W[0];
Tb = T9 * Ta;
Tn = T9 * Td;
Tc = W[1];
Te = FMA(Tc, Td, Tb);
To = FNMS(Tc, Ta, Tn);
}
{
/* Twiddled point 3 (uses W[4], W[5]): Tk and Tq. */
E Tg, Tj, Th, Tp, Tf, Ti;
Tg = ri[WS(rs, 3)];
Tj = ii[WS(rs, 3)];
Tf = W[4];
Th = Tf * Tg;
Tp = Tf * Tj;
Ti = W[5];
Tk = FMA(Ti, Tj, Th);
Tq = FNMS(Ti, Tg, Tp);
}
{
/* Outputs 0 and 2: (point 0 + point 2) +/- (point 1 + point 3). */
E T8, Tl, Ts, Tw;
T8 = T1 + T7;
Tl = Te + Tk;
ri[WS(rs, 2)] = T8 - Tl;
ri[0] = T8 + Tl;
Ts = To + Tq;
Tw = Tu + Tv;
ii[0] = Ts + Tw;
ii[WS(rs, 2)] = Tw - Ts;
}
{
/*
 * Outputs 1 and 3: (point 0 - point 2) combined with (point 1 - point 3);
 * the ri outputs use the ii difference Tr and the ii outputs use the
 * ri difference Ty.
 */
E Tm, Tr, Tx, Ty;
Tm = T1 - T7;
Tr = To - Tq;
ri[WS(rs, 3)] = Tm - Tr;
ri[WS(rs, 1)] = Tm + Tr;
Tx = Tv - Tu;
Ty = Te - Tk;
ii[WS(rs, 1)] = Tx - Ty;
ii[WS(rs, 3)] = Ty + Tx;
}
}
}
}
/*
 * Registration data for the t1_4 codelet (FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 4 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_4
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 4 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 4 (matches "-n 4" on the generator command line), the name
 * "t1_4", the twiddle instructions above, &GENUS, and { 16, 6, 6, 0 }, whose
 * first three entries equal the "16 additions, 6 multiplications, 6 fused
 * multiply/add" figures in this variant's generator comment. The meaning
 * of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, { 16, 6, 6, 0 }, 0, 0, 0 };
/* Passes the t1_4 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_4) (planner *p) {
X(kdft_dit_register) (p, t1_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include dft/scalar/t.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_4 (variant built when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place 4-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..3 at ri/ii[0] and
 * ri/ii[WS(rs, k)], combines point k (k = 1..3) with the pair
 * (W[2k-2], W[2k-1]), and writes the four combined results back to the
 * same locations using additions and subtractions only.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 6, then by 6 per iteration
 * rs      stride argument passed to WS() to locate points 1..3
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined in
 * headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
{
INT m;
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
E T1, Tp, T6, To, Tc, Tk, Th, Tl;
/* Point 0 is used unmodified. */
T1 = ri[0];
Tp = ii[0];
{
/* Twiddled point 2 (uses W[2], W[3]): T6 and To. */
E T3, T5, T2, T4;
T3 = ri[WS(rs, 2)];
T5 = ii[WS(rs, 2)];
T2 = W[2];
T4 = W[3];
T6 = FMA(T2, T3, T4 * T5);
To = FNMS(T4, T3, T2 * T5);
}
{
/* Twiddled point 1 (uses W[0], W[1]): Tc and Tk. */
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 1)];
Tb = ii[WS(rs, 1)];
T8 = W[0];
Ta = W[1];
Tc = FMA(T8, T9, Ta * Tb);
Tk = FNMS(Ta, T9, T8 * Tb);
}
{
/* Twiddled point 3 (uses W[4], W[5]): Th and Tl. */
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 3)];
Tg = ii[WS(rs, 3)];
Td = W[4];
Tf = W[5];
Th = FMA(Td, Te, Tf * Tg);
Tl = FNMS(Tf, Te, Td * Tg);
}
{
/* Outputs 0 and 2: (point 0 + point 2) +/- (point 1 + point 3). */
E T7, Ti, Tn, Tq;
T7 = T1 + T6;
Ti = Tc + Th;
ri[WS(rs, 2)] = T7 - Ti;
ri[0] = T7 + Ti;
Tn = Tk + Tl;
Tq = To + Tp;
ii[0] = Tn + Tq;
ii[WS(rs, 2)] = Tq - Tn;
}
{
/*
 * Outputs 1 and 3: (point 0 - point 2) combined with (point 1 - point 3);
 * the ri outputs use the ii difference Tm and the ii outputs use the
 * ri difference Ts.
 */
E Tj, Tm, Tr, Ts;
Tj = T1 - T6;
Tm = Tk - Tl;
ri[WS(rs, 3)] = Tj - Tm;
ri[WS(rs, 1)] = Tj + Tm;
Tr = Tp - To;
Ts = Tc - Th;
ii[WS(rs, 1)] = Tr - Ts;
ii[WS(rs, 3)] = Ts + Tr;
}
}
}
}
/*
 * Registration data for the t1_4 codelet (non-FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 4 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_4
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 4 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 4 (matches "-n 4" on the generator command line), the name
 * "t1_4", the twiddle instructions above, &GENUS, and { 16, 6, 6, 0 }, whose
 * first three entries equal the "16 additions, 6 multiplications, 6 fused
 * multiply/add" figures in this variant's generator comment. The meaning
 * of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, { 16, 6, 6, 0 }, 0, 0, 0 };
/* Passes the t1_4 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_4) (planner *p) {
X(kdft_dit_register) (p, t1_4, &desc);
}
#endif
+253
View File
@@ -0,0 +1,253 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include dft/scalar/t.h */
/*
* This function contains 40 FP additions, 34 FP multiplications,
* (or, 14 additions, 8 multiplications, 26 fused multiply/add),
* 31 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_5 (variant built when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): in-place 5-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..4 at ri/ii[0] and
 * ri/ii[WS(rs, k)], combines point k (k = 1..4) with the pair
 * (W[2k-2], W[2k-1]), and writes the five combined results back to the
 * same locations.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 8, then by 8 per iteration
 * rs      stride argument passed to WS() to locate points 1..4
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * in headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the output stages below. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
E T1, TM, T7, Tx, Td, Tz, Te, TJ, Tk, TC, Tq, TE, Tr, TK;
/* Point 0 is used unmodified. */
T1 = ri[0];
TM = ii[0];
{
/*
 * Twiddled points 1 (W[0], W[1]) -> T7, Tx and 4 (W[6], W[7]) -> Td, Tz;
 * Te and TJ are their ri-side and ii-side sums.
 */
E T3, T6, T4, Tw, T9, Tc, Ta, Ty, T2, T8, T5, Tb;
T3 = ri[WS(rs, 1)];
T6 = ii[WS(rs, 1)];
T2 = W[0];
T4 = T2 * T3;
Tw = T2 * T6;
T9 = ri[WS(rs, 4)];
Tc = ii[WS(rs, 4)];
T8 = W[6];
Ta = T8 * T9;
Ty = T8 * Tc;
T5 = W[1];
T7 = FMA(T5, T6, T4);
Tx = FNMS(T5, T3, Tw);
Tb = W[7];
Td = FMA(Tb, Tc, Ta);
Tz = FNMS(Tb, T9, Ty);
Te = T7 + Td;
TJ = Tx + Tz;
}
{
/*
 * Twiddled points 2 (W[2], W[3]) -> Tk, TC and 3 (W[4], W[5]) -> Tq, TE;
 * Tr and TK are their ri-side and ii-side sums.
 */
E Tg, Tj, Th, TB, Tm, Tp, Tn, TD, Tf, Tl, Ti, To;
Tg = ri[WS(rs, 2)];
Tj = ii[WS(rs, 2)];
Tf = W[2];
Th = Tf * Tg;
TB = Tf * Tj;
Tm = ri[WS(rs, 3)];
Tp = ii[WS(rs, 3)];
Tl = W[4];
Tn = Tl * Tm;
TD = Tl * Tp;
Ti = W[3];
Tk = FMA(Ti, Tj, Th);
TC = FNMS(Ti, Tg, TB);
To = W[5];
Tq = FMA(To, Tp, Tn);
TE = FNMS(To, Tm, TD);
Tr = Tk + Tq;
TK = TC + TE;
}
{
/*
 * ri outputs. Output 0 is T1 plus all twiddled ri parts; outputs 1..4
 * are Tv or TH (Tt +/- KP559016994 * Tu) plus or minus KP951056516
 * times TG or TI, which are built from the ii-side differences TA, TF.
 */
E Tu, Ts, Tt, TG, TI, TA, TF, TH, Tv;
Tu = Te - Tr;
Ts = Te + Tr;
Tt = FNMS(KP250000000, Ts, T1);
TA = Tx - Tz;
TF = TC - TE;
TG = FMA(KP618033988, TF, TA);
TI = FNMS(KP618033988, TA, TF);
ri[0] = T1 + Ts;
TH = FNMS(KP559016994, Tu, Tt);
ri[WS(rs, 2)] = FNMS(KP951056516, TI, TH);
ri[WS(rs, 3)] = FMA(KP951056516, TI, TH);
Tv = FMA(KP559016994, Tu, Tt);
ri[WS(rs, 4)] = FNMS(KP951056516, TG, Tv);
ri[WS(rs, 1)] = FMA(KP951056516, TG, Tv);
}
{
/*
 * ii outputs, same structure as the ri outputs but using the ri-side
 * differences TQ, TR; the FMA/FNMS choice per output index is the
 * opposite of the one used for ri.
 */
E TO, TL, TN, TS, TU, TQ, TR, TT, TP;
TO = TJ - TK;
TL = TJ + TK;
TN = FNMS(KP250000000, TL, TM);
TQ = T7 - Td;
TR = Tk - Tq;
TS = FMA(KP618033988, TR, TQ);
TU = FNMS(KP618033988, TQ, TR);
ii[0] = TL + TM;
TT = FNMS(KP559016994, TO, TN);
ii[WS(rs, 2)] = FMA(KP951056516, TU, TT);
ii[WS(rs, 3)] = FNMS(KP951056516, TU, TT);
TP = FMA(KP559016994, TO, TN);
ii[WS(rs, 1)] = FNMS(KP951056516, TS, TP);
ii[WS(rs, 4)] = FMA(KP951056516, TS, TP);
}
}
}
}
/*
 * Registration data for the t1_5 codelet (FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 5 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_5
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 5 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 5 (matches "-n 5" on the generator command line), the name
 * "t1_5", the twiddle instructions above, &GENUS, and { 14, 8, 26, 0 }, whose
 * first three entries equal the "14 additions, 8 multiplications, 26 fused
 * multiply/add" figures in this variant's generator comment. The meaning
 * of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, { 14, 8, 26, 0 }, 0, 0, 0 };
/* Passes the t1_5 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_5) (planner *p) {
X(kdft_dit_register) (p, t1_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include dft/scalar/t.h */
/*
* This function contains 40 FP additions, 28 FP multiplications,
* (or, 26 additions, 14 multiplications, 14 fused multiply/add),
* 29 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_5 (variant built when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place 5-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..4 at ri/ii[0] and
 * ri/ii[WS(rs, k)], combines point k (k = 1..4) with the pair
 * (W[2k-2], W[2k-1]), and writes the five combined results back to the
 * same locations.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 8, then by 8 per iteration
 * rs      stride argument passed to WS() to locate points 1..4
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * in headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the output stages below. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT m;
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
E T1, TE, Tu, Tx, TJ, TI, TB, TC, TD, Tc, Tn, To;
/* Point 0 is used unmodified. */
T1 = ri[0];
TE = ii[0];
{
E T6, Ts, Tm, Tw, Tb, Tt, Th, Tv;
{
/* Twiddled point 1 (uses W[0], W[1]): T6 and Ts. */
E T3, T5, T2, T4;
T3 = ri[WS(rs, 1)];
T5 = ii[WS(rs, 1)];
T2 = W[0];
T4 = W[1];
T6 = FMA(T2, T3, T4 * T5);
Ts = FNMS(T4, T3, T2 * T5);
}
{
/* Twiddled point 3 (uses W[4], W[5]): Tm and Tw. */
E Tj, Tl, Ti, Tk;
Tj = ri[WS(rs, 3)];
Tl = ii[WS(rs, 3)];
Ti = W[4];
Tk = W[5];
Tm = FMA(Ti, Tj, Tk * Tl);
Tw = FNMS(Tk, Tj, Ti * Tl);
}
{
/* Twiddled point 4 (uses W[6], W[7]): Tb and Tt. */
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 4)];
Ta = ii[WS(rs, 4)];
T7 = W[6];
T9 = W[7];
Tb = FMA(T7, T8, T9 * Ta);
Tt = FNMS(T9, T8, T7 * Ta);
}
{
/* Twiddled point 2 (uses W[2], W[3]): Th and Tv. */
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 2)];
Tg = ii[WS(rs, 2)];
Td = W[2];
Tf = W[3];
Th = FMA(Td, Te, Tf * Tg);
Tv = FNMS(Tf, Te, Td * Tg);
}
/* Pairwise differences and sums of points (1, 4) and (2, 3). */
Tu = Ts - Tt;
Tx = Tv - Tw;
TJ = Th - Tm;
TI = T6 - Tb;
TB = Ts + Tt;
TC = Tv + Tw;
TD = TB + TC;
Tc = T6 + Tb;
Tn = Th + Tm;
To = Tc + Tn;
}
/* Output 0 is the plain sum of all five points. */
ri[0] = T1 + To;
ii[0] = TD + TE;
{
/* ri outputs 1..4: Tr or Tz (Tq +/- Tp) plus or minus Ty or TA. */
E Ty, TA, Tr, Tz, Tp, Tq;
Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
TA = FNMS(KP587785252, Tu, KP951056516 * Tx);
Tp = KP559016994 * (Tc - Tn);
Tq = FNMS(KP250000000, To, T1);
Tr = Tp + Tq;
Tz = Tq - Tp;
ri[WS(rs, 4)] = Tr - Ty;
ri[WS(rs, 3)] = Tz + TA;
ri[WS(rs, 1)] = Tr + Ty;
ri[WS(rs, 2)] = Tz - TA;
}
{
/* ii outputs 1..4: TH or TM (TG +/- TF) plus or minus TK or TL. */
E TK, TL, TH, TM, TF, TG;
TK = FMA(KP951056516, TI, KP587785252 * TJ);
TL = FNMS(KP587785252, TI, KP951056516 * TJ);
TF = KP559016994 * (TB - TC);
TG = FNMS(KP250000000, TD, TE);
TH = TF + TG;
TM = TG - TF;
ii[WS(rs, 1)] = TH - TK;
ii[WS(rs, 3)] = TM - TL;
ii[WS(rs, 4)] = TK + TH;
ii[WS(rs, 2)] = TL + TM;
}
}
}
}
/*
 * Registration data for the t1_5 codelet (non-FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 5 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_5
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 5 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 5 (matches "-n 5" on the generator command line), the name
 * "t1_5", the twiddle instructions above, &GENUS, and { 26, 14, 14, 0 },
 * whose first three entries equal the "26 additions, 14 multiplications,
 * 14 fused multiply/add" figures in this variant's generator comment. The
 * meaning of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, { 26, 14, 14, 0 }, 0, 0, 0 };
/* Passes the t1_5 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_5) (planner *p) {
X(kdft_dit_register) (p, t1_5, &desc);
}
#endif
+295
View File
@@ -0,0 +1,295 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include dft/scalar/t.h */
/*
* This function contains 46 FP additions, 32 FP multiplications,
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
* 31 stack variables, 2 constants, and 24 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_6 (variant built when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): in-place 6-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..5 at ri/ii[0] and
 * ri/ii[WS(rs, k)], combines point k (k = 1..5) with the pair
 * (W[2k-2], W[2k-1]), and writes the six combined results back to the
 * same locations.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 10, then by 10 per iteration
 * rs      stride argument passed to WS() to locate points 1..5
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * in headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Constants used by the output stages: 0.866025... (sqrt(3)/2) and 0.5. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
E T1, TX, T7, TW, Tl, TR, TB, TJ, Ty, TS, TC, TO;
/* Point 0 is used unmodified. */
T1 = ri[0];
TX = ii[0];
{
/* Twiddled point 3 (uses W[4], W[5]): T7 and TW. */
E T3, T6, T4, TV, T2, T5;
T3 = ri[WS(rs, 3)];
T6 = ii[WS(rs, 3)];
T2 = W[4];
T4 = T2 * T3;
TV = T2 * T6;
T5 = W[5];
T7 = FMA(T5, T6, T4);
TW = FNMS(T5, T3, TV);
}
{
/*
 * Twiddled points 2 (W[2], W[3]) -> Te, TG and 5 (W[8], W[9]) -> Tk, TI;
 * Tl/TB are the ri-side difference/sum, TJ/TR the ii-side ones.
 */
E Ta, Td, Tb, TF, Tg, Tj, Th, TH, T9, Tf;
Ta = ri[WS(rs, 2)];
Td = ii[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
TF = T9 * Td;
Tg = ri[WS(rs, 5)];
Tj = ii[WS(rs, 5)];
Tf = W[8];
Th = Tf * Tg;
TH = Tf * Tj;
{
E Te, TG, Tk, TI, Tc, Ti;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
TG = FNMS(Tc, Ta, TF);
Ti = W[9];
Tk = FMA(Ti, Tj, Th);
TI = FNMS(Ti, Tg, TH);
Tl = Te - Tk;
TR = TG + TI;
TB = Te + Tk;
TJ = TG - TI;
}
}
{
/*
 * Twiddled points 4 (W[6], W[7]) -> Tr, TL and 1 (W[0], W[1]) -> Tx, TN;
 * Ty/TC are the ri-side difference/sum, TO/TS the ii-side ones.
 */
E Tn, Tq, To, TK, Tt, Tw, Tu, TM, Tm, Ts;
Tn = ri[WS(rs, 4)];
Tq = ii[WS(rs, 4)];
Tm = W[6];
To = Tm * Tn;
TK = Tm * Tq;
Tt = ri[WS(rs, 1)];
Tw = ii[WS(rs, 1)];
Ts = W[0];
Tu = Ts * Tt;
TM = Ts * Tw;
{
E Tr, TL, Tx, TN, Tp, Tv;
Tp = W[7];
Tr = FMA(Tp, Tq, To);
TL = FNMS(Tp, Tn, TK);
Tv = W[1];
Tx = FMA(Tv, Tw, Tu);
TN = FNMS(Tv, Tt, TM);
Ty = Tr - Tx;
TS = TL + TN;
TC = Tr + Tx;
TO = TL - TN;
}
}
{
/* ri outputs 3, 1, 5: derived from T8 = T1 - T7 and the difference terms. */
E TP, T8, Tz, TE;
TP = TJ - TO;
T8 = T1 - T7;
Tz = Tl + Ty;
TE = FNMS(KP500000000, Tz, T8);
ri[WS(rs, 3)] = T8 + Tz;
ri[WS(rs, 1)] = FMA(KP866025403, TP, TE);
ri[WS(rs, 5)] = FNMS(KP866025403, TP, TE);
}
{
/* ii outputs 1, 3, 5: derived from T11 = TX - TW and the difference terms. */
E T14, T11, T12, T13;
T14 = Ty - Tl;
T11 = TX - TW;
T12 = TJ + TO;
T13 = FNMS(KP500000000, T12, T11);
ii[WS(rs, 1)] = FMA(KP866025403, T14, T13);
ii[WS(rs, 3)] = T12 + T11;
ii[WS(rs, 5)] = FNMS(KP866025403, T14, T13);
}
{
/* ri outputs 0, 4, 2: derived from TA = T1 + T7 and the sum terms. */
E TT, TA, TD, TQ;
TT = TR - TS;
TA = T1 + T7;
TD = TB + TC;
TQ = FNMS(KP500000000, TD, TA);
ri[0] = TA + TD;
ri[WS(rs, 4)] = FMA(KP866025403, TT, TQ);
ri[WS(rs, 2)] = FNMS(KP866025403, TT, TQ);
}
{
/* ii outputs 0, 4, 2: derived from TY = TW + TX and the sum terms. */
E T10, TU, TY, TZ;
T10 = TC - TB;
TU = TR + TS;
TY = TW + TX;
TZ = FNMS(KP500000000, TU, TY);
ii[0] = TU + TY;
ii[WS(rs, 4)] = FMA(KP866025403, T10, TZ);
ii[WS(rs, 2)] = FNMS(KP866025403, T10, TZ);
}
}
}
}
/*
 * Registration data for the t1_6 codelet (FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 6 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_6
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 6 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 6 (matches "-n 6" on the generator command line), the name
 * "t1_6", the twiddle instructions above, &GENUS, and { 24, 10, 22, 0 },
 * whose first three entries equal the "24 additions, 10 multiplications,
 * 22 fused multiply/add" figures in this variant's generator comment. The
 * meaning of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, { 24, 10, 22, 0 }, 0, 0, 0 };
/* Passes the t1_6 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_6) (planner *p) {
X(kdft_dit_register) (p, t1_6, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include dft/scalar/t.h */
/*
* This function contains 46 FP additions, 28 FP multiplications,
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
* 23 stack variables, 2 constants, and 24 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_6 (variant built when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place 6-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..5 at ri/ii[0] and
 * ri/ii[WS(rs, k)], combines point k (k = 1..5) with the pair
 * (W[2k-2], W[2k-1]), and writes the six combined results back to the
 * same locations.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 10, then by 10 per iteration
 * rs      stride argument passed to WS() to locate points 1..5
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * in headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Constants used by the output stages: 0.5 and 0.866025... (sqrt(3)/2). */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
E T7, TS, Tv, TO, Tt, TJ, Tx, TF, Ti, TI, Tw, TC;
{
/*
 * Point 0 and twiddled point 3 (uses W[4], W[5]): T7/TS are the
 * differences and Tv/TO the sums of their ri and ii parts.
 */
E T1, TN, T6, TM;
T1 = ri[0];
TN = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 3)];
T5 = ii[WS(rs, 3)];
T2 = W[4];
T4 = W[5];
T6 = FMA(T2, T3, T4 * T5);
TM = FNMS(T4, T3, T2 * T5);
}
T7 = T1 - T6;
TS = TN - TM;
Tv = T1 + T6;
TO = TM + TN;
}
{
/*
 * Twiddled points 4 (W[6], W[7]) -> Tn, TD and 1 (W[0], W[1]) -> Ts, TE;
 * Tt/Tx are the ri-side difference/sum, TF/TJ the ii-side ones.
 */
E Tn, TD, Ts, TE;
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 4)];
Tm = ii[WS(rs, 4)];
Tj = W[6];
Tl = W[7];
Tn = FMA(Tj, Tk, Tl * Tm);
TD = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 1)];
Tr = ii[WS(rs, 1)];
To = W[0];
Tq = W[1];
Ts = FMA(To, Tp, Tq * Tr);
TE = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn - Ts;
TJ = TD + TE;
Tx = Tn + Ts;
TF = TD - TE;
}
{
/*
 * Twiddled points 2 (W[2], W[3]) -> Tc, TA and 5 (W[8], W[9]) -> Th, TB;
 * Ti/Tw are the ri-side difference/sum, TC/TI the ii-side ones.
 */
E Tc, TA, Th, TB;
{
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 2)];
Tb = ii[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
TA = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 5)];
Tg = ii[WS(rs, 5)];
Td = W[8];
Tf = W[9];
Th = FMA(Td, Te, Tf * Tg);
TB = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc - Th;
TI = TA + TB;
Tw = Tc + Th;
TC = TA - TB;
}
{
/* Outputs 1, 3, 5 (ri then ii), built from the difference terms. */
E TG, Tu, Tz, TR, TT, TU;
TG = KP866025403 * (TC - TF);
Tu = Ti + Tt;
Tz = FNMS(KP500000000, Tu, T7);
ri[WS(rs, 3)] = T7 + Tu;
ri[WS(rs, 1)] = Tz + TG;
ri[WS(rs, 5)] = Tz - TG;
TR = KP866025403 * (Tt - Ti);
TT = TC + TF;
TU = FNMS(KP500000000, TT, TS);
ii[WS(rs, 1)] = TR + TU;
ii[WS(rs, 3)] = TT + TS;
ii[WS(rs, 5)] = TU - TR;
}
{
/* Outputs 0, 4, 2 (ri then ii), built from the sum terms. */
E TK, Ty, TH, TQ, TL, TP;
TK = KP866025403 * (TI - TJ);
Ty = Tw + Tx;
TH = FNMS(KP500000000, Ty, Tv);
ri[0] = Tv + Ty;
ri[WS(rs, 4)] = TH + TK;
ri[WS(rs, 2)] = TH - TK;
TQ = KP866025403 * (Tx - Tw);
TL = TI + TJ;
TP = FNMS(KP500000000, TL, TO);
ii[0] = TL + TO;
ii[WS(rs, 4)] = TQ + TP;
ii[WS(rs, 2)] = TP - TQ;
}
}
}
}
/*
 * Registration data for the t1_6 codelet (non-FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 6 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_6
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 6 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 6 (matches "-n 6" on the generator command line), the name
 * "t1_6", the twiddle instructions above, &GENUS, and { 32, 14, 14, 0 },
 * whose first three entries equal the "32 additions, 14 multiplications,
 * 14 fused multiply/add" figures in this variant's generator comment. The
 * meaning of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, { 32, 14, 14, 0 }, 0, 0, 0 };
/* Passes the t1_6 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_6) (planner *p) {
X(kdft_dit_register) (p, t1_6, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
+354
View File
@@ -0,0 +1,354 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
/*
* This function contains 72 FP additions, 66 FP multiplications,
* (or, 18 additions, 12 multiplications, 54 fused multiply/add),
* 37 stack variables, 6 constants, and 28 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_7 (variant built when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): in-place 7-point twiddle butterfly.
 *
 * For every m in [mb, me) the loop reads points 0..6 at ri/ii[0] and
 * ri/ii[WS(rs, k)], combines point k (k = 1..6) with the pair
 * (W[2k-2], W[2k-1]), and writes the seven combined results back to the
 * same locations.
 *
 * ri, ii  data arrays (used as real and imaginary parts); both are
 *         advanced by ms after each iteration
 * W       twiddle table; pre-advanced by mb * 12, then by 12 per iteration
 * rs      stride argument passed to WS() to locate points 1..6
 * mb, me  half-open iteration range
 * ms      per-iteration increment applied to ri and ii
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * in headers outside this view. The comments below assume FMA(a, b, c) is
 * a * b + c and FNMS(a, b, c) is c - a * b -- confirm in those headers.
 */
static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the chained fused operations in the output stages. */
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
{
INT m;
for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
E T1, T1c, Te, T1h, TR, T19, Tr, T1g, TM, T1a, TE, T1i, TW, T1b;
/* Point 0 is used unmodified. */
T1 = ri[0];
T1c = ii[0];
{
/*
 * Twiddled points 1 (W[0], W[1]) -> T7, TO and 6 (W[10], W[11]) -> Td, TQ.
 * Te/T1h: ri-side sum and difference; T19/TR: ii-side sum and difference.
 */
E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8;
T3 = ri[WS(rs, 1)];
T6 = ii[WS(rs, 1)];
T2 = W[0];
T4 = T2 * T3;
TN = T2 * T6;
T9 = ri[WS(rs, 6)];
Tc = ii[WS(rs, 6)];
T8 = W[10];
Ta = T8 * T9;
TP = T8 * Tc;
{
E T7, TO, Td, TQ, T5, Tb;
T5 = W[1];
T7 = FMA(T5, T6, T4);
TO = FNMS(T5, T3, TN);
Tb = W[11];
Td = FMA(Tb, Tc, Ta);
TQ = FNMS(Tb, T9, TP);
Te = T7 + Td;
T1h = Td - T7;
TR = TO - TQ;
T19 = TO + TQ;
}
}
{
/*
 * Twiddled points 2 (W[2], W[3]) -> Tk, TJ and 5 (W[8], W[9]) -> Tq, TL.
 * Tr/T1g: ri-side sum and difference; T1a/TM: ii-side sum and difference.
 */
E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl;
Tg = ri[WS(rs, 2)];
Tj = ii[WS(rs, 2)];
Tf = W[2];
Th = Tf * Tg;
TI = Tf * Tj;
Tm = ri[WS(rs, 5)];
Tp = ii[WS(rs, 5)];
Tl = W[8];
Tn = Tl * Tm;
TK = Tl * Tp;
{
E Tk, TJ, Tq, TL, Ti, To;
Ti = W[3];
Tk = FMA(Ti, Tj, Th);
TJ = FNMS(Ti, Tg, TI);
To = W[9];
Tq = FMA(To, Tp, Tn);
TL = FNMS(To, Tm, TK);
Tr = Tk + Tq;
T1g = Tq - Tk;
TM = TJ - TL;
T1a = TJ + TL;
}
}
{
/*
 * Twiddled points 3 (W[4], W[5]) -> Tx, TT and 4 (W[6], W[7]) -> TD, TV.
 * TE/T1i: ri-side sum and difference; T1b/TW: ii-side sum and difference.
 */
E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty;
Tt = ri[WS(rs, 3)];
Tw = ii[WS(rs, 3)];
Ts = W[4];
Tu = Ts * Tt;
TS = Ts * Tw;
Tz = ri[WS(rs, 4)];
TC = ii[WS(rs, 4)];
Ty = W[6];
TA = Ty * Tz;
TU = Ty * TC;
{
E Tx, TT, TD, TV, Tv, TB;
Tv = W[5];
Tx = FMA(Tv, Tw, Tu);
TT = FNMS(Tv, Tt, TS);
TB = W[7];
TD = FMA(TB, TC, TA);
TV = FNMS(TB, Tz, TU);
TE = Tx + TD;
T1i = TD - Tx;
TW = TT - TV;
T1b = TT + TV;
}
}
/* Output 0 is the plain sum of all seven points. */
ri[0] = T1 + Te + Tr + TE;
ii[0] = T19 + T1a + T1b + T1c;
{
/* ri outputs 1 and 6: TH plus/minus KP974927912 * TY. */
E TG, TY, TF, TX, TH;
TF = FNMS(KP356895867, Tr, Te);
TG = FNMS(KP692021471, TF, TE);
TX = FMA(KP554958132, TW, TR);
TY = FMA(KP801937735, TX, TM);
TH = FNMS(KP900968867, TG, T1);
ri[WS(rs, 6)] = FNMS(KP974927912, TY, TH);
ri[WS(rs, 1)] = FMA(KP974927912, TY, TH);
}
{
/* ii outputs 1 and 6: T1f plus/minus KP974927912 * T1k. */
E T1e, T1k, T1d, T1j, T1f;
T1d = FNMS(KP356895867, T1a, T19);
T1e = FNMS(KP692021471, T1d, T1b);
T1j = FMA(KP554958132, T1i, T1h);
T1k = FMA(KP801937735, T1j, T1g);
T1f = FNMS(KP900968867, T1e, T1c);
ii[WS(rs, 1)] = FMA(KP974927912, T1k, T1f);
ii[WS(rs, 6)] = FNMS(KP974927912, T1k, T1f);
}
{
/* ri outputs 2 and 5: T11 plus/minus KP974927912 * T13. */
E T10, T13, TZ, T12, T11;
TZ = FNMS(KP356895867, Te, TE);
T10 = FNMS(KP692021471, TZ, Tr);
T12 = FMA(KP554958132, TM, TW);
T13 = FNMS(KP801937735, T12, TR);
T11 = FNMS(KP900968867, T10, T1);
ri[WS(rs, 5)] = FNMS(KP974927912, T13, T11);
ri[WS(rs, 2)] = FMA(KP974927912, T13, T11);
}
{
/* ii outputs 2 and 5: T1n plus/minus KP974927912 * T1p. */
E T1m, T1p, T1l, T1o, T1n;
T1l = FNMS(KP356895867, T19, T1b);
T1m = FNMS(KP692021471, T1l, T1a);
T1o = FMA(KP554958132, T1g, T1i);
T1p = FNMS(KP801937735, T1o, T1h);
T1n = FNMS(KP900968867, T1m, T1c);
ii[WS(rs, 2)] = FMA(KP974927912, T1p, T1n);
ii[WS(rs, 5)] = FNMS(KP974927912, T1p, T1n);
}
{
/* ri outputs 3 and 4: T16 plus/minus KP974927912 * T18. */
E T15, T18, T14, T17, T16;
T14 = FNMS(KP356895867, TE, Tr);
T15 = FNMS(KP692021471, T14, Te);
T17 = FNMS(KP554958132, TR, TM);
T18 = FNMS(KP801937735, T17, TW);
T16 = FNMS(KP900968867, T15, T1);
ri[WS(rs, 4)] = FNMS(KP974927912, T18, T16);
ri[WS(rs, 3)] = FMA(KP974927912, T18, T16);
}
{
/* ii outputs 3 and 4: T1s plus/minus KP974927912 * T1u. */
E T1r, T1u, T1q, T1t, T1s;
T1q = FNMS(KP356895867, T1b, T1a);
T1r = FNMS(KP692021471, T1q, T19);
T1t = FNMS(KP554958132, T1h, T1g);
T1u = FNMS(KP801937735, T1t, T1i);
T1s = FNMS(KP900968867, T1r, T1c);
ii[WS(rs, 3)] = FMA(KP974927912, T1u, T1s);
ii[WS(rs, 4)] = FNMS(KP974927912, T1u, T1s);
}
}
}
}
/*
 * Registration data for the t1_7 codelet (FMA variant).
 * twinstr holds two tw_instr entries: { TW_FULL, 0, 7 } followed by
 * { TW_NEXT, 1, 0 }.
 * NOTE(review): presumably this describes the twiddle table W read by t1_7
 * and the TW_NEXT entry closes the list -- confirm against the tw_instr
 * definition, which is not part of this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor: 7 (matches "-n 7" on the generator command line), the name
 * "t1_7", the twiddle instructions above, &GENUS, and { 18, 12, 54, 0 },
 * whose first three entries equal the "18 additions, 12 multiplications,
 * 54 fused multiply/add" figures in this variant's generator comment. The
 * meaning of the remaining zero fields is not visible here.
 */
static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, { 18, 12, 54, 0 }, 0, 0, 0 };
/* Passes the t1_7 kernel and its descriptor to planner p through X(kdft_dit_register). */
void X(codelet_t1_7) (planner *p) {
X(kdft_dit_register) (p, t1_7, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
/*
* This function contains 72 FP additions, 60 FP multiplications,
* (or, 36 additions, 24 multiplications, 36 fused multiply/add),
* 29 stack variables, 6 constants, and 28 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_7: in-place size-7 twiddle codelet (variant without -fma).
 *
 * ri, ii : real and imaginary data; element k is read from and written back
 *          to ri[WS(rs, k)] / ii[WS(rs, k)] (element 0 is ri[0] / ii[0]).
 * W      : twiddle table; W[0]..W[11] are consumed per iteration.
 * rs     : stride argument handed to WS() to locate element k.
 * mb, me : the loop runs for m = mb .. me-1.
 * ms     : amount added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FNMA(a,b,c) = -(a*b + c); those macros are
 * defined outside this file -- confirm.
 */
static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Cosines: 0.2225... = -cos(4*pi/7), 0.9009... = -cos(6*pi/7), 0.6234... = cos(2*pi/7). */
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
/* Sines: 0.4338... = sin(6*pi/7), 0.7818... = sin(2*pi/7), 0.9749... = sin(4*pi/7). */
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
{
INT m;
/* W starts at offset mb * 12 and advances by 12 per iteration. */
for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ;
/* Element 0 is used without a twiddle multiplication. */
T1 = ri[0];
TR = ii[0];
/* Elements 1 and 6, multiplied by W[0..1] and W[10..11], then combined. */
{
E T6, TA, Tb, TB;
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 1)];
T5 = ii[WS(rs, 1)];
T2 = W[0];
T4 = W[1];
/* (T6, TA) = (T2*T3 + T4*T5, T2*T5 - T4*T3) */
T6 = FMA(T2, T3, T4 * T5);
TA = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 6)];
Ta = ii[WS(rs, 6)];
T7 = W[10];
T9 = W[11];
Tb = FMA(T7, T8, T9 * Ta);
TB = FNMS(T9, T8, T7 * Ta);
}
Tc = T6 + Tb;
TS = Tb - T6;
TC = TA - TB;
TO = TA + TB;
}
/* Elements 2 and 5, multiplied by W[2..3] and W[8..9]. */
{
E Th, TG, Tm, TH;
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 2)];
Tg = ii[WS(rs, 2)];
Td = W[2];
Tf = W[3];
Th = FMA(Td, Te, Tf * Tg);
TG = FNMS(Tf, Te, Td * Tg);
}
{
E Tj, Tl, Ti, Tk;
Tj = ri[WS(rs, 5)];
Tl = ii[WS(rs, 5)];
Ti = W[8];
Tk = W[9];
Tm = FMA(Ti, Tj, Tk * Tl);
TH = FNMS(Tk, Tj, Ti * Tl);
}
Tn = Th + Tm;
TT = Tm - Th;
TI = TG - TH;
TP = TG + TH;
}
/* Elements 3 and 4, multiplied by W[4..5] and W[6..7]. */
{
E Ts, TD, Tx, TE;
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 3)];
Tr = ii[WS(rs, 3)];
To = W[4];
Tq = W[5];
Ts = FMA(To, Tp, Tq * Tr);
TD = FNMS(Tq, Tp, To * Tr);
}
{
E Tu, Tw, Tt, Tv;
Tu = ri[WS(rs, 4)];
Tw = ii[WS(rs, 4)];
Tt = W[6];
Tv = W[7];
Tx = FMA(Tt, Tu, Tv * Tw);
TE = FNMS(Tv, Tu, Tt * Tw);
}
Ty = Ts + Tx;
TU = Tx - Ts;
TF = TD - TE;
TQ = TD + TE;
}
/* Output 0: plain sum of element 0 and the three pair sums. */
ri[0] = T1 + Tc + Tn + Ty;
ii[0] = TO + TP + TQ + TR;
/* Outputs 2 and 5. */
{
E TJ, Tz, TX, TY;
TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
ri[WS(rs, 5)] = Tz - TJ;
ri[WS(rs, 2)] = Tz + TJ;
TX = FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT);
TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO);
ii[WS(rs, 2)] = TX + TY;
ii[WS(rs, 5)] = TY - TX;
}
/* Outputs 1 and 6. */
{
E TL, TK, TV, TW;
TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
ri[WS(rs, 6)] = TK - TL;
ri[WS(rs, 1)] = TK + TL;
TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU);
TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP);
ii[WS(rs, 1)] = TV + TW;
ii[WS(rs, 6)] = TW - TV;
}
/* Outputs 3 and 4. */
{
E TN, TM, TZ, T10;
TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
ri[WS(rs, 4)] = TM - TN;
ri[WS(rs, 3)] = TM + TN;
TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT);
T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO);
ii[WS(rs, 3)] = TZ + T10;
ii[WS(rs, 4)] = T10 - TZ;
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT semantics are defined outside this file;
 * presumably { TW_FULL, 0, 7 } requests the full twiddle set for radix 7 and
 * TW_NEXT terminates the list -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for "t1_7": radix 7, its name, the twiddle table above
 * and &GENUS.  { 36, 24, 36, 0 } matches the "36 additions, 24
 * multiplications, 36 fused multiply/add" figures in the generator comment
 * of this variant.  The meaning of the three trailing zeros is set by the
 * ct_desc declaration, which is not visible here.
 */
static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, { 36, 24, 36, 0 }, 0, 0, 0 };
/* Hands the t1_7 kernel and its descriptor to kdft_dit_register for planner p. */
void X(codelet_t1_7) (planner *p) {
X(kdft_dit_register) (p, t1_7, &desc);
}
#endif
+376
View File
@@ -0,0 +1,376 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */
/*
* This function contains 66 FP additions, 36 FP multiplications,
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
* 34 stack variables, 1 constant, and 32 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_8: in-place size-8 twiddle codelet (variant generated with -fma).
 *
 * ri, ii : real and imaginary data; element k is read from and written back
 *          to ri[WS(rs, k)] / ii[WS(rs, k)] (element 0 is ri[0] / ii[0]).
 * W      : twiddle table; W[0]..W[13] are consumed per iteration.
 * rs     : stride argument handed to WS() to locate element k.
 * mb, me : the loop runs for m = mb .. me-1.
 * ms     : amount added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; those macros are defined outside this file --
 * confirm.
 */
static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.7071... = sqrt(1/2) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W starts at offset mb * 14 and advances by 14 per iteration. */
for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
E TX, Ty, TZ, TV, T10;
/* Element 0 is used without a twiddle multiplication. */
T1 = ri[0];
T1m = ii[0];
/* Element 4 times W[6..7]: (T7, T1l) = (T2*T3 + T5*T6, T2*T6 - T5*T3). */
{
E T3, T6, T4, T1k, T2, T5;
T3 = ri[WS(rs, 4)];
T6 = ii[WS(rs, 4)];
T2 = W[6];
T4 = T2 * T3;
T1k = T2 * T6;
T5 = W[7];
T7 = FMA(T5, T6, T4);
T1l = FNMS(T5, T3, T1k);
}
/* Element 6 times W[10..11]. */
{
E Tg, Tj, Th, TR, Tf, Ti;
Tg = ri[WS(rs, 6)];
Tj = ii[WS(rs, 6)];
Tf = W[10];
Th = Tf * Tg;
TR = Tf * Tj;
Ti = W[11];
Tk = FMA(Ti, Tj, Th);
TS = FNMS(Ti, Tg, TR);
}
/* Element 2 times W[2..3]. */
{
E Ta, Td, Tb, TP, T9, Tc;
Ta = ri[WS(rs, 2)];
Td = ii[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
TP = T9 * Td;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
TQ = FNMS(Tc, Ta, TP);
}
/* Elements 7 and 3, times W[12..13] and W[4..5]; their differences go to T12 / T17. */
{
E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
TB = ri[WS(rs, 7)];
TE = ii[WS(rs, 7)];
TA = W[12];
TC = TA * TB;
T13 = TA * TE;
TH = ri[WS(rs, 3)];
TK = ii[WS(rs, 3)];
TG = W[4];
TI = TG * TH;
T15 = TG * TK;
TD = W[13];
TF = FMA(TD, TE, TC);
T14 = FNMS(TD, TB, T13);
TJ = W[5];
TL = FMA(TJ, TK, TI);
T16 = FNMS(TJ, TH, T15);
T12 = TF - TL;
T17 = T14 - T16;
}
/* Elements 1 and 5, times W[0..1] and W[8..9]; their differences go to TV / T10. */
{
E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
To = ri[WS(rs, 1)];
Tr = ii[WS(rs, 1)];
Tn = W[0];
Tp = Tn * To;
TW = Tn * Tr;
Tu = ri[WS(rs, 5)];
Tx = ii[WS(rs, 5)];
Tt = W[8];
Tv = Tt * Tu;
TY = Tt * Tx;
Tq = W[1];
Ts = FMA(Tq, Tr, Tp);
TX = FNMS(Tq, To, TW);
Tw = W[9];
Ty = FMA(Tw, Tx, Tv);
TZ = FNMS(Tw, Tu, TY);
TV = Ts - Ty;
T10 = TX - TZ;
}
/* Odd outputs 1, 3, 5, 7: built from the differences, scaled by sqrt(1/2). */
{
E TU, T1a, T1t, T1v, T19, T1w, T1d, T1u;
{
E TO, TT, T1r, T1s;
TO = T1 - T7;
TT = TQ - TS;
TU = TO + TT;
T1a = TO - TT;
T1r = T1m - T1l;
T1s = Te - Tk;
T1t = T1r - T1s;
T1v = T1s + T1r;
}
{
E T11, T18, T1b, T1c;
T11 = TV + T10;
T18 = T12 - T17;
T19 = T11 + T18;
T1w = T18 - T11;
T1b = T10 - TV;
T1c = T12 + T17;
T1d = T1b - T1c;
T1u = T1b + T1c;
}
ri[WS(rs, 5)] = FNMS(KP707106781, T19, TU);
ii[WS(rs, 5)] = FNMS(KP707106781, T1u, T1t);
ri[WS(rs, 1)] = FMA(KP707106781, T19, TU);
ii[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
ri[WS(rs, 7)] = FNMS(KP707106781, T1d, T1a);
ii[WS(rs, 7)] = FNMS(KP707106781, T1w, T1v);
ri[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
ii[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
}
/* Even outputs 0, 2, 4, 6: built from the sums, additions and subtractions only. */
{
E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
{
E T8, Tl, T1j, T1n;
T8 = T1 + T7;
Tl = Te + Tk;
Tm = T8 + Tl;
T1e = T8 - Tl;
T1j = TQ + TS;
T1n = T1l + T1m;
T1o = T1j + T1n;
T1q = T1n - T1j;
}
{
E Tz, TM, T1f, T1g;
Tz = Ts + Ty;
TM = TF + TL;
TN = Tz + TM;
T1p = TM - Tz;
T1f = TX + TZ;
T1g = T14 + T16;
T1h = T1f - T1g;
T1i = T1f + T1g;
}
ri[WS(rs, 4)] = Tm - TN;
ii[WS(rs, 4)] = T1o - T1i;
ri[0] = Tm + TN;
ii[0] = T1i + T1o;
ri[WS(rs, 6)] = T1e - T1h;
ii[WS(rs, 6)] = T1q - T1p;
ri[WS(rs, 2)] = T1e + T1h;
ii[WS(rs, 2)] = T1p + T1q;
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT semantics are defined outside this file;
 * presumably { TW_FULL, 0, 8 } requests the full twiddle set for radix 8 and
 * TW_NEXT terminates the list -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for "t1_8": radix 8, its name, the twiddle table above
 * and &GENUS.  { 44, 14, 22, 0 } matches the "44 additions, 14
 * multiplications, 22 fused multiply/add" figures in the generator comment
 * of this variant.  The meaning of the three trailing zeros is set by the
 * ct_desc declaration, which is not visible here.
 */
static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, { 44, 14, 22, 0 }, 0, 0, 0 };
/* Hands the t1_8 kernel and its descriptor to kdft_dit_register for planner p. */
void X(codelet_t1_8) (planner *p) {
X(kdft_dit_register) (p, t1_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */
/*
* This function contains 66 FP additions, 32 FP multiplications,
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
* 28 stack variables, 1 constant, and 32 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_8: in-place size-8 twiddle codelet (variant without -fma).
 *
 * ri, ii : real and imaginary data; element k is read from and written back
 *          to ri[WS(rs, k)] / ii[WS(rs, k)] (element 0 is ri[0] / ii[0]).
 * W      : twiddle table; W[0]..W[13] are consumed per iteration.
 * rs     : stride argument handed to WS() to locate element k.
 * mb, me : the loop runs for m = mb .. me-1.
 * ms     : amount added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; those macros are defined outside this file --
 * confirm.
 */
static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.7071... = sqrt(1/2) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W starts at offset mb * 14 and advances by 14 per iteration. */
for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
E TP;
/* Element 0 (no twiddle) combined with element 4 times W[6..7]. */
{
E T1, T18, T6, T17;
T1 = ri[0];
T18 = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 4)];
T5 = ii[WS(rs, 4)];
T2 = W[6];
T4 = W[7];
/* (T6, T17) = (T2*T3 + T4*T5, T2*T5 - T4*T3) */
T6 = FMA(T2, T3, T4 * T5);
T17 = FNMS(T4, T3, T2 * T5);
}
T7 = T1 + T6;
T1e = T18 - T17;
TH = T1 - T6;
T19 = T17 + T18;
}
/* Elements 7 and 3, times W[12..13] and W[4..5]. */
{
E Tz, TS, TE, TT;
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 7)];
Ty = ii[WS(rs, 7)];
Tv = W[12];
Tx = W[13];
Tz = FMA(Tv, Tw, Tx * Ty);
TS = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 3)];
TD = ii[WS(rs, 3)];
TA = W[4];
TC = W[5];
TE = FMA(TA, TB, TC * TD);
TT = FNMS(TC, TB, TA * TD);
}
TF = Tz + TE;
T13 = TS + TT;
TR = Tz - TE;
TU = TS - TT;
}
/* Elements 2 and 6, times W[2..3] and W[10..11]. */
{
E Tc, TI, Th, TJ;
{
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 2)];
Tb = ii[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
TI = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 6)];
Tg = ii[WS(rs, 6)];
Td = W[10];
Tf = W[11];
Th = FMA(Td, Te, Tf * Tg);
TJ = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc + Th;
T1f = Tc - Th;
TK = TI - TJ;
T16 = TI + TJ;
}
/* Elements 1 and 5, times W[0..1] and W[8..9]. */
{
E To, TN, Tt, TO;
{
E Tl, Tn, Tk, Tm;
Tl = ri[WS(rs, 1)];
Tn = ii[WS(rs, 1)];
Tk = W[0];
Tm = W[1];
To = FMA(Tk, Tl, Tm * Tn);
TN = FNMS(Tm, Tl, Tk * Tn);
}
{
E Tq, Ts, Tp, Tr;
Tq = ri[WS(rs, 5)];
Ts = ii[WS(rs, 5)];
Tp = W[8];
Tr = W[9];
Tt = FMA(Tp, Tq, Tr * Ts);
TO = FNMS(Tr, Tq, Tp * Ts);
}
Tu = To + Tt;
T12 = TN + TO;
TM = To - Tt;
TP = TN - TO;
}
/* Output stage. */
{
E Tj, TG, T1b, T1c;
/* Even outputs 0, 2, 4, 6: additions and subtractions only. */
Tj = T7 + Ti;
TG = Tu + TF;
ri[WS(rs, 4)] = Tj - TG;
ri[0] = Tj + TG;
{
E T15, T1a, T11, T14;
T15 = T12 + T13;
T1a = T16 + T19;
ii[0] = T15 + T1a;
ii[WS(rs, 4)] = T1a - T15;
T11 = T7 - Ti;
T14 = T12 - T13;
ri[WS(rs, 6)] = T11 - T14;
ri[WS(rs, 2)] = T11 + T14;
}
T1b = TF - Tu;
T1c = T19 - T16;
ii[WS(rs, 2)] = T1b + T1c;
ii[WS(rs, 6)] = T1c - T1b;
/* Real parts of outputs 3 and 7, imaginary parts of outputs 1 and 5. */
{
E TX, T1g, T10, T1d, TY, TZ;
TX = TH - TK;
T1g = T1e - T1f;
TY = TP - TM;
TZ = TR + TU;
T10 = KP707106781 * (TY - TZ);
T1d = KP707106781 * (TY + TZ);
ri[WS(rs, 7)] = TX - T10;
ii[WS(rs, 5)] = T1g - T1d;
ri[WS(rs, 3)] = TX + T10;
ii[WS(rs, 1)] = T1d + T1g;
}
/* Real parts of outputs 1 and 5, imaginary parts of outputs 3 and 7. */
{
E TL, T1i, TW, T1h, TQ, TV;
TL = TH + TK;
T1i = T1f + T1e;
TQ = TM + TP;
TV = TR - TU;
TW = KP707106781 * (TQ + TV);
T1h = KP707106781 * (TV - TQ);
ri[WS(rs, 5)] = TL - TW;
ii[WS(rs, 7)] = T1i - T1h;
ri[WS(rs, 1)] = TL + TW;
ii[WS(rs, 3)] = T1h + T1i;
}
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT semantics are defined outside this file;
 * presumably { TW_FULL, 0, 8 } requests the full twiddle set for radix 8 and
 * TW_NEXT terminates the list -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for "t1_8": radix 8, its name, the twiddle table above
 * and &GENUS.  { 52, 18, 14, 0 } matches the "52 additions, 18
 * multiplications, 14 fused multiply/add" figures in the generator comment
 * of this variant.  The meaning of the three trailing zeros is set by the
 * ct_desc declaration, which is not visible here.
 */
static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, { 52, 18, 14, 0 }, 0, 0, 0 };
/* Hands the t1_8 kernel and its descriptor to kdft_dit_register for planner p. */
void X(codelet_t1_8) (planner *p) {
X(kdft_dit_register) (p, t1_8, &desc);
}
#endif
+487
View File
@@ -0,0 +1,487 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */
/*
* This function contains 96 FP additions, 88 FP multiplications,
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
* 55 stack variables, 10 constants, and 36 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_9: in-place size-9 twiddle codelet (variant generated with -fma).
 *
 * ri, ii : real and imaginary data; element k is read from and written back
 *          to ri[WS(rs, k)] / ii[WS(rs, k)] (element 0 is ri[0] / ii[0]).
 * W      : twiddle table; W[0]..W[15] are consumed per iteration.
 * rs     : stride argument handed to WS() to locate element k.
 * mb, me : the loop runs for m = mb .. me-1.
 * ms     : amount added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; those macros are defined outside this file --
 * confirm.
 */
static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.8528... = cos(10 deg) * sqrt(3)/2, 0.4924... = cos(10 deg) / 2, 0.9848... = cos(10 deg) */
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
/* 0.9541... = cos(20 deg) / cos(10 deg), 0.3639... = tan(20 deg), 0.7778... = cos(40 deg) / cos(10 deg) */
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
/* 0.8390... = tan(40 deg), 0.1763... = tan(10 deg), 0.8660... = sqrt(3)/2 */
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* W starts at offset mb * 16 and advances by 16 per iteration. */
for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
E T1, T1R, Te, T1W, T10, T1Q, T1l, T1r, Ty, T1p, Tl, T1o, T1g, T1q, T1a;
E T1d, TS, T18, TF, T13, T19, T1c;
/* Element 0 is used without a twiddle multiplication. */
T1 = ri[0];
T1R = ii[0];
/* Elements 3 and 6, times W[4..5] and W[10..11]; sums and differences. */
{
E T3, T6, T4, TW, T9, Tc, Ta, TY, T2, T8;
T3 = ri[WS(rs, 3)];
T6 = ii[WS(rs, 3)];
T2 = W[4];
T4 = T2 * T3;
TW = T2 * T6;
T9 = ri[WS(rs, 6)];
Tc = ii[WS(rs, 6)];
T8 = W[10];
Ta = T8 * T9;
TY = T8 * Tc;
{
E T7, TX, Td, TZ, T5, Tb;
T5 = W[5];
/* (T7, TX) = (T2*T3 + T5*T6, T2*T6 - T5*T3) */
T7 = FMA(T5, T6, T4);
TX = FNMS(T5, T3, TW);
Tb = W[11];
Td = FMA(Tb, Tc, Ta);
TZ = FNMS(Tb, T9, TY);
Te = T7 + Td;
T1W = Td - T7;
T10 = TX - TZ;
T1Q = TX + TZ;
}
}
/* Elements 1, 4 and 7, times W[0..1], W[6..7] and W[12..13]. */
{
E Th, Tk, Ti, T1n, Tx, T1i, Tr, T1k, Tg, Tj;
Th = ri[WS(rs, 1)];
Tk = ii[WS(rs, 1)];
Tg = W[0];
Ti = Tg * Th;
T1n = Tg * Tk;
{
E Tt, Tw, Tu, T1h, Ts, Tv;
Tt = ri[WS(rs, 7)];
Tw = ii[WS(rs, 7)];
Ts = W[12];
Tu = Ts * Tt;
T1h = Ts * Tw;
Tv = W[13];
Tx = FMA(Tv, Tw, Tu);
T1i = FNMS(Tv, Tt, T1h);
}
{
E Tn, Tq, To, T1j, Tm, Tp;
Tn = ri[WS(rs, 4)];
Tq = ii[WS(rs, 4)];
Tm = W[6];
To = Tm * Tn;
T1j = Tm * Tq;
Tp = W[7];
Tr = FMA(Tp, Tq, To);
T1k = FNMS(Tp, Tn, T1j);
}
T1l = T1i - T1k;
T1r = Tr - Tx;
Ty = Tr + Tx;
T1p = T1k + T1i;
Tj = W[1];
Tl = FMA(Tj, Tk, Ti);
T1o = FNMS(Tj, Th, T1n);
T1g = FNMS(KP500000000, Ty, Tl);
T1q = FNMS(KP500000000, T1p, T1o);
}
/* Elements 2, 5 and 8, times W[2..3], W[8..9] and W[14..15]. */
{
E TB, TE, TC, T12, TR, T17, TL, T15, TA, TD;
TB = ri[WS(rs, 2)];
TE = ii[WS(rs, 2)];
TA = W[2];
TC = TA * TB;
T12 = TA * TE;
{
E TN, TQ, TO, T16, TM, TP;
TN = ri[WS(rs, 8)];
TQ = ii[WS(rs, 8)];
TM = W[14];
TO = TM * TN;
T16 = TM * TQ;
TP = W[15];
TR = FMA(TP, TQ, TO);
T17 = FNMS(TP, TN, T16);
}
{
E TH, TK, TI, T14, TG, TJ;
TH = ri[WS(rs, 5)];
TK = ii[WS(rs, 5)];
TG = W[8];
TI = TG * TH;
T14 = TG * TK;
TJ = W[9];
TL = FMA(TJ, TK, TI);
T15 = FNMS(TJ, TH, T14);
}
T1a = TR - TL;
T1d = T15 - T17;
TS = TL + TR;
T18 = T15 + T17;
TD = W[3];
TF = FMA(TD, TE, TC);
T13 = FNMS(TD, TB, T12);
T19 = FNMS(KP500000000, T18, T13);
T1c = FNMS(KP500000000, TS, TF);
}
/* Outputs 0, 3 and 6. */
{
E Tf, T1S, TU, T1U, T1O, T1P, T1L, T1T;
Tf = T1 + Te;
T1S = T1Q + T1R;
{
E Tz, TT, T1M, T1N;
Tz = Tl + Ty;
TT = TF + TS;
TU = Tz + TT;
T1U = TT - Tz;
T1M = T1o + T1p;
T1N = T13 + T18;
T1O = T1M - T1N;
T1P = T1M + T1N;
}
ri[0] = Tf + TU;
ii[0] = T1P + T1S;
T1L = FNMS(KP500000000, TU, Tf);
ri[WS(rs, 6)] = FNMS(KP866025403, T1O, T1L);
ri[WS(rs, 3)] = FMA(KP866025403, T1O, T1L);
T1T = FNMS(KP500000000, T1P, T1S);
ii[WS(rs, 3)] = FMA(KP866025403, T1U, T1T);
ii[WS(rs, 6)] = FNMS(KP866025403, T1U, T1T);
}
/* Outputs 1, 2, 4, 5, 7 and 8. */
{
E T11, T1z, T1X, T21, T1f, T1w, T1t, T1x, T1u, T1Y, T1C, T1I, T1F, T1J, T1G;
E T22, TV, T1V;
TV = FNMS(KP500000000, Te, T1);
T11 = FMA(KP866025403, T10, TV);
T1z = FNMS(KP866025403, T10, TV);
T1V = FNMS(KP500000000, T1Q, T1R);
T1X = FMA(KP866025403, T1W, T1V);
T21 = FNMS(KP866025403, T1W, T1V);
{
E T1b, T1e, T1m, T1s;
T1b = FMA(KP866025403, T1a, T19);
T1e = FMA(KP866025403, T1d, T1c);
T1f = FMA(KP176326980, T1e, T1b);
T1w = FNMS(KP176326980, T1b, T1e);
T1m = FNMS(KP866025403, T1l, T1g);
T1s = FNMS(KP866025403, T1r, T1q);
T1t = FMA(KP839099631, T1s, T1m);
T1x = FNMS(KP839099631, T1m, T1s);
}
T1u = FMA(KP777861913, T1t, T1f);
T1Y = FNMS(KP777861913, T1x, T1w);
{
E T1A, T1B, T1D, T1E;
T1A = FMA(KP866025403, T1r, T1q);
T1B = FMA(KP866025403, T1l, T1g);
T1C = FMA(KP176326980, T1B, T1A);
T1I = FNMS(KP176326980, T1A, T1B);
T1D = FNMS(KP866025403, T1d, T1c);
T1E = FNMS(KP866025403, T1a, T19);
T1F = FNMS(KP363970234, T1E, T1D);
T1J = FMA(KP363970234, T1D, T1E);
}
T1G = FNMS(KP954188894, T1F, T1C);
T22 = FMA(KP954188894, T1J, T1I);
ri[WS(rs, 1)] = FMA(KP984807753, T1u, T11);
ii[WS(rs, 1)] = FNMS(KP984807753, T1Y, T1X);
ri[WS(rs, 2)] = FMA(KP984807753, T1G, T1z);
ii[WS(rs, 2)] = FNMS(KP984807753, T22, T21);
{
E T1v, T1y, T1Z, T20;
T1v = FNMS(KP492403876, T1u, T11);
T1y = FMA(KP777861913, T1x, T1w);
ri[WS(rs, 4)] = FMA(KP852868531, T1y, T1v);
ri[WS(rs, 7)] = FNMS(KP852868531, T1y, T1v);
T1Z = FMA(KP492403876, T1Y, T1X);
T20 = FNMS(KP777861913, T1t, T1f);
ii[WS(rs, 4)] = FMA(KP852868531, T20, T1Z);
ii[WS(rs, 7)] = FNMS(KP852868531, T20, T1Z);
}
{
E T1H, T1K, T23, T24;
T1H = FNMS(KP492403876, T1G, T1z);
T1K = FNMS(KP954188894, T1J, T1I);
ri[WS(rs, 5)] = FNMS(KP852868531, T1K, T1H);
ri[WS(rs, 8)] = FMA(KP852868531, T1K, T1H);
T23 = FMA(KP492403876, T22, T21);
T24 = FMA(KP954188894, T1F, T1C);
ii[WS(rs, 5)] = FNMS(KP852868531, T24, T23);
ii[WS(rs, 8)] = FMA(KP852868531, T24, T23);
}
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT semantics are defined outside this file;
 * presumably { TW_FULL, 0, 9 } requests the full twiddle set for radix 9 and
 * TW_NEXT terminates the list -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for "t1_9": radix 9, its name, the twiddle table above
 * and &GENUS.  { 24, 16, 72, 0 } matches the "24 additions, 16
 * multiplications, 72 fused multiply/add" figures in the generator comment
 * of this variant.  The meaning of the three trailing zeros is set by the
 * ct_desc declaration, which is not visible here.
 */
static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, { 24, 16, 72, 0 }, 0, 0, 0 };
/* Hands the t1_9 kernel and its descriptor to kdft_dit_register for planner p. */
void X(codelet_t1_9) (planner *p) {
X(kdft_dit_register) (p, t1_9, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */
/*
* This function contains 96 FP additions, 72 FP multiplications,
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
* 41 stack variables, 8 constants, and 36 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_9: in-place size-9 twiddle codelet (variant without -fma).
 *
 * ri, ii : real and imaginary data; element k is read from and written back
 *          to ri[WS(rs, k)] / ii[WS(rs, k)] (element 0 is ri[0] / ii[0]).
 * W      : twiddle table; W[0]..W[15] are consumed per iteration.
 * rs     : stride argument handed to WS() to locate element k.
 * mb, me : the loop runs for m = mb .. me-1.
 * ms     : amount added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; those macros are defined outside this file --
 * confirm.
 */
static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.9396... = cos(20 deg), 0.3420... = sin(20 deg) */
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
/* 0.9848... = cos(10 deg), 0.1736... = sin(10 deg) */
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
/* 0.6427... = sin(40 deg), 0.7660... = cos(40 deg) */
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
/* 1/2 and sqrt(3)/2 */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* W starts at offset mb * 16 and advances by 16 per iteration. */
for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
E T1, T1B, TQ, T1G, Tc, TN, T1A, T1H, TL, T1x, T17, T1o, T1c, T1n, Tu;
E T1w, TW, T1k, T11, T1l;
/* Element 0 (no twiddle) plus elements 3 and 6, times W[4..5] and W[10..11]. */
{
E T6, TO, Tb, TP;
T1 = ri[0];
T1B = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 3)];
T5 = ii[WS(rs, 3)];
T2 = W[4];
T4 = W[5];
/* (T6, TO) = (T2*T3 + T4*T5, T2*T5 - T4*T3) */
T6 = FMA(T2, T3, T4 * T5);
TO = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 6)];
Ta = ii[WS(rs, 6)];
T7 = W[10];
T9 = W[11];
Tb = FMA(T7, T8, T9 * Ta);
TP = FNMS(T9, T8, T7 * Ta);
}
TQ = KP866025403 * (TO - TP);
T1G = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
TN = FNMS(KP500000000, Tc, T1);
T1A = TO + TP;
T1H = FNMS(KP500000000, T1A, T1B);
}
/* Elements 2, 5 and 8, times W[2..3], W[8..9] and W[14..15]. */
{
E Tz, T19, TE, T14, TJ, T15, TK, T1a;
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 2)];
Ty = ii[WS(rs, 2)];
Tv = W[2];
Tx = W[3];
Tz = FMA(Tv, Tw, Tx * Ty);
T19 = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 5)];
TD = ii[WS(rs, 5)];
TA = W[8];
TC = W[9];
TE = FMA(TA, TB, TC * TD);
T14 = FNMS(TC, TB, TA * TD);
}
{
E TG, TI, TF, TH;
TG = ri[WS(rs, 8)];
TI = ii[WS(rs, 8)];
TF = W[14];
TH = W[15];
TJ = FMA(TF, TG, TH * TI);
T15 = FNMS(TH, TG, TF * TI);
}
TK = TE + TJ;
T1a = T14 + T15;
TL = Tz + TK;
T1x = T19 + T1a;
{
E T13, T16, T18, T1b;
T13 = FNMS(KP500000000, TK, Tz);
T16 = KP866025403 * (T14 - T15);
T17 = T13 + T16;
T1o = T13 - T16;
T18 = KP866025403 * (TJ - TE);
T1b = FNMS(KP500000000, T1a, T19);
T1c = T18 + T1b;
T1n = T1b - T18;
}
}
/* Elements 1, 4 and 7, times W[0..1], W[6..7] and W[12..13]. */
{
E Ti, TY, Tn, TT, Ts, TU, Tt, TZ;
{
E Tf, Th, Te, Tg;
Tf = ri[WS(rs, 1)];
Th = ii[WS(rs, 1)];
Te = W[0];
Tg = W[1];
Ti = FMA(Te, Tf, Tg * Th);
TY = FNMS(Tg, Tf, Te * Th);
}
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 4)];
Tm = ii[WS(rs, 4)];
Tj = W[6];
Tl = W[7];
Tn = FMA(Tj, Tk, Tl * Tm);
TT = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 7)];
Tr = ii[WS(rs, 7)];
To = W[12];
Tq = W[13];
Ts = FMA(To, Tp, Tq * Tr);
TU = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn + Ts;
TZ = TT + TU;
Tu = Ti + Tt;
T1w = TY + TZ;
{
E TS, TV, TX, T10;
TS = FNMS(KP500000000, Tt, Ti);
TV = KP866025403 * (TT - TU);
TW = TS + TV;
T1k = TS - TV;
TX = KP866025403 * (Ts - Tn);
T10 = FNMS(KP500000000, TZ, TY);
T11 = TX + T10;
T1l = T10 - TX;
}
}
/* Real parts of outputs 0, 3 and 6. */
{
E T1y, Td, TM, T1v;
T1y = KP866025403 * (T1w - T1x);
Td = T1 + Tc;
TM = Tu + TL;
T1v = FNMS(KP500000000, TM, Td);
ri[0] = Td + TM;
ri[WS(rs, 3)] = T1v + T1y;
ri[WS(rs, 6)] = T1v - T1y;
}
/* Imaginary parts of outputs 0, 3 and 6. */
{
E T1D, T1z, T1C, T1E;
T1D = KP866025403 * (TL - Tu);
T1z = T1w + T1x;
T1C = T1A + T1B;
T1E = FNMS(KP500000000, T1z, T1C);
ii[0] = T1z + T1C;
ii[WS(rs, 6)] = T1E - T1D;
ii[WS(rs, 3)] = T1D + T1E;
}
/* Outputs 1, 4 and 7. */
{
E TR, T1I, T1e, T1J, T1i, T1F, T1f, T1K;
TR = TN + TQ;
T1I = T1G + T1H;
{
E T12, T1d, T1g, T1h;
T12 = FMA(KP766044443, TW, KP642787609 * T11);
T1d = FMA(KP173648177, T17, KP984807753 * T1c);
T1e = T12 + T1d;
T1J = KP866025403 * (T1d - T12);
T1g = FNMS(KP642787609, TW, KP766044443 * T11);
T1h = FNMS(KP984807753, T17, KP173648177 * T1c);
T1i = KP866025403 * (T1g - T1h);
T1F = T1g + T1h;
}
ri[WS(rs, 1)] = TR + T1e;
ii[WS(rs, 1)] = T1F + T1I;
T1f = FNMS(KP500000000, T1e, TR);
ri[WS(rs, 7)] = T1f - T1i;
ri[WS(rs, 4)] = T1f + T1i;
T1K = FNMS(KP500000000, T1F, T1I);
ii[WS(rs, 4)] = T1J + T1K;
ii[WS(rs, 7)] = T1K - T1J;
}
/* Outputs 2, 5 and 8. */
{
E T1j, T1M, T1q, T1N, T1u, T1L, T1r, T1O;
T1j = TN - TQ;
T1M = T1H - T1G;
{
E T1m, T1p, T1s, T1t;
T1m = FMA(KP173648177, T1k, KP984807753 * T1l);
T1p = FNMS(KP939692620, T1o, KP342020143 * T1n);
T1q = T1m + T1p;
T1N = KP866025403 * (T1p - T1m);
T1s = FNMS(KP984807753, T1k, KP173648177 * T1l);
T1t = FMA(KP342020143, T1o, KP939692620 * T1n);
T1u = KP866025403 * (T1s + T1t);
T1L = T1s - T1t;
}
ri[WS(rs, 2)] = T1j + T1q;
ii[WS(rs, 2)] = T1L + T1M;
T1r = FNMS(KP500000000, T1q, T1j);
ri[WS(rs, 8)] = T1r - T1u;
ri[WS(rs, 5)] = T1r + T1u;
T1O = FNMS(KP500000000, T1L, T1M);
ii[WS(rs, 5)] = T1N + T1O;
ii[WS(rs, 8)] = T1O - T1N;
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT semantics are defined outside this file;
 * presumably { TW_FULL, 0, 9 } requests the full twiddle set for radix 9 and
 * TW_NEXT terminates the list -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for "t1_9": radix 9, its name, the twiddle table above
 * and &GENUS.  { 60, 36, 36, 0 } matches the "60 additions, 36
 * multiplications, 36 fused multiply/add" figures in the generator comment
 * of this variant.  The meaning of the three trailing zeros is set by the
 * ct_desc declaration, which is not visible here.
 */
static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, { 60, 36, 36, 0 }, 0, 0, 0 };
/* Hands the t1_9 kernel and its descriptor to kdft_dit_register for planner p. */
void X(codelet_t1_9) (planner *p) {
X(kdft_dit_register) (p, t1_9, &desc);
}
#endif
+509
View File
@@ -0,0 +1,509 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:37 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */
/*
* This function contains 114 FP additions, 94 FP multiplications,
* (or, 48 additions, 28 multiplications, 66 fused multiply/add),
* 63 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_10: in-place size-10 twiddle codelet (variant generated with -fma,
 * -twiddle-log3 and -precompute-twiddles).
 *
 * ri, ii : real and imaginary data; element k is read from and written back
 *          to ri[WS(rs, k)] / ii[WS(rs, k)] (element 0 is ri[0] / ii[0]).
 * W      : twiddle table; only W[0]..W[5] (three pairs) are read per
 *          iteration, and the multipliers for the remaining elements are
 *          derived from products of those three pairs.
 * rs     : stride argument handed to WS() to locate element k.
 * mb, me : the loop runs for m = mb .. me-1.
 * ms     : amount added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; those macros are defined outside this file --
 * confirm.
 */
static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.9510... = sin(72 deg), 0.5590... = sqrt(5)/4, 0.6180... = (sqrt(5) - 1)/2, 1/4 */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
/* W starts at offset mb * 6 and advances by 6 per iteration. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
E T2, T3, T8, Tc, T5, T6, Tl, T7, TB, TF, T12, TY, To, Ts, Tw;
E Tb, Td, Th;
/*
 * Twiddle precomputation.  Stored pairs: (T2, T5) = W[0..1] is applied to
 * element 1, (T3, T6) = W[2..3] to element 3, (T8, Tc) = W[4..5] to element 9.
 * Derived pairs: (T7, Tb) for element 4, (Tl, To) for element 2,
 * (Td, Th) for element 5, (Ts, Tw) for element 7, (TB, TF) for element 8
 * and (TY, T12) for element 6.
 */
{
E TA, TX, TE, T11, Ta, T4;
T2 = W[0];
T3 = W[2];
T4 = T2 * T3;
T8 = W[4];
TA = T2 * T8;
TX = T3 * T8;
Tc = W[5];
TE = T2 * Tc;
T11 = T3 * Tc;
T5 = W[1];
T6 = W[3];
Ta = T2 * T6;
Tl = FMA(T5, T6, T4);
T7 = FNMS(T5, T6, T4);
TB = FMA(T5, Tc, TA);
TF = FNMS(T5, T8, TE);
T12 = FNMS(T6, T8, T11);
TY = FMA(T6, Tc, TX);
{
E Tr, Tv, T9, Tg;
Tr = Tl * T8;
Tv = Tl * Tc;
To = FNMS(T5, T3, Ta);
Ts = FMA(To, Tc, Tr);
Tw = FNMS(To, T8, Tv);
T9 = T7 * T8;
Tg = T7 * Tc;
Tb = FMA(T5, T3, Ta);
Td = FMA(Tb, Tc, T9);
Th = FNMS(Tb, T8, Tg);
}
}
{
E Tk, T1c, T24, T2d, TW, T19, T1a, T1P, T1Q, T1Z, T1g, T1h, T1i, T1C, T1H;
E T2f, Tz, TM, TN, T1S, T1T, T1Y, T1d, T1e, T1f, T1r, T1w, T2e;
/* Element 0 (no twiddle) combined with element 5 times (Td, Th). */
{
E T1, T23, Te, Tf, Ti, T21, Tj, T22;
T1 = ri[0];
T23 = ii[0];
Te = ri[WS(rs, 5)];
Tf = Td * Te;
Ti = ii[WS(rs, 5)];
T21 = Td * Ti;
Tj = FMA(Th, Ti, Tf);
Tk = T1 - Tj;
T1c = T1 + Tj;
T22 = FNMS(Th, Te, T21);
T24 = T22 + T23;
T2d = T23 - T22;
}
/* Elements 4, 1, 9 and 6 after their twiddle multiplications. */
{
E TR, T1z, T18, T1G, TV, T1B, T14, T1E;
{
E TO, TP, TQ, T1y;
TO = ri[WS(rs, 4)];
TP = T7 * TO;
TQ = ii[WS(rs, 4)];
T1y = T7 * TQ;
TR = FMA(Tb, TQ, TP);
T1z = FNMS(Tb, TO, T1y);
}
{
E T15, T16, T17, T1F;
T15 = ri[WS(rs, 1)];
T16 = T2 * T15;
T17 = ii[WS(rs, 1)];
T1F = T2 * T17;
T18 = FMA(T5, T17, T16);
T1G = FNMS(T5, T15, T1F);
}
{
E TS, TT, TU, T1A;
TS = ri[WS(rs, 9)];
TT = T8 * TS;
TU = ii[WS(rs, 9)];
T1A = T8 * TU;
TV = FMA(Tc, TU, TT);
T1B = FNMS(Tc, TS, T1A);
}
{
E TZ, T10, T13, T1D;
TZ = ri[WS(rs, 6)];
T10 = TY * TZ;
T13 = ii[WS(rs, 6)];
T1D = TY * T13;
T14 = FMA(T12, T13, T10);
T1E = FNMS(T12, TZ, T1D);
}
TW = TR - TV;
T19 = T14 - T18;
T1a = TW + T19;
T1P = T1z + T1B;
T1Q = T1E + T1G;
T1Z = T1P + T1Q;
T1g = TR + TV;
T1h = T14 + T18;
T1i = T1g + T1h;
T1C = T1z - T1B;
T1H = T1E - T1G;
T2f = T1C + T1H;
}
/* Elements 2, 3, 7 and 8 after their twiddle multiplications. */
{
E Tq, T1o, TL, T1v, Ty, T1q, TH, T1t;
{
E Tm, Tn, Tp, T1n;
Tm = ri[WS(rs, 2)];
Tn = Tl * Tm;
Tp = ii[WS(rs, 2)];
T1n = Tl * Tp;
Tq = FMA(To, Tp, Tn);
T1o = FNMS(To, Tm, T1n);
}
{
E TI, TJ, TK, T1u;
TI = ri[WS(rs, 3)];
TJ = T3 * TI;
TK = ii[WS(rs, 3)];
T1u = T3 * TK;
TL = FMA(T6, TK, TJ);
T1v = FNMS(T6, TI, T1u);
}
{
E Tt, Tu, Tx, T1p;
Tt = ri[WS(rs, 7)];
Tu = Ts * Tt;
Tx = ii[WS(rs, 7)];
T1p = Ts * Tx;
Ty = FMA(Tw, Tx, Tu);
T1q = FNMS(Tw, Tt, T1p);
}
{
E TC, TD, TG, T1s;
TC = ri[WS(rs, 8)];
TD = TB * TC;
TG = ii[WS(rs, 8)];
T1s = TB * TG;
TH = FMA(TF, TG, TD);
T1t = FNMS(TF, TC, T1s);
}
Tz = Tq - Ty;
TM = TH - TL;
TN = Tz + TM;
T1S = T1o + T1q;
T1T = T1t + T1v;
T1Y = T1S + T1T;
T1d = Tq + Ty;
T1e = TH + TL;
T1f = T1d + T1e;
T1r = T1o - T1q;
T1w = T1t - T1v;
T2e = T1r + T1w;
}
/* Real parts of the odd outputs 1, 3, 5, 7, 9. */
{
E T1l, T1b, T1k, T1J, T1L, T1x, T1I, T1K, T1m;
T1l = TN - T1a;
T1b = TN + T1a;
T1k = FNMS(KP250000000, T1b, Tk);
T1x = T1r - T1w;
T1I = T1C - T1H;
T1J = FMA(KP618033988, T1I, T1x);
T1L = FNMS(KP618033988, T1x, T1I);
ri[WS(rs, 5)] = Tk + T1b;
T1K = FNMS(KP559016994, T1l, T1k);
ri[WS(rs, 7)] = FNMS(KP951056516, T1L, T1K);
ri[WS(rs, 3)] = FMA(KP951056516, T1L, T1K);
T1m = FMA(KP559016994, T1l, T1k);
ri[WS(rs, 9)] = FNMS(KP951056516, T1J, T1m);
ri[WS(rs, 1)] = FMA(KP951056516, T1J, T1m);
}
/* Imaginary parts of the odd outputs 1, 3, 5, 7, 9. */
{
E T2i, T2g, T2h, T2m, T2o, T2k, T2l, T2n, T2j;
T2i = T2e - T2f;
T2g = T2e + T2f;
T2h = FNMS(KP250000000, T2g, T2d);
T2k = Tz - TM;
T2l = TW - T19;
T2m = FMA(KP618033988, T2l, T2k);
T2o = FNMS(KP618033988, T2k, T2l);
ii[WS(rs, 5)] = T2g + T2d;
T2n = FNMS(KP559016994, T2i, T2h);
ii[WS(rs, 3)] = FNMS(KP951056516, T2o, T2n);
ii[WS(rs, 7)] = FMA(KP951056516, T2o, T2n);
T2j = FMA(KP559016994, T2i, T2h);
ii[WS(rs, 1)] = FNMS(KP951056516, T2m, T2j);
ii[WS(rs, 9)] = FMA(KP951056516, T2m, T2j);
}
/* Real parts of the even outputs 0, 2, 4, 6, 8. */
{
E T1N, T1j, T1M, T1V, T1X, T1R, T1U, T1W, T1O;
T1N = T1f - T1i;
T1j = T1f + T1i;
T1M = FNMS(KP250000000, T1j, T1c);
T1R = T1P - T1Q;
T1U = T1S - T1T;
T1V = FNMS(KP618033988, T1U, T1R);
T1X = FMA(KP618033988, T1R, T1U);
ri[0] = T1c + T1j;
T1W = FMA(KP559016994, T1N, T1M);
ri[WS(rs, 4)] = FNMS(KP951056516, T1X, T1W);
ri[WS(rs, 6)] = FMA(KP951056516, T1X, T1W);
T1O = FNMS(KP559016994, T1N, T1M);
ri[WS(rs, 2)] = FNMS(KP951056516, T1V, T1O);
ri[WS(rs, 8)] = FMA(KP951056516, T1V, T1O);
}
/* Imaginary parts of the even outputs 0, 2, 4, 6, 8. */
{
E T26, T20, T25, T2a, T2c, T28, T29, T2b, T27;
T26 = T1Y - T1Z;
T20 = T1Y + T1Z;
T25 = FNMS(KP250000000, T20, T24);
T28 = T1g - T1h;
T29 = T1d - T1e;
T2a = FNMS(KP618033988, T29, T28);
T2c = FMA(KP618033988, T28, T29);
ii[0] = T20 + T24;
T2b = FMA(KP559016994, T26, T25);
ii[WS(rs, 4)] = FMA(KP951056516, T2c, T2b);
ii[WS(rs, 6)] = FNMS(KP951056516, T2c, T2b);
T27 = FNMS(KP559016994, T26, T25);
ii[WS(rs, 2)] = FMA(KP951056516, T2a, T27);
ii[WS(rs, 8)] = FNMS(KP951056516, T2a, T27);
}
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below.  It lists
 * three TW_CEXP entries with last fields 1, 3 and 9; the t2_10 kernel in
 * this variant applies its three stored twiddle pairs to elements 1, 3
 * and 9.
 * NOTE(review): TW_CEXP / TW_NEXT semantics are defined outside this file;
 * presumably each TW_CEXP entry requests one complex exponential and
 * TW_NEXT terminates the list -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for "t2_10": radix 10, its name, the twiddle table
 * above and &GENUS.  { 48, 28, 66, 0 } matches the "48 additions, 28
 * multiplications, 66 fused multiply/add" figures in the generator comment
 * of this variant.  The meaning of the three trailing zeros is set by the
 * ct_desc declaration, which is not visible here.
 */
static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, { 48, 28, 66, 0 }, 0, 0, 0 };
/* Hands the t2_10 kernel and its descriptor to kdft_dit_register for planner p. */
void X(codelet_t2_10) (planner *p) {
X(kdft_dit_register) (p, t2_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */
/*
* This function contains 114 FP additions, 80 FP multiplications,
* (or, 76 additions, 42 multiplications, 38 fused multiply/add),
* 63 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_10 (non-FMA build): in-place radix-10 twiddle butterfly.
 *
 * For every m in [mb, me) the ten complex points held at
 * ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..9, are read, points 1..9 are each
 * scaled by a twiddle pair, and the combined results are written back to the
 * same ten locations.  Each iteration consumes six reals of W (W[0..5]); the
 * remaining twiddle pairs are derived from those by complex products.
 * W is pre-advanced by 6 * mb and steps by 6 per iteration; ri and ii step
 * by ms.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are project macros; the
 * comments below assume they mean a*b + c and c - a*b -- confirm in the
 * headers.  Under that reading every "scale" below has the form
 *     re = wr*xr + wi*xi,   im = wr*xi - wi*xr
 * i.e. multiplication of (xr + i xi) by the conjugate of (wr + i wi).
 *
 * This code is machine generated (see the "Generated by" comment above);
 * statement order and operand order are left exactly as emitted.
 */
static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: ~sin(pi/5), ~sin(2*pi/5), 1/4, ~sqrt(5)/4. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
E T2, T5, T3, T6, T8, Tm, Tc, Tk, T9, Td, Te, TM, TO, Tg, Tp;
E Tv, Tx, Tr;
{
/*
 * Twiddle set-up.  Stored pairs: (T2, T5) = W[0..1], (T3, T6) = W[2..3],
 * (T9, Td) = W[4..5].  Derived pairs:
 *   (T8, Tc) = (T2 + i T5) * (T3 + i T6)
 *   (Tk, Tm) = conj(T2 + i T5) * (T3 + i T6)
 *   (Te, Tg) = conj(T8 + i Tc) * (T9 + i Td)
 *   (TM, TO) = conj(T3 + i T6) * (T9 + i Td)
 *   (Tp, Tr) = conj(Tk + i Tm) * (T9 + i Td)
 *   (Tv, Tx) = conj(T2 + i T5) * (T9 + i Td)
 */
E T4, Tb, T7, Ta;
T2 = W[0];
T5 = W[1];
T3 = W[2];
T6 = W[3];
T4 = T2 * T3;
Tb = T5 * T3;
T7 = T5 * T6;
Ta = T2 * T6;
T8 = T4 - T7;
Tm = Ta - Tb;
Tc = Ta + Tb;
Tk = T4 + T7;
T9 = W[4];
Td = W[5];
Te = FMA(T8, T9, Tc * Td);
TM = FMA(T3, T9, T6 * Td);
TO = FNMS(T6, T9, T3 * Td);
Tg = FNMS(Tc, T9, T8 * Td);
Tp = FMA(Tk, T9, Tm * Td);
Tv = FMA(T2, T9, T5 * Td);
Tx = FNMS(T5, T9, T2 * Td);
Tr = FNMS(Tm, T9, Tk * Td);
}
{
E Tj, T1S, TX, T1G, TL, TU, TV, T1s, T1t, T1C, T11, T12, T13, T1h, T1k;
E T1Q, Tu, TD, TE, T1v, T1w, T1B, TY, TZ, T10, T1a, T1d, T1P;
{
/* Points 0 and 5: point 5 is scaled by (Te, Tg); form difference (Tj, T1S)
 * and sum (TX, T1G) with the unscaled point 0. */
E T1, T1F, Ti, T1E, Tf, Th;
T1 = ri[0];
T1F = ii[0];
Tf = ri[WS(rs, 5)];
Th = ii[WS(rs, 5)];
Ti = FMA(Te, Tf, Tg * Th);
T1E = FNMS(Tg, Tf, Te * Th);
Tj = T1 - Ti;
T1S = T1F - T1E;
TX = T1 + Ti;
T1G = T1E + T1F;
}
{
/* Points 4, 9, 6, 1 scaled by (T8, Tc), (T9, Td), (TM, TO), (T2, T5);
 * then pairwise sums/differences of 4 with 9 and of 6 with 1. */
E TH, T1f, TT, T1j, TK, T1g, TQ, T1i;
{
E TF, TG, TR, TS;
TF = ri[WS(rs, 4)];
TG = ii[WS(rs, 4)];
TH = FMA(T8, TF, Tc * TG);
T1f = FNMS(Tc, TF, T8 * TG);
TR = ri[WS(rs, 1)];
TS = ii[WS(rs, 1)];
TT = FMA(T2, TR, T5 * TS);
T1j = FNMS(T5, TR, T2 * TS);
}
{
E TI, TJ, TN, TP;
TI = ri[WS(rs, 9)];
TJ = ii[WS(rs, 9)];
TK = FMA(T9, TI, Td * TJ);
T1g = FNMS(Td, TI, T9 * TJ);
TN = ri[WS(rs, 6)];
TP = ii[WS(rs, 6)];
TQ = FMA(TM, TN, TO * TP);
T1i = FNMS(TO, TN, TM * TP);
}
TL = TH - TK;
TU = TQ - TT;
TV = TL + TU;
T1s = T1f + T1g;
T1t = T1i + T1j;
T1C = T1s + T1t;
T11 = TH + TK;
T12 = TQ + TT;
T13 = T11 + T12;
T1h = T1f - T1g;
T1k = T1i - T1j;
T1Q = T1h + T1k;
}
{
/* Points 2, 7, 8, 3 scaled by (Tk, Tm), (Tp, Tr), (Tv, Tx), (T3, T6);
 * then pairwise sums/differences of 2 with 7 and of 8 with 3. */
E To, T18, TC, T1c, Tt, T19, Tz, T1b;
{
E Tl, Tn, TA, TB;
Tl = ri[WS(rs, 2)];
Tn = ii[WS(rs, 2)];
To = FMA(Tk, Tl, Tm * Tn);
T18 = FNMS(Tm, Tl, Tk * Tn);
TA = ri[WS(rs, 3)];
TB = ii[WS(rs, 3)];
TC = FMA(T3, TA, T6 * TB);
T1c = FNMS(T6, TA, T3 * TB);
}
{
E Tq, Ts, Tw, Ty;
Tq = ri[WS(rs, 7)];
Ts = ii[WS(rs, 7)];
Tt = FMA(Tp, Tq, Tr * Ts);
T19 = FNMS(Tr, Tq, Tp * Ts);
Tw = ri[WS(rs, 8)];
Ty = ii[WS(rs, 8)];
Tz = FMA(Tv, Tw, Tx * Ty);
T1b = FNMS(Tx, Tw, Tv * Ty);
}
Tu = To - Tt;
TD = Tz - TC;
TE = Tu + TD;
T1v = T18 + T19;
T1w = T1b + T1c;
T1B = T1v + T1w;
TY = To + Tt;
TZ = Tz + TC;
T10 = TY + TZ;
T1a = T18 - T19;
T1d = T1b - T1c;
T1P = T1a + T1d;
}
/* All inputs have been loaded above; the four blocks below only store. */
{
/* Real parts of the odd-indexed outputs 5, 7, 3, 9, 1. */
E T15, TW, T16, T1m, T1o, T1e, T1l, T1n, T17;
T15 = KP559016994 * (TE - TV);
TW = TE + TV;
T16 = FNMS(KP250000000, TW, Tj);
T1e = T1a - T1d;
T1l = T1h - T1k;
T1m = FMA(KP951056516, T1e, KP587785252 * T1l);
T1o = FNMS(KP587785252, T1e, KP951056516 * T1l);
ri[WS(rs, 5)] = Tj + TW;
T1n = T16 - T15;
ri[WS(rs, 7)] = T1n - T1o;
ri[WS(rs, 3)] = T1n + T1o;
T17 = T15 + T16;
ri[WS(rs, 9)] = T17 - T1m;
ri[WS(rs, 1)] = T17 + T1m;
}
{
/* Imaginary parts of the odd-indexed outputs 5, 3, 7, 1, 9. */
E T1R, T1T, T1U, T1Y, T20, T1W, T1X, T1Z, T1V;
T1R = KP559016994 * (T1P - T1Q);
T1T = T1P + T1Q;
T1U = FNMS(KP250000000, T1T, T1S);
T1W = Tu - TD;
T1X = TL - TU;
T1Y = FMA(KP951056516, T1W, KP587785252 * T1X);
T20 = FNMS(KP587785252, T1W, KP951056516 * T1X);
ii[WS(rs, 5)] = T1T + T1S;
T1Z = T1U - T1R;
ii[WS(rs, 3)] = T1Z - T20;
ii[WS(rs, 7)] = T20 + T1Z;
T1V = T1R + T1U;
ii[WS(rs, 1)] = T1V - T1Y;
ii[WS(rs, 9)] = T1Y + T1V;
}
{
/* Real parts of the even-indexed outputs 0, 4, 6, 2, 8. */
E T1q, T14, T1p, T1y, T1A, T1u, T1x, T1z, T1r;
T1q = KP559016994 * (T10 - T13);
T14 = T10 + T13;
T1p = FNMS(KP250000000, T14, TX);
T1u = T1s - T1t;
T1x = T1v - T1w;
T1y = FNMS(KP587785252, T1x, KP951056516 * T1u);
T1A = FMA(KP951056516, T1x, KP587785252 * T1u);
ri[0] = TX + T14;
T1z = T1q + T1p;
ri[WS(rs, 4)] = T1z - T1A;
ri[WS(rs, 6)] = T1z + T1A;
T1r = T1p - T1q;
ri[WS(rs, 2)] = T1r - T1y;
ri[WS(rs, 8)] = T1r + T1y;
}
{
/* Imaginary parts of the even-indexed outputs 0, 4, 6, 2, 8. */
E T1L, T1D, T1K, T1J, T1N, T1H, T1I, T1O, T1M;
T1L = KP559016994 * (T1B - T1C);
T1D = T1B + T1C;
T1K = FNMS(KP250000000, T1D, T1G);
T1H = T11 - T12;
T1I = TY - TZ;
T1J = FNMS(KP587785252, T1I, KP951056516 * T1H);
T1N = FMA(KP951056516, T1I, KP587785252 * T1H);
ii[0] = T1D + T1G;
T1O = T1L + T1K;
ii[WS(rs, 4)] = T1N + T1O;
ii[WS(rs, 6)] = T1O - T1N;
T1M = T1K - T1L;
ii[WS(rs, 2)] = T1J + T1M;
ii[WS(rs, 8)] = T1M - T1J;
}
}
}
}
}
/*
 * Twiddle layout for t2_10: three TW_CEXP entries (last field 1, 3 and 9)
 * followed by a TW_NEXT terminator.  The codelet reads six reals per
 * iteration (W[0..5]), i.e. two reals per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined in the
 * project headers, not here -- confirm before editing.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: radix 10, name "t2_10", the twiddle layout above and
 * &GENUS.  { 76, 42, 38, 0 } repeats the generator's operation count for
 * this variant (76 additions, 42 multiplications, 38 fused multiply/add).
 */
static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, { 76, 42, 38, 0 }, 0, 0, 0 };
/* Registers the t2_10 codelet and its descriptor with planner p. */
void X(codelet_t2_10) (planner *p) {
X(kdft_dit_register) (p, t2_10, &desc);
}
#endif
+836
View File
@@ -0,0 +1,836 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
/*
* This function contains 196 FP additions, 134 FP multiplications,
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
* 90 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_16 (FMA-preferring build): in-place radix-16 twiddle butterfly.
 *
 * For every m in [mb, me) the sixteen complex points held at
 * ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..15, are read, points 1..15 are each
 * scaled by a twiddle pair, and the combined results are written back to the
 * same sixteen locations.  Each iteration consumes eight reals of W
 * (W[0..7]); the remaining twiddle pairs are derived from those.
 * W is pre-advanced by 8 * mb and steps by 8 per iteration; ri and ii step
 * by ms.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are project macros; the
 * comments below assume they mean a*b + c and c - a*b -- confirm in the
 * headers.  Under that reading every "scale" below has the form
 *     re = wr*xr + wi*xi,   im = wr*xi - wi*xr.
 *
 * This code is machine generated (see the "Generated by" comment above);
 * statement order and operand order are left exactly as emitted.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: ~cos(pi/8), ~tan(pi/8) = sqrt(2) - 1, ~sqrt(1/2). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
{
/*
 * Twiddle set-up.  Stored pairs: (T2, T5) = W[0..1], (Tf, Th) = W[2..3],
 * (T3, T6) = W[4..5], (TM, TO) = W[6..7].  Every other (re, im) pair used
 * below is a product of two stored/derived pairs, one of them possibly
 * conjugated; each is consumed by exactly one input point further down.
 */
E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
T2 = W[0];
Tf = W[2];
Tg = T2 * Tf;
TM = W[6];
TN = T2 * TM;
TO = W[7];
TS = T2 * TO;
T3 = W[4];
T4 = T2 * T3;
Tp = Tf * T3;
T6 = W[5];
Ta = T2 * T6;
Tt = Tf * T6;
T5 = W[1];
Th = W[3];
Tl = T2 * Th;
Tz = FMA(T5, Th, Tg);
Ti = FNMS(T5, Th, Tg);
T7 = FMA(T5, T6, T4);
TZ = FNMS(Th, T3, Tt);
TT = FNMS(T5, TM, TS);
Tq = FNMS(Th, T6, Tp);
TW = FMA(Th, T6, Tp);
Tb = FNMS(T5, T3, Ta);
Tu = FMA(Th, T3, Tt);
TP = FMA(T5, TO, TN);
TI = FMA(T5, T3, Ta);
TF = FNMS(T5, T6, T4);
{
/* Second-level products built from (Tz, TC) and (Ti, Tm) with (T3, T6). */
E T1y, T1C, T1e, T1i;
T1y = Tz * T3;
T1C = Tz * T6;
TC = FNMS(T5, Tf, Tl);
T1z = FMA(TC, T6, T1y);
T1O = FMA(TC, T3, T1C);
T1D = FNMS(TC, T3, T1C);
T1L = FNMS(TC, T6, T1y);
T1e = Ti * T3;
T1i = Ti * T6;
Tm = FMA(T5, Tf, Tl);
T1f = FMA(Tm, T6, T1e);
T1p = FMA(Tm, T3, T1i);
T1j = FNMS(Tm, T3, T1i);
T1m = FNMS(Tm, T6, T1e);
}
}
{
E Te, T1U, T3A, T3L, T1G, T2D, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M, T1Z;
E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
E T2d, T38;
{
/* Points 0 and 8: point 8 scaled by (T7, Tb); sum and difference with 0. */
E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
T1 = ri[0];
T3z = ii[0];
T8 = ri[WS(rs, 8)];
T9 = T7 * T8;
Tc = ii[WS(rs, 8)];
T3x = T7 * Tc;
Td = FMA(Tb, Tc, T9);
Te = T1 + Td;
T1U = T1 - Td;
T3y = FNMS(Tb, T8, T3x);
T3A = T3y + T3z;
T3L = T3z - T3y;
}
{
/* Points 15 and 7, scaled by (TM, TO) and (T1z, T1D); sum and difference. */
E T1u, T1v, T1w, T2w, T1A, T1B, T1E, T2y;
T1u = ri[WS(rs, 15)];
T1v = TM * T1u;
T1w = ii[WS(rs, 15)];
T2w = TM * T1w;
T1A = ri[WS(rs, 7)];
T1B = T1z * T1A;
T1E = ii[WS(rs, 7)];
T2y = T1z * T1E;
{
E T1x, T1F, T2x, T2z;
T1x = FMA(TO, T1w, T1v);
T1F = FMA(T1D, T1E, T1B);
T1G = T1x + T1F;
T2D = T1x - T1F;
T2x = FNMS(TO, T1u, T2w);
T2z = FNMS(T1D, T1A, T2y);
T2A = T2x - T2z;
T3h = T2x + T2z;
}
}
{
/* Points 3 and 11, scaled by (Tf, Th) and (T1L, T1O); sum and difference. */
E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
T1H = ri[WS(rs, 3)];
T1I = Tf * T1H;
T1J = ii[WS(rs, 3)];
T2E = Tf * T1J;
T1M = ri[WS(rs, 11)];
T1N = T1L * T1M;
T1P = ii[WS(rs, 11)];
T2G = T1L * T1P;
{
E T1K, T1Q, T2F, T2H;
T1K = FMA(Th, T1J, T1I);
T1Q = FMA(T1O, T1P, T1N);
T1R = T1K + T1Q;
T2B = T1K - T1Q;
T2F = FNMS(Th, T1H, T2E);
T2H = FNMS(T1O, T1M, T2G);
T2I = T2F - T2H;
T3i = T2F + T2H;
}
}
{
/* Points 4 and 12, scaled by (Ti, Tm) and (Tq, Tu); sum and difference. */
E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
Tj = ri[WS(rs, 4)];
Tk = Ti * Tj;
Tn = ii[WS(rs, 4)];
T1V = Ti * Tn;
Tr = ri[WS(rs, 12)];
Ts = Tq * Tr;
Tv = ii[WS(rs, 12)];
T1X = Tq * Tv;
{
E To, Tw, T1W, T1Y;
To = FMA(Tm, Tn, Tk);
Tw = FMA(Tu, Tv, Ts);
Tx = To + Tw;
T3M = To - Tw;
T1W = FNMS(Tm, Tj, T1V);
T1Y = FNMS(Tu, Tr, T1X);
T1Z = T1W - T1Y;
T3w = T1W + T1Y;
}
}
{
/* Points 2 and 10, scaled by (Tz, TC) and (TF, TI); sum and difference. */
E TA, TB, TD, T21, TG, TH, TJ, T23;
TA = ri[WS(rs, 2)];
TB = Tz * TA;
TD = ii[WS(rs, 2)];
T21 = Tz * TD;
TG = ri[WS(rs, 10)];
TH = TF * TG;
TJ = ii[WS(rs, 10)];
T23 = TF * TJ;
{
E TE, TK, T22, T24;
TE = FMA(TC, TD, TB);
TK = FMA(TI, TJ, TH);
TL = TE + TK;
T26 = TE - TK;
T22 = FNMS(TC, TA, T21);
T24 = FNMS(TI, TG, T23);
T25 = T22 - T24;
T37 = T22 + T24;
}
}
{
/* Points 1 and 9, scaled by (T2, T5) and (T3, T6); sum and difference. */
E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
T15 = ri[WS(rs, 1)];
T16 = T2 * T15;
T17 = ii[WS(rs, 1)];
T2h = T2 * T17;
T19 = ri[WS(rs, 9)];
T1a = T3 * T19;
T1b = ii[WS(rs, 9)];
T2j = T3 * T1b;
{
E T18, T1c, T2i, T2k;
T18 = FMA(T5, T17, T16);
T1c = FMA(T6, T1b, T1a);
T1d = T18 + T1c;
T2o = T18 - T1c;
T2i = FNMS(T5, T15, T2h);
T2k = FNMS(T6, T19, T2j);
T2l = T2i - T2k;
T3c = T2i + T2k;
}
}
{
/* Points 5 and 13, scaled by (T1f, T1j) and (T1m, T1p); sum and difference. */
E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
T1g = ri[WS(rs, 5)];
T1h = T1f * T1g;
T1k = ii[WS(rs, 5)];
T2p = T1f * T1k;
T1n = ri[WS(rs, 13)];
T1o = T1m * T1n;
T1q = ii[WS(rs, 13)];
T2r = T1m * T1q;
{
E T1l, T1r, T2q, T2s;
T1l = FMA(T1j, T1k, T1h);
T1r = FMA(T1p, T1q, T1o);
T1s = T1l + T1r;
T2m = T1l - T1r;
T2q = FNMS(T1j, T1g, T2p);
T2s = FNMS(T1p, T1n, T2r);
T2t = T2q - T2s;
T3d = T2q + T2s;
}
}
{
/* Points 14 and 6, scaled by (TP, TT) and (TW, TZ); sum and difference. */
E TQ, TR, TU, T29, TX, TY, T10, T2b;
TQ = ri[WS(rs, 14)];
TR = TP * TQ;
TU = ii[WS(rs, 14)];
T29 = TP * TU;
TX = ri[WS(rs, 6)];
TY = TW * TX;
T10 = ii[WS(rs, 6)];
T2b = TW * T10;
{
E TV, T11, T2a, T2c;
TV = FMA(TT, TU, TR);
T11 = FMA(TZ, T10, TY);
T12 = TV + T11;
T28 = TV - T11;
T2a = FNMS(TT, TQ, T29);
T2c = FNMS(TZ, TX, T2b);
T2d = T2a - T2c;
T38 = T2a + T2c;
}
}
/* All inputs have been loaded above; everything below combines and stores. */
{
/* Outputs 0, 4, 8, 12: built from the pairwise sums only. */
E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
{
E Ty, T13, T3v, T3B;
Ty = Te + Tx;
T13 = TL + T12;
T14 = Ty + T13;
T3q = Ty - T13;
T3v = T37 + T38;
T3B = T3w + T3A;
T3C = T3v + T3B;
T3E = T3B - T3v;
}
{
E T1t, T1S, T3r, T3s;
T1t = T1d + T1s;
T1S = T1G + T1R;
T1T = T1t + T1S;
T3D = T1S - T1t;
T3r = T3c + T3d;
T3s = T3h + T3i;
T3t = T3r - T3s;
T3u = T3r + T3s;
}
ri[WS(rs, 8)] = T14 - T1T;
ii[WS(rs, 8)] = T3C - T3u;
ri[0] = T14 + T1T;
ii[0] = T3u + T3C;
ri[WS(rs, 12)] = T3q - T3t;
ii[WS(rs, 12)] = T3E - T3D;
ri[WS(rs, 4)] = T3q + T3t;
ii[WS(rs, 4)] = T3D + T3E;
}
{
/* Outputs 2, 6, 10, 14: use the KP707106781 rotation. */
E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
{
E T36, T39, T3F, T3G;
T36 = Te - Tx;
T39 = T37 - T38;
T3a = T36 + T39;
T3m = T36 - T39;
T3F = T12 - TL;
T3G = T3A - T3w;
T3H = T3F + T3G;
T3J = T3G - T3F;
}
{
E T3b, T3e, T3g, T3j;
T3b = T1d - T1s;
T3e = T3c - T3d;
T3f = T3b + T3e;
T3n = T3e - T3b;
T3g = T1G - T1R;
T3j = T3h - T3i;
T3k = T3g - T3j;
T3o = T3g + T3j;
}
{
E T3l, T3I, T3p, T3K;
T3l = T3f + T3k;
ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a);
ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
T3I = T3n + T3o;
ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H);
T3p = T3n - T3o;
ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m);
ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
T3K = T3k - T3f;
ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J);
}
}
{
/* Odd outputs 1, 3, ..., 15: use KP414213562, KP707106781 and KP923879532. */
E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
E T2O;
{
E T27, T2e, T2n, T2u;
T20 = T1U - T1Z;
T3N = T3L - T3M;
T3T = T3M + T3L;
T2Q = T1U + T1Z;
T27 = T25 - T26;
T2e = T28 + T2d;
T2f = T27 - T2e;
T3O = T27 + T2e;
{
E T2Y, T2Z, T2R, T2S;
T2Y = T2D + T2I;
T2Z = T2A - T2B;
T30 = FNMS(KP414213562, T2Z, T2Y);
T34 = FMA(KP414213562, T2Y, T2Z);
T2R = T26 + T25;
T2S = T28 - T2d;
T2T = T2R + T2S;
T3U = T2S - T2R;
}
T2n = T2l + T2m;
T2u = T2o - T2t;
T2v = FMA(KP414213562, T2u, T2n);
T2N = FNMS(KP414213562, T2n, T2u);
{
E T2V, T2W, T2C, T2J;
T2V = T2o + T2t;
T2W = T2l - T2m;
T2X = FMA(KP414213562, T2W, T2V);
T33 = FNMS(KP414213562, T2V, T2W);
T2C = T2A + T2B;
T2J = T2D - T2I;
T2K = FNMS(KP414213562, T2J, T2C);
T2O = FMA(KP414213562, T2C, T2J);
}
}
{
/* Outputs 3 and 11. */
E T2g, T2L, T3V, T3W;
T2g = FMA(KP707106781, T2f, T20);
T2L = T2v - T2K;
ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g);
ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
T3V = FMA(KP707106781, T3U, T3T);
T3W = T2O - T2N;
ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V);
}
{
/* Outputs 7 and 15. */
E T2M, T2P, T3X, T3Y;
T2M = FNMS(KP707106781, T2f, T20);
T2P = T2N + T2O;
ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M);
T3X = FNMS(KP707106781, T3U, T3T);
T3Y = T2v + T2K;
ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X);
ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X);
}
{
/* Outputs 1 and 9. */
E T2U, T31, T3P, T3Q;
T2U = FMA(KP707106781, T2T, T2Q);
T31 = T2X + T30;
ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U);
ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
T3P = FMA(KP707106781, T3O, T3N);
T3Q = T33 + T34;
ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P);
}
{
/* Outputs 5 and 13. */
E T32, T35, T3R, T3S;
T32 = FNMS(KP707106781, T2T, T2Q);
T35 = T33 - T34;
ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32);
ri[WS(rs, 5)] = FMA(KP923879532, T35, T32);
T3R = FNMS(KP707106781, T3O, T3N);
T3S = T30 - T2X;
ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R);
}
}
}
}
}
}
/*
 * Twiddle layout for t2_16: four TW_CEXP entries (last field 1, 3, 9 and 15)
 * followed by a TW_NEXT terminator.  The codelet reads eight reals per
 * iteration (W[0..7]), i.e. two reals per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined in the
 * project headers, not here -- confirm before editing.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 9 },
{ TW_CEXP, 0, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: radix 16, name "t2_16", the twiddle layout above and
 * &GENUS.  { 104, 42, 92, 0 } repeats the generator's operation count for
 * this variant (104 additions, 42 multiplications, 92 fused multiply/add).
 */
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 104, 42, 92, 0 }, 0, 0, 0 };
/* Registers the t2_16 codelet and its descriptor with planner p. */
void X(codelet_t2_16) (planner *p) {
X(kdft_dit_register) (p, t2_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
/*
* This function contains 196 FP additions, 108 FP multiplications,
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
* 82 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_16 (non-FMA build): in-place radix-16 twiddle butterfly.
 *
 * For every m in [mb, me) the sixteen complex points held at
 * ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..15, are read, points 1..15 are each
 * scaled by a twiddle pair, and the combined results are written back to the
 * same sixteen locations.  Each iteration consumes eight reals of W
 * (W[0..7]); the remaining twiddle pairs are derived from those.
 * W is pre-advanced by 8 * mb and steps by 8 per iteration; ri and ii step
 * by ms.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are project macros; the
 * comments below assume they mean a*b + c and c - a*b -- confirm in the
 * headers.  Under that reading every "scale" below has the form
 *     re = wr*xr + wi*xi,   im = wr*xi - wi*xr.
 *
 * This code is machine generated (see the "Generated by" comment above);
 * statement order and operand order are left exactly as emitted.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: ~sin(pi/8), ~cos(pi/8), ~sqrt(1/2). */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
{
/*
 * Twiddle set-up.  Stored pairs: (T2, T5) = W[0..1], (Tg, Ti) = W[2..3],
 * (T3, T6) = W[4..5], (TN, TO) = W[6..7].  Every other (re, im) pair used
 * below is a product of two stored/derived pairs, one of them possibly
 * conjugated; each is consumed by exactly one input point further down.
 */
E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
{
E Th, Tn, Tj, Tm;
T2 = W[0];
T5 = W[1];
Tg = W[2];
Ti = W[3];
Th = T2 * Tg;
Tn = T5 * Tg;
Tj = T5 * Ti;
Tm = T2 * Ti;
Tk = Th - Tj;
To = Tm + Tn;
TE = Tm - Tn;
TC = Th + Tj;
T6 = W[5];
T7 = T5 * T6;
Tv = Tg * T6;
Ta = T2 * T6;
Ts = Ti * T6;
T3 = W[4];
T4 = T2 * T3;
Tw = Ti * T3;
Tb = T5 * T3;
Tr = Tg * T3;
}
T8 = T4 + T7;
TW = Tv - Tw;
TJ = Ta + Tb;
Tt = Tr - Ts;
TU = Tr + Ts;
Tc = Ta - Tb;
Tx = Tv + Tw;
TH = T4 - T7;
TN = W[6];
TO = W[7];
TP = FMA(T2, TN, T5 * TO);
TR = FNMS(T5, TN, T2 * TO);
{
/* Products of (Tk, To) with (T3, T6). */
E T1d, T1e, T19, T1a;
T1d = Tk * T6;
T1e = To * T3;
T1f = T1d - T1e;
T1k = T1d + T1e;
T19 = Tk * T3;
T1a = To * T6;
T1b = T19 + T1a;
T1i = T19 - T1a;
}
{
/* Products of (TC, TE) with (T3, T6). */
E T1w, T1x, T1s, T1t;
T1w = TC * T6;
T1x = TE * T3;
T1y = T1w - T1x;
T1H = T1w + T1x;
T1s = TC * T3;
T1t = TE * T6;
T1u = T1s + T1t;
T1F = T1s - T1t;
}
}
{
E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
E T2S, T2T, T28, T2A, T2d, T2B;
{
/* Points 0 and 8: point 8 scaled by (T8, Tc); sum and difference with 0. */
E T1, T3d, Te, T3c, T9, Td;
T1 = ri[0];
T3d = ii[0];
T9 = ri[WS(rs, 8)];
Td = ii[WS(rs, 8)];
Te = FMA(T8, T9, Tc * Td);
T3c = FNMS(Tc, T9, T8 * Td);
Tf = T1 + Te;
T3r = T3d - T3c;
T1N = T1 - Te;
T3e = T3c + T3d;
}
{
/* Points 4 and 12, scaled by (Tk, To) and (Tt, Tx); sum and difference. */
E Tq, T1O, Tz, T1P;
{
E Tl, Tp, Tu, Ty;
Tl = ri[WS(rs, 4)];
Tp = ii[WS(rs, 4)];
Tq = FMA(Tk, Tl, To * Tp);
T1O = FNMS(To, Tl, Tk * Tp);
Tu = ri[WS(rs, 12)];
Ty = ii[WS(rs, 12)];
Tz = FMA(Tt, Tu, Tx * Ty);
T1P = FNMS(Tx, Tu, Tt * Ty);
}
TA = Tq + Tz;
T3s = Tq - Tz;
T1Q = T1O - T1P;
T3b = T1O + T1P;
}
{
/* Points 2 and 10, scaled by (TC, TE) and (TH, TJ). */
E TG, T1S, TL, T1T, T1U, T1V;
{
E TD, TF, TI, TK;
TD = ri[WS(rs, 2)];
TF = ii[WS(rs, 2)];
TG = FMA(TC, TD, TE * TF);
T1S = FNMS(TE, TD, TC * TF);
TI = ri[WS(rs, 10)];
TK = ii[WS(rs, 10)];
TL = FMA(TH, TI, TJ * TK);
T1T = FNMS(TJ, TI, TH * TK);
}
TM = TG + TL;
T2M = T1S + T1T;
T1U = T1S - T1T;
T1V = TG - TL;
T1W = T1U - T1V;
T2w = T1V + T1U;
}
{
/* Points 14 and 6, scaled by (TP, TR) and (TU, TW). */
E TT, T1Y, TY, T1Z, T1X, T20;
{
E TQ, TS, TV, TX;
TQ = ri[WS(rs, 14)];
TS = ii[WS(rs, 14)];
TT = FMA(TP, TQ, TR * TS);
T1Y = FNMS(TR, TQ, TP * TS);
TV = ri[WS(rs, 6)];
TX = ii[WS(rs, 6)];
TY = FMA(TU, TV, TW * TX);
T1Z = FNMS(TW, TV, TU * TX);
}
TZ = TT + TY;
T2N = T1Y + T1Z;
T1X = TT - TY;
T20 = T1Y - T1Z;
T21 = T1X + T20;
T2x = T1X - T20;
}
{
/* Points 15, 11, 7, 3, scaled by (TN, TO), (T1F, T1H), (T1u, T1y), (Tg, Ti). */
E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
{
E T1p, T1q, T1G, T1I;
T1p = ri[WS(rs, 15)];
T1q = ii[WS(rs, 15)];
T1r = FMA(TN, T1p, TO * T1q);
T2k = FNMS(TO, T1p, TN * T1q);
T1G = ri[WS(rs, 11)];
T1I = ii[WS(rs, 11)];
T1J = FMA(T1F, T1G, T1H * T1I);
T2h = FNMS(T1H, T1G, T1F * T1I);
}
{
E T1v, T1z, T1C, T1D;
T1v = ri[WS(rs, 7)];
T1z = ii[WS(rs, 7)];
T1A = FMA(T1u, T1v, T1y * T1z);
T2l = FNMS(T1y, T1v, T1u * T1z);
T1C = ri[WS(rs, 3)];
T1D = ii[WS(rs, 3)];
T1E = FMA(Tg, T1C, Ti * T1D);
T2g = FNMS(Ti, T1C, Tg * T1D);
}
T1B = T1r + T1A;
T1K = T1E + T1J;
T2V = T1B - T1K;
T2W = T2k + T2l;
T2X = T2g + T2h;
T2Y = T2W - T2X;
{
E T2f, T2i, T2m, T2n;
T2f = T1r - T1A;
T2i = T2g - T2h;
T2j = T2f - T2i;
T2D = T2f + T2i;
T2m = T2k - T2l;
T2n = T1E - T1J;
T2o = T2m + T2n;
T2E = T2m - T2n;
}
}
{
/* Points 1, 13, 9, 5, scaled by (T2, T5), (T1i, T1k), (T3, T6), (T1b, T1f). */
E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
{
E T12, T13, T1j, T1l;
T12 = ri[WS(rs, 1)];
T13 = ii[WS(rs, 1)];
T14 = FMA(T2, T12, T5 * T13);
T24 = FNMS(T5, T12, T2 * T13);
T1j = ri[WS(rs, 13)];
T1l = ii[WS(rs, 13)];
T1m = FMA(T1i, T1j, T1k * T1l);
T2b = FNMS(T1k, T1j, T1i * T1l);
}
{
E T15, T16, T1c, T1g;
T15 = ri[WS(rs, 9)];
T16 = ii[WS(rs, 9)];
T17 = FMA(T3, T15, T6 * T16);
T25 = FNMS(T6, T15, T3 * T16);
T1c = ri[WS(rs, 5)];
T1g = ii[WS(rs, 5)];
T1h = FMA(T1b, T1c, T1f * T1g);
T2a = FNMS(T1f, T1c, T1b * T1g);
}
T18 = T14 + T17;
T1n = T1h + T1m;
T2Q = T18 - T1n;
T2R = T24 + T25;
T2S = T2a + T2b;
T2T = T2R - T2S;
{
E T26, T27, T29, T2c;
T26 = T24 - T25;
T27 = T1h - T1m;
T28 = T26 + T27;
T2A = T26 - T27;
T29 = T14 - T17;
T2c = T2a - T2b;
T2d = T29 - T2c;
T2B = T29 + T2c;
}
}
/* All inputs have been loaded above; the four blocks below only store. */
{
/* Outputs 3, 7, 11, 15. */
E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
{
E T1R, T22, T3y, T3z;
T1R = T1N - T1Q;
T22 = KP707106781 * (T1W - T21);
T23 = T1R + T22;
T2r = T1R - T22;
T3y = KP707106781 * (T2x - T2w);
T3z = T3s + T3r;
T3A = T3y + T3z;
T3C = T3z - T3y;
}
{
E T2e, T2p, T2s, T2t;
T2e = FMA(KP923879532, T28, KP382683432 * T2d);
T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
T2q = T2e + T2p;
T3B = T2p - T2e;
T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
T2u = T2s - T2t;
T3x = T2s + T2t;
}
ri[WS(rs, 11)] = T23 - T2q;
ii[WS(rs, 11)] = T3A - T3x;
ri[WS(rs, 3)] = T23 + T2q;
ii[WS(rs, 3)] = T3x + T3A;
ri[WS(rs, 15)] = T2r - T2u;
ii[WS(rs, 15)] = T3C - T3B;
ri[WS(rs, 7)] = T2r + T2u;
ii[WS(rs, 7)] = T3B + T3C;
}
{
/* Outputs 2, 6, 10, 14. */
E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
{
E T2L, T2O, T3k, T3l;
T2L = Tf - TA;
T2O = T2M - T2N;
T2P = T2L + T2O;
T31 = T2L - T2O;
T3k = TZ - TM;
T3l = T3e - T3b;
T3m = T3k + T3l;
T3o = T3l - T3k;
}
{
E T2U, T2Z, T32, T33;
T2U = T2Q + T2T;
T2Z = T2V - T2Y;
T30 = KP707106781 * (T2U + T2Z);
T3n = KP707106781 * (T2Z - T2U);
T32 = T2T - T2Q;
T33 = T2V + T2Y;
T34 = KP707106781 * (T32 - T33);
T3j = KP707106781 * (T32 + T33);
}
ri[WS(rs, 10)] = T2P - T30;
ii[WS(rs, 10)] = T3m - T3j;
ri[WS(rs, 2)] = T2P + T30;
ii[WS(rs, 2)] = T3j + T3m;
ri[WS(rs, 14)] = T31 - T34;
ii[WS(rs, 14)] = T3o - T3n;
ri[WS(rs, 6)] = T31 + T34;
ii[WS(rs, 6)] = T3n + T3o;
}
{
/* Outputs 1, 5, 9, 13. */
E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
{
E T2v, T2y, T3q, T3t;
T2v = T1N + T1Q;
T2y = KP707106781 * (T2w + T2x);
T2z = T2v + T2y;
T2H = T2v - T2y;
T3q = KP707106781 * (T1W + T21);
T3t = T3r - T3s;
T3u = T3q + T3t;
T3w = T3t - T3q;
}
{
E T2C, T2F, T2I, T2J;
T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
T2G = T2C + T2F;
T3v = T2F - T2C;
T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
T2K = T2I - T2J;
T3p = T2I + T2J;
}
ri[WS(rs, 9)] = T2z - T2G;
ii[WS(rs, 9)] = T3u - T3p;
ri[WS(rs, 1)] = T2z + T2G;
ii[WS(rs, 1)] = T3p + T3u;
ri[WS(rs, 13)] = T2H - T2K;
ii[WS(rs, 13)] = T3w - T3v;
ri[WS(rs, 5)] = T2H + T2K;
ii[WS(rs, 5)] = T3v + T3w;
}
{
/* Outputs 0, 4, 8, 12: built from the pairwise sums only. */
E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
{
E TB, T10, T3a, T3f;
TB = Tf + TA;
T10 = TM + TZ;
T11 = TB + T10;
T35 = TB - T10;
T3a = T2M + T2N;
T3f = T3b + T3e;
T3g = T3a + T3f;
T3i = T3f - T3a;
}
{
E T1o, T1L, T36, T37;
T1o = T18 + T1n;
T1L = T1B + T1K;
T1M = T1o + T1L;
T3h = T1L - T1o;
T36 = T2R + T2S;
T37 = T2W + T2X;
T38 = T36 - T37;
T39 = T36 + T37;
}
ri[WS(rs, 8)] = T11 - T1M;
ii[WS(rs, 8)] = T3g - T39;
ri[0] = T11 + T1M;
ii[0] = T39 + T3g;
ri[WS(rs, 12)] = T35 - T38;
ii[WS(rs, 12)] = T3i - T3h;
ri[WS(rs, 4)] = T35 + T38;
ii[WS(rs, 4)] = T3h + T3i;
}
}
}
}
}
/*
 * Twiddle layout for t2_16: four TW_CEXP entries (last field 1, 3, 9 and 15)
 * followed by a TW_NEXT terminator.  The codelet reads eight reals per
 * iteration (W[0..7]), i.e. two reals per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined in the
 * project headers, not here -- confirm before editing.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 9 },
{ TW_CEXP, 0, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: radix 16, name "t2_16", the twiddle layout above and
 * &GENUS.  { 156, 68, 40, 0 } repeats the generator's operation count for
 * this variant (156 additions, 68 multiplications, 40 fused multiply/add).
 */
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 156, 68, 40, 0 }, 0, 0, 0 };
/* Registers the t2_16 codelet and its descriptor with planner p. */
void X(codelet_t2_16) (planner *p) {
X(kdft_dit_register) (p, t2_16, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+200
View File
@@ -0,0 +1,200 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_4 (FMA-preferring build): in-place radix-4 twiddle butterfly.
 *
 * For each m in [mb, me) the four complex points stored at ri/ii[0],
 * ri/ii[WS(rs, 1)], ri/ii[WS(rs, 2)] and ri/ii[WS(rs, 3)] are read, points
 * 1..3 are scaled by twiddle pairs, and the four combined results are
 * written back to the same locations.  Two twiddle pairs are stored per
 * iteration (W[0..1] and W[2..3]); the pair applied to point 2 is derived
 * from them.  W is pre-advanced by 4 * mb and then steps by 4 per iteration,
 * while ri and ii step by ms.
 *
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) come from the project headers
 * and are taken here to mean a*b + c / c - a*b -- confirm there.
 */
static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 4;
     for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          E tw1_re, tw3_im, tw3_re, tw1_im, tw2_re, tw2_im, prod_rr, prod_ri;
          E in0_re, in0_im, pt2_re, pt2_im, pt1_re, pt1_im, pt3_re, pt3_im;

          tw1_re = W[0];
          tw3_im = W[3];
          tw3_re = W[2];
          prod_rr = tw1_re * tw3_re;
          prod_ri = tw1_re * tw3_im;
          tw1_im = W[1];
          /* derived pair: (tw3_re + i tw3_im) * conj(tw1_re + i tw1_im) */
          tw2_re = FMA(tw1_im, tw3_im, prod_rr);
          tw2_im = FNMS(tw1_im, tw3_re, prod_ri);

          in0_re = ri[0];
          in0_im = ii[0];
          {
               /* point 2, scaled by the derived pair */
               E x_re, part_re, x_im, part_im;
               x_re = ri[WS(rs, 2)];
               part_re = tw2_re * x_re;
               x_im = ii[WS(rs, 2)];
               part_im = tw2_re * x_im;
               pt2_re = FMA(tw2_im, x_im, part_re);
               pt2_im = FNMS(tw2_im, x_re, part_im);
          }
          {
               /* point 1, scaled by the pair in W[0..1] */
               E x_re, part_re, x_im, part_im;
               x_re = ri[WS(rs, 1)];
               part_re = tw1_re * x_re;
               x_im = ii[WS(rs, 1)];
               part_im = tw1_re * x_im;
               pt1_re = FMA(tw1_im, x_im, part_re);
               pt1_im = FNMS(tw1_im, x_re, part_im);
          }
          {
               /* point 3, scaled by the pair in W[2..3] */
               E x_re, part_re, x_im, part_im;
               x_re = ri[WS(rs, 3)];
               part_re = tw3_re * x_re;
               x_im = ii[WS(rs, 3)];
               part_im = tw3_re * x_im;
               pt3_re = FMA(tw3_im, x_im, part_re);
               pt3_im = FNMS(tw3_im, x_re, part_im);
          }
          {
               /* outputs 0 and 2: sums of (0, 2) combined with sums of (1, 3) */
               E sum02_re, sum13_re, sum13_im, sum02_im;
               sum02_re = in0_re + pt2_re;
               sum13_re = pt1_re + pt3_re;
               ri[WS(rs, 2)] = sum02_re - sum13_re;
               ri[0] = sum02_re + sum13_re;
               sum13_im = pt1_im + pt3_im;
               sum02_im = pt2_im + in0_im;
               ii[0] = sum13_im + sum02_im;
               ii[WS(rs, 2)] = sum02_im - sum13_im;
          }
          {
               /* outputs 1 and 3: differences of (0, 2) crossed with differences of (1, 3) */
               E dif02_re, dif13_im, dif02_im, dif13_re;
               dif02_re = in0_re - pt2_re;
               dif13_im = pt1_im - pt3_im;
               ri[WS(rs, 3)] = dif02_re - dif13_im;
               ri[WS(rs, 1)] = dif02_re + dif13_im;
               dif02_im = in0_im - pt2_im;
               dif13_re = pt1_re - pt3_re;
               ii[WS(rs, 1)] = dif02_im - dif13_re;
               ii[WS(rs, 3)] = dif13_re + dif02_im;
          }
     }
}
/*
 * Twiddle layout for t2_4: two TW_CEXP entries (last field 1 and 3) followed
 * by a TW_NEXT terminator.  The codelet reads four reals per iteration
 * (W[0..3]), i.e. two reals per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined in the
 * project headers, not here -- confirm before editing.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: radix 4, name "t2_4", the twiddle layout above and
 * &GENUS.  { 16, 8, 8, 0 } repeats the generator's operation count
 * (16 additions, 8 multiplications, 8 fused multiply/add).
 */
static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 };
/* Registers the t2_4 codelet and its descriptor with planner p. */
void X(codelet_t2_4) (planner *p) {
X(kdft_dit_register) (p, t2_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_4 (non-FMA build): in-place radix-4 twiddle butterfly.
 *
 * For each m in [mb, me) the four complex points stored at ri/ii[0],
 * ri/ii[WS(rs, 1)], ri/ii[WS(rs, 2)] and ri/ii[WS(rs, 3)] are read, points
 * 1..3 are scaled by twiddle pairs, and the four combined results are
 * written back to the same locations.  Two twiddle pairs are stored per
 * iteration (W[0..1] and W[2..3]); the pair applied to point 2 is derived
 * from them.  W is pre-advanced by 4 * mb and then steps by 4 per iteration,
 * while ri and ii step by ms.
 *
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) come from the project headers
 * and are taken here to mean a*b + c / c - a*b -- confirm there.
 */
static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 4;
     for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          E tw1_re, tw1_im, tw3_re, tw3_im, tw2_re, tw2_im;
          E in0_re, in0_im, pt2_re, pt2_im, pt1_re, pt1_im, pt3_re, pt3_im;
          E x2_re, x2_im;

          tw1_re = W[0];
          tw1_im = W[1];
          tw3_re = W[2];
          tw3_im = W[3];
          /* derived pair: (tw3_re + i tw3_im) * conj(tw1_re + i tw1_im) */
          tw2_re = FMA(tw1_re, tw3_re, tw1_im * tw3_im);
          tw2_im = FNMS(tw1_im, tw3_re, tw1_re * tw3_im);

          in0_re = ri[0];
          in0_im = ii[0];
          /* point 2, scaled by the derived pair */
          x2_re = ri[WS(rs, 2)];
          x2_im = ii[WS(rs, 2)];
          pt2_re = FMA(tw2_re, x2_re, tw2_im * x2_im);
          pt2_im = FNMS(tw2_im, x2_re, tw2_re * x2_im);
          {
               /* points 1 and 3, scaled by the stored pairs */
               E x1_re, x1_im, x3_re, x3_im;
               x1_re = ri[WS(rs, 1)];
               x1_im = ii[WS(rs, 1)];
               pt1_re = FMA(tw1_re, x1_re, tw1_im * x1_im);
               pt1_im = FNMS(tw1_im, x1_re, tw1_re * x1_im);
               x3_re = ri[WS(rs, 3)];
               x3_im = ii[WS(rs, 3)];
               pt3_re = FMA(tw3_re, x3_re, tw3_im * x3_im);
               pt3_im = FNMS(tw3_im, x3_re, tw3_re * x3_im);
          }
          {
               /* outputs 0 and 2: sums of (0, 2) combined with sums of (1, 3) */
               E sum02_re, sum13_re, sum13_im, sum02_im;
               sum02_re = in0_re + pt2_re;
               sum13_re = pt1_re + pt3_re;
               ri[WS(rs, 2)] = sum02_re - sum13_re;
               ri[0] = sum02_re + sum13_re;
               sum13_im = pt1_im + pt3_im;
               sum02_im = pt2_im + in0_im;
               ii[0] = sum13_im + sum02_im;
               ii[WS(rs, 2)] = sum02_im - sum13_im;
          }
          {
               /* outputs 1 and 3: differences of (0, 2) crossed with differences of (1, 3) */
               E dif02_re, dif13_im, dif02_im, dif13_re;
               dif02_re = in0_re - pt2_re;
               dif13_im = pt1_im - pt3_im;
               ri[WS(rs, 3)] = dif02_re - dif13_im;
               ri[WS(rs, 1)] = dif02_re + dif13_im;
               dif02_im = in0_im - pt2_im;
               dif13_re = pt1_re - pt3_re;
               ii[WS(rs, 1)] = dif02_im - dif13_re;
               ii[WS(rs, 3)] = dif13_re + dif02_im;
          }
     }
}
/*
 * Twiddle layout for t2_4: two TW_CEXP entries (last field 1 and 3) followed
 * by a TW_NEXT terminator.  The codelet reads four reals per iteration
 * (W[0..3]), i.e. two reals per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined in the
 * project headers, not here -- confirm before editing.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: radix 4, name "t2_4", the twiddle layout above and
 * &GENUS.  { 16, 8, 8, 0 } repeats the generator's operation count
 * (16 additions, 8 multiplications, 8 fused multiply/add).
 */
static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 };
/* Registers the t2_4 codelet and its descriptor with planner p. */
void X(codelet_t2_4) (planner *p) {
X(kdft_dit_register) (p, t2_4, &desc);
}
#endif
+264
View File
@@ -0,0 +1,264 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:37 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
/*
* This function contains 44 FP additions, 40 FP multiplications,
* (or, 14 additions, 10 multiplications, 30 fused multiply/add),
* 38 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_5 (FMA-preferring build): in-place radix-5 twiddle butterfly.
 *
 * For every m in [mb, me) the five complex points held at
 * ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..4, are read, points 1..4 are each
 * scaled by a twiddle pair, and the combined results are written back to the
 * same five locations.  Each iteration consumes four reals of W (W[0..3]);
 * the pairs applied to points 2 and 4 are derived from those.
 * W is pre-advanced by 4 * mb and steps by 4 per iteration; ri and ii step
 * by ms.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are project macros; the
 * comments below assume they mean a*b + c and c - a*b -- confirm in the
 * headers.  Under that reading every "scale" below has the form
 *     re = wr*xr + wi*xi,   im = wr*xi - wi*xr.
 *
 * This code is machine generated (see the "Generated by" comment above);
 * statement order and operand order are left exactly as emitted.
 */
static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: ~sin(2*pi/5), ~sqrt(5)/4, ~(sqrt(5) - 1)/2, 1/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
/*
 * Twiddle set-up.  Stored pairs: (T2, T5) = W[0..1], (T8, Ta) = W[2..3].
 * Derived pairs:
 *   (Tb, Tf) = (T2 + i T5) * (T8 + i Ta)
 *   (Tj, Tm) = conj(T2 + i T5) * (T8 + i Ta)
 */
E T2, Ta, T8, T5, Tb, Tm, Tf, Tj, T9, Te;
T2 = W[0];
Ta = W[3];
T8 = W[2];
T9 = T2 * T8;
Te = T2 * Ta;
T5 = W[1];
Tb = FNMS(T5, Ta, T9);
Tm = FNMS(T5, T8, Te);
Tf = FMA(T5, T8, Te);
Tj = FMA(T5, Ta, T9);
{
E T1, TO, T7, Th, Ti, Tz, TB, TL, To, Ts, Tt, TE, TG, TM;
T1 = ri[0];
TO = ii[0];
{
/* Points 1 and 4, scaled by (T2, T5) and (Tb, Tf); Ti / TL are their sums. */
E T3, T4, T6, Ty, Tc, Td, Tg, TA;
T3 = ri[WS(rs, 1)];
T4 = T2 * T3;
T6 = ii[WS(rs, 1)];
Ty = T2 * T6;
Tc = ri[WS(rs, 4)];
Td = Tb * Tc;
Tg = ii[WS(rs, 4)];
TA = Tb * Tg;
T7 = FMA(T5, T6, T4);
Th = FMA(Tf, Tg, Td);
Ti = T7 + Th;
Tz = FNMS(T5, T3, Ty);
TB = FNMS(Tf, Tc, TA);
TL = Tz + TB;
}
{
/* Points 2 and 3, scaled by (Tj, Tm) and (T8, Ta); Tt / TM are their sums. */
E Tk, Tl, Tn, TD, Tp, Tq, Tr, TF;
Tk = ri[WS(rs, 2)];
Tl = Tj * Tk;
Tn = ii[WS(rs, 2)];
TD = Tj * Tn;
Tp = ri[WS(rs, 3)];
Tq = T8 * Tp;
Tr = ii[WS(rs, 3)];
TF = T8 * Tr;
To = FMA(Tm, Tn, Tl);
Ts = FMA(Ta, Tr, Tq);
Tt = To + Ts;
TE = FNMS(Tm, Tk, TD);
TG = FNMS(Ta, Tp, TF);
TM = TE + TG;
}
/* All inputs have been loaded above; the two blocks below only store. */
{
/* Real parts of all five outputs. */
E Tw, Tu, Tv, TI, TK, TC, TH, TJ, Tx;
Tw = Ti - Tt;
Tu = Ti + Tt;
Tv = FNMS(KP250000000, Tu, T1);
TC = Tz - TB;
TH = TE - TG;
TI = FMA(KP618033988, TH, TC);
TK = FNMS(KP618033988, TC, TH);
ri[0] = T1 + Tu;
TJ = FNMS(KP559016994, Tw, Tv);
ri[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
ri[WS(rs, 3)] = FMA(KP951056516, TK, TJ);
Tx = FMA(KP559016994, Tw, Tv);
ri[WS(rs, 4)] = FNMS(KP951056516, TI, Tx);
ri[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
}
{
/* Imaginary parts of all five outputs. */
E TQ, TN, TP, TU, TW, TS, TT, TV, TR;
TQ = TL - TM;
TN = TL + TM;
TP = FNMS(KP250000000, TN, TO);
TS = T7 - Th;
TT = To - Ts;
TU = FMA(KP618033988, TT, TS);
TW = FNMS(KP618033988, TS, TT);
ii[0] = TN + TO;
TV = FNMS(KP559016994, TQ, TP);
ii[WS(rs, 2)] = FMA(KP951056516, TW, TV);
ii[WS(rs, 3)] = FNMS(KP951056516, TW, TV);
TR = FMA(KP559016994, TQ, TP);
ii[WS(rs, 1)] = FNMS(KP951056516, TU, TR);
ii[WS(rs, 4)] = FMA(KP951056516, TU, TR);
}
}
}
}
}
/*
 * Twiddle layout for t2_5: two TW_CEXP entries (last field 1 and 3) followed
 * by a TW_NEXT terminator.  The codelet reads four reals per iteration
 * (W[0..3]), i.e. two reals per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined in the
 * project headers, not here -- confirm before editing.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: radix 5, name "t2_5", the twiddle layout above and
 * &GENUS.  { 14, 10, 30, 0 } repeats the generator's operation count for
 * this variant (14 additions, 10 multiplications, 30 fused multiply/add).
 */
static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 14, 10, 30, 0 }, 0, 0, 0 };
/* Registers the t2_5 codelet and its descriptor with planner p. */
void X(codelet_t2_5) (planner *p) {
X(kdft_dit_register) (p, t2_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
/*
* This function contains 44 FP additions, 32 FP multiplications,
* (or, 30 additions, 18 multiplications, 14 fused multiply/add),
* 37 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_5 (variant generated without -fma): size-5 twiddle codelet that works
 * in place on the split real/imaginary arrays ri and ii.
 *
 * For each m in [mb, me) one loop iteration
 *   1. loads the four twiddle values W[0..3] and forms two further pairs
 *      from their products,
 *   2. multiplies elements 1..4 (at offsets WS(rs, k)) by those pairs,
 *   3. combines the five elements using the constants KP250000000,
 *      KP559016994, KP587785252 and KP951056516 and stores the results back
 *      to ri/ii at the same five offsets.
 * W is first advanced by mb * 4; after every iteration ri and ii advance by
 * ms and W by 4.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; they are presumably a*b + c and c - a*b -- confirm.
 */
static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT m;
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
{
/* Twiddle pairs: (T2, T4) = W[0..1] is applied to element 1 and
 * (T7, T9) = W[2..3] to element 3; the derived pairs (Tb, Tf) and
 * (Tj, Tl) are applied to elements 4 and 2 respectively. */
E T8, Te, Ta, Td;
T2 = W[0];
T4 = W[1];
T7 = W[2];
T9 = W[3];
T8 = T2 * T7;
Te = T4 * T7;
Ta = T4 * T9;
Td = T2 * T9;
Tb = T8 - Ta;
Tl = Td - Te;
Tf = Td + Te;
Tj = T8 + Ta;
}
{
E T1, TI, Ty, TB, TN, TM, TF, TG, TH, Ti, Tr, Ts;
/* Element 0 is used without any twiddle multiplication. */
T1 = ri[0];
TI = ii[0];
{
E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
{
/* Twiddled elements 1 and 3 (real part first, then imaginary part). */
E T3, T5, To, Tp;
T3 = ri[WS(rs, 1)];
T5 = ii[WS(rs, 1)];
T6 = FMA(T2, T3, T4 * T5);
Tw = FNMS(T4, T3, T2 * T5);
To = ri[WS(rs, 3)];
Tp = ii[WS(rs, 3)];
Tq = FMA(T7, To, T9 * Tp);
TA = FNMS(T9, To, T7 * Tp);
}
{
/* Twiddled elements 4 and 2. */
E Tc, Tg, Tk, Tm;
Tc = ri[WS(rs, 4)];
Tg = ii[WS(rs, 4)];
Th = FMA(Tb, Tc, Tf * Tg);
Tx = FNMS(Tf, Tc, Tb * Tg);
Tk = ri[WS(rs, 2)];
Tm = ii[WS(rs, 2)];
Tn = FMA(Tj, Tk, Tl * Tm);
Tz = FNMS(Tl, Tk, Tj * Tm);
}
/* Sums and differences of the pairs (1, 4) and (2, 3). */
Ty = Tw - Tx;
TB = Tz - TA;
TN = Tn - Tq;
TM = T6 - Th;
TF = Tw + Tx;
TG = Tz + TA;
TH = TF + TG;
Ti = T6 + Th;
Tr = Tn + Tq;
Ts = Ti + Tr;
}
/* Output 0 is the plain sum of all five elements. */
ri[0] = T1 + Ts;
ii[0] = TH + TI;
{
/* Real parts of outputs 1..4. */
E TC, TE, Tv, TD, Tt, Tu;
TC = FMA(KP951056516, Ty, KP587785252 * TB);
TE = FNMS(KP587785252, Ty, KP951056516 * TB);
Tt = KP559016994 * (Ti - Tr);
Tu = FNMS(KP250000000, Ts, T1);
Tv = Tt + Tu;
TD = Tu - Tt;
ri[WS(rs, 4)] = Tv - TC;
ri[WS(rs, 3)] = TD + TE;
ri[WS(rs, 1)] = Tv + TC;
ri[WS(rs, 2)] = TD - TE;
}
{
/* Imaginary parts of outputs 1..4. */
E TO, TP, TL, TQ, TJ, TK;
TO = FMA(KP951056516, TM, KP587785252 * TN);
TP = FNMS(KP587785252, TM, KP951056516 * TN);
TJ = KP559016994 * (TF - TG);
TK = FNMS(KP250000000, TH, TI);
TL = TJ + TK;
TQ = TK - TJ;
ii[WS(rs, 1)] = TL - TO;
ii[WS(rs, 3)] = TQ - TP;
ii[WS(rs, 4)] = TO + TL;
ii[WS(rs, 2)] = TP + TQ;
}
}
}
}
}
/*
 * Twiddle program for this codelet: two TW_CEXP entries (last field 1 and 3)
 * followed by a TW_NEXT entry that ends the list.  t2_5 reads exactly four
 * values, W[0..3], per iteration.
 * NOTE(review): presumably the entries request the twiddle factors for
 * element indices 1 and 3 -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: the value 5, the codelet name "t2_5",
 * the twiddle program, the genus, an operation-count tuple and three zero
 * fields.  The tuple { 30, 18, 14, 0 } repeats, in order, the "30 additions,
 * 18 multiplications, 14 fused multiply/add" of the generated header comment
 * of this branch.
 * NOTE(review): the three trailing zeros presumably mean "no fixed rs/vs/ms
 * stride" -- confirm against the ct_desc definition.
 */
static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 30, 18, 14, 0 }, 0, 0, 0 };
/* Registers the non-FMA t2_5 kernel and its descriptor with planner p
 * through X(kdft_dit_register). */
void X(codelet_t2_5) (planner *p) {
X(kdft_dit_register) (p, t2_5, &desc);
}
#endif
File diff suppressed because it is too large Load Diff
+390
View File
@@ -0,0 +1,390 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
/*
* This function contains 74 FP additions, 50 FP multiplications,
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
* 48 stack variables, 1 constants, and 32 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_8 (variant generated with -fma): size-8 twiddle codelet that works in
 * place on the split real/imaginary arrays ri and ii.
 *
 * For each m in [mb, me) one loop iteration
 *   1. loads the six twiddle values W[0..5] and derives four further pairs
 *      from their products,
 *   2. multiplies elements 1..7 (at offsets WS(rs, k)) by those pairs,
 *   3. combines the eight elements, using KP707106781 for the odd-indexed
 *      outputs, and stores the results back to ri/ii at the same offsets.
 * W is first advanced by mb * 6; after every iteration ri and ii advance by
 * ms and W by 6.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; they are presumably a*b + c and c - a*b -- confirm.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
{
/* Twiddle pairs and the element each one is applied to below:
 *   (T2, T5) = W[0..1] -> element 1     (T3, T6) = W[2..3] -> element 3
 *   (Tl, Tn) = W[4..5] -> element 7     (Tf, Ti)           -> element 2
 *   (T7, Tb)           -> element 4     (TC, TG)           -> element 5
 *   (To, Ts)           -> element 6 */
E T4, Tm, Tr, Ta, TB, TF;
T2 = W[0];
T3 = W[2];
T4 = T2 * T3;
Tl = W[4];
Tm = T2 * Tl;
Tn = W[5];
Tr = T2 * Tn;
T5 = W[1];
T6 = W[3];
Ta = T2 * T6;
Tf = FMA(T5, T6, T4);
T7 = FNMS(T5, T6, T4);
Ts = FNMS(T5, Tl, Tr);
Tb = FMA(T5, T3, Ta);
To = FMA(T5, Tn, Tm);
TB = Tf * Tl;
TF = Tf * Tn;
Ti = FNMS(T5, T3, Ta);
TC = FMA(Ti, Tn, TB);
TG = FNMS(Ti, Tl, TF);
}
{
E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
E TI, T11, T13, T15, T16;
/* Element 0 is used without any twiddle multiplication. */
T1 = ri[0];
T1s = ii[0];
{
/* Twiddled element 4: real part Td, imaginary part T1r. */
E T8, T9, Tc, T1q;
T8 = ri[WS(rs, 4)];
T9 = T7 * T8;
Tc = ii[WS(rs, 4)];
T1q = T7 * Tc;
Td = FMA(Tb, Tc, T9);
T1r = FNMS(Tb, T8, T1q);
}
{
/* Twiddled element 6: real part Tu, imaginary part TY. */
E Tp, Tq, Tt, TX;
Tp = ri[WS(rs, 6)];
Tq = To * Tp;
Tt = ii[WS(rs, 6)];
TX = To * Tt;
Tu = FMA(Ts, Tt, Tq);
TY = FNMS(Ts, Tp, TX);
}
{
/* Twiddled element 2: real part Tk, imaginary part TW. */
E Tg, Th, Tj, TV;
Tg = ri[WS(rs, 2)];
Th = Tf * Tg;
Tj = ii[WS(rs, 2)];
TV = Tf * Tj;
Tk = FMA(Ti, Tj, Th);
TW = FNMS(Ti, Tg, TV);
}
{
/* Twiddled elements 7 and 3 and their differences (T18 real, T1d imaginary). */
E TK, TL, TM, T19, TO, TP, TQ, T1b;
TK = ri[WS(rs, 7)];
TL = Tl * TK;
TM = ii[WS(rs, 7)];
T19 = Tl * TM;
TO = ri[WS(rs, 3)];
TP = T3 * TO;
TQ = ii[WS(rs, 3)];
T1b = T3 * TQ;
TN = FMA(Tn, TM, TL);
TR = FMA(T6, TQ, TP);
T18 = TN - TR;
T1a = FNMS(Tn, TK, T19);
T1c = FNMS(T6, TO, T1b);
T1d = T1a - T1c;
}
{
/* Twiddled elements 1 and 5 and their differences (T11 real, T16 imaginary). */
E Tx, Ty, Tz, T12, TD, TE, TH, T14;
Tx = ri[WS(rs, 1)];
Ty = T2 * Tx;
Tz = ii[WS(rs, 1)];
T12 = T2 * Tz;
TD = ri[WS(rs, 5)];
TE = TC * TD;
TH = ii[WS(rs, 5)];
T14 = TC * TH;
TA = FMA(T5, Tz, Ty);
TI = FMA(TG, TH, TE);
T11 = TA - TI;
T13 = FNMS(T5, Tx, T12);
T15 = FNMS(TG, TD, T14);
T16 = T13 - T15;
}
{
/* Outputs 1, 3, 5 and 7: built from the differences computed above. */
E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
{
E TU, TZ, T1x, T1y;
TU = T1 - Td;
TZ = TW - TY;
T10 = TU + TZ;
T1g = TU - TZ;
T1x = T1s - T1r;
T1y = Tk - Tu;
T1z = T1x - T1y;
T1B = T1y + T1x;
}
{
E T17, T1e, T1h, T1i;
T17 = T11 + T16;
T1e = T18 - T1d;
T1f = T17 + T1e;
T1C = T1e - T17;
T1h = T16 - T11;
T1i = T18 + T1d;
T1j = T1h - T1i;
T1A = T1h + T1i;
}
ri[WS(rs, 5)] = FNMS(KP707106781, T1f, T10);
ii[WS(rs, 5)] = FNMS(KP707106781, T1A, T1z);
ri[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
ii[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
ri[WS(rs, 7)] = FNMS(KP707106781, T1j, T1g);
ii[WS(rs, 7)] = FNMS(KP707106781, T1C, T1B);
ri[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
ii[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
}
{
/* Outputs 0, 2, 4 and 6: built from sums only, no constant needed. */
E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
{
E Te, Tv, T1p, T1t;
Te = T1 + Td;
Tv = Tk + Tu;
Tw = Te + Tv;
T1k = Te - Tv;
T1p = TW + TY;
T1t = T1r + T1s;
T1u = T1p + T1t;
T1w = T1t - T1p;
}
{
E TJ, TS, T1l, T1m;
TJ = TA + TI;
TS = TN + TR;
TT = TJ + TS;
T1v = TS - TJ;
T1l = T13 + T15;
T1m = T1a + T1c;
T1n = T1l - T1m;
T1o = T1l + T1m;
}
ri[WS(rs, 4)] = Tw - TT;
ii[WS(rs, 4)] = T1u - T1o;
ri[0] = Tw + TT;
ii[0] = T1o + T1u;
ri[WS(rs, 6)] = T1k - T1n;
ii[WS(rs, 6)] = T1w - T1v;
ri[WS(rs, 2)] = T1k + T1n;
ii[WS(rs, 2)] = T1v + T1w;
}
}
}
}
}
/*
 * Twiddle program for this codelet: three TW_CEXP entries (last field 1, 3
 * and 7) followed by a TW_NEXT entry that ends the list.  t2_8 reads exactly
 * six values, W[0..5], per iteration.
 * NOTE(review): presumably the entries request the twiddle factors for
 * element indices 1, 3 and 7 -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: the value 8, the codelet name "t2_8",
 * the twiddle program, the genus, an operation-count tuple and three zero
 * fields.  The tuple { 44, 20, 30, 0 } repeats, in order, the "44 additions,
 * 20 multiplications, 30 fused multiply/add" of the generated header comment
 * of this branch.
 * NOTE(review): the three trailing zeros presumably mean "no fixed rs/vs/ms
 * stride" -- confirm against the ct_desc definition.
 */
static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, { 44, 20, 30, 0 }, 0, 0, 0 };
/* Registers the FMA t2_8 kernel and its descriptor with planner p through
 * X(kdft_dit_register). */
void X(codelet_t2_8) (planner *p) {
X(kdft_dit_register) (p, t2_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
/*
* This function contains 74 FP additions, 44 FP multiplications,
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
* 42 stack variables, 1 constants, and 32 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_8 (variant generated without -fma): size-8 twiddle codelet that works
 * in place on the split real/imaginary arrays ri and ii.
 *
 * For each m in [mb, me) one loop iteration
 *   1. loads the six twiddle values W[0..5] and derives four further pairs
 *      from their products,
 *   2. multiplies elements 1..7 (at offsets WS(rs, k)) by those pairs,
 *   3. combines the eight elements, using KP707106781 for the odd-indexed
 *      outputs, and stores the results back to ri/ii at the same offsets.
 * W is first advanced by mb * 6; after every iteration ri and ii advance by
 * ms and W by 6.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; they are presumably a*b + c and c - a*b -- confirm.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
{
/* Twiddle pairs and the element each one is applied to below:
 *   (T2, T5) = W[0..1] -> element 1     (T3, T6) = W[2..3] -> element 3
 *   (Tl, Tm) = W[4..5] -> element 7     (Tg, Ti)           -> element 2
 *   (T8, Tc)           -> element 4     (Tx, Tz)           -> element 5
 *   (Tn, Tp)           -> element 6 */
E T4, Tb, T7, Ta;
T2 = W[0];
T5 = W[1];
T3 = W[2];
T6 = W[3];
T4 = T2 * T3;
Tb = T5 * T3;
T7 = T5 * T6;
Ta = T2 * T6;
T8 = T4 - T7;
Tc = Ta + Tb;
Tg = T4 + T7;
Ti = Ta - Tb;
Tl = W[4];
Tm = W[5];
Tn = FMA(T2, Tl, T5 * Tm);
Tz = FNMS(Ti, Tl, Tg * Tm);
Tp = FNMS(T5, Tl, T2 * Tm);
Tx = FMA(Tg, Tl, Ti * Tm);
}
{
E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
E TT;
{
/* Element 0 (untwiddled) combined with twiddled element 4. */
E T1, T1c, Te, T1b, T9, Td;
T1 = ri[0];
T1c = ii[0];
T9 = ri[WS(rs, 4)];
Td = ii[WS(rs, 4)];
Te = FMA(T8, T9, Tc * Td);
T1b = FNMS(Tc, T9, T8 * Td);
Tf = T1 + Te;
T1i = T1c - T1b;
TL = T1 - Te;
T1d = T1b + T1c;
}
{
/* Twiddled elements 7 and 3, then their sums and differences. */
E TF, TW, TI, TX;
{
E TD, TE, TG, TH;
TD = ri[WS(rs, 7)];
TE = ii[WS(rs, 7)];
TF = FMA(Tl, TD, Tm * TE);
TW = FNMS(Tm, TD, Tl * TE);
TG = ri[WS(rs, 3)];
TH = ii[WS(rs, 3)];
TI = FMA(T3, TG, T6 * TH);
TX = FNMS(T6, TG, T3 * TH);
}
TJ = TF + TI;
T17 = TW + TX;
TV = TF - TI;
TY = TW - TX;
}
{
/* Twiddled elements 2 and 6, then their sums and differences. */
E Tk, TM, Tr, TN;
{
E Th, Tj, To, Tq;
Th = ri[WS(rs, 2)];
Tj = ii[WS(rs, 2)];
Tk = FMA(Tg, Th, Ti * Tj);
TM = FNMS(Ti, Th, Tg * Tj);
To = ri[WS(rs, 6)];
Tq = ii[WS(rs, 6)];
Tr = FMA(Tn, To, Tp * Tq);
TN = FNMS(Tp, To, Tn * Tq);
}
Ts = Tk + Tr;
T1j = Tk - Tr;
TO = TM - TN;
T1a = TM + TN;
}
{
/* Twiddled elements 1 and 5, then their sums and differences. */
E Tw, TR, TB, TS;
{
E Tu, Tv, Ty, TA;
Tu = ri[WS(rs, 1)];
Tv = ii[WS(rs, 1)];
Tw = FMA(T2, Tu, T5 * Tv);
TR = FNMS(T5, Tu, T2 * Tv);
Ty = ri[WS(rs, 5)];
TA = ii[WS(rs, 5)];
TB = FMA(Tx, Ty, Tz * TA);
TS = FNMS(Tz, Ty, Tx * TA);
}
TC = Tw + TB;
T16 = TR + TS;
TQ = Tw - TB;
TT = TR - TS;
}
{
/* Outputs 0, 2, 4 and 6: built from the sums, no constant needed. */
E Tt, TK, T1f, T1g;
Tt = Tf + Ts;
TK = TC + TJ;
ri[WS(rs, 4)] = Tt - TK;
ri[0] = Tt + TK;
{
E T19, T1e, T15, T18;
T19 = T16 + T17;
T1e = T1a + T1d;
ii[0] = T19 + T1e;
ii[WS(rs, 4)] = T1e - T19;
T15 = Tf - Ts;
T18 = T16 - T17;
ri[WS(rs, 6)] = T15 - T18;
ri[WS(rs, 2)] = T15 + T18;
}
T1f = TJ - TC;
T1g = T1d - T1a;
ii[WS(rs, 2)] = T1f + T1g;
ii[WS(rs, 6)] = T1g - T1f;
/* Outputs 1, 3, 5 and 7: built from the differences, scaled by KP707106781. */
{
E T11, T1k, T14, T1h, T12, T13;
T11 = TL - TO;
T1k = T1i - T1j;
T12 = TT - TQ;
T13 = TV + TY;
T14 = KP707106781 * (T12 - T13);
T1h = KP707106781 * (T12 + T13);
ri[WS(rs, 7)] = T11 - T14;
ii[WS(rs, 5)] = T1k - T1h;
ri[WS(rs, 3)] = T11 + T14;
ii[WS(rs, 1)] = T1h + T1k;
}
{
E TP, T1m, T10, T1l, TU, TZ;
TP = TL + TO;
T1m = T1j + T1i;
TU = TQ + TT;
TZ = TV - TY;
T10 = KP707106781 * (TU + TZ);
T1l = KP707106781 * (TZ - TU);
ri[WS(rs, 5)] = TP - T10;
ii[WS(rs, 7)] = T1m - T1l;
ri[WS(rs, 1)] = TP + T10;
ii[WS(rs, 3)] = T1l + T1m;
}
}
}
}
}
}
/*
 * Twiddle program for this codelet: three TW_CEXP entries (last field 1, 3
 * and 7) followed by a TW_NEXT entry that ends the list.  t2_8 reads exactly
 * six values, W[0..5], per iteration.
 * NOTE(review): presumably the entries request the twiddle factors for
 * element indices 1, 3 and 7 -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: the value 8, the codelet name "t2_8",
 * the twiddle program, the genus, an operation-count tuple and three zero
 * fields.  The tuple { 56, 26, 18, 0 } repeats, in order, the "56 additions,
 * 26 multiplications, 18 fused multiply/add" of the generated header comment
 * of this branch.
 * NOTE(review): the three trailing zeros presumably mean "no fixed rs/vs/ms
 * stride" -- confirm against the ct_desc definition.
 */
static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, { 56, 26, 18, 0 }, 0, 0, 0 };
/* Registers the non-FMA t2_8 kernel and its descriptor with planner p
 * through X(kdft_dit_register). */
void X(codelet_t2_8) (planner *p) {
X(kdft_dit_register) (p, t2_8, &desc);
}
#endif
+1
View File
@@ -0,0 +1 @@
#include "dft/scalar/t.h" /* same stuff, no need to duplicate */
+39
View File
@@ -0,0 +1,39 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/codelet-dft.h"
#include "dft/scalar/n.h"
/*
 * Applicability predicate for the scalar n-codelets.
 *
 * The descriptor d may fix any of its fields is, os, ivs and ovs to a
 * particular value; a field left at zero accepts every value.  The codelet
 * is usable only if each non-zero field equals the argument of the same
 * name.
 *
 * Returns 1 when the codelet may be used and 0 otherwise.  The array
 * pointers, vl and plnr are not consulted.
 */
static int okp(const kdft_desc *d,
               const R *ri, const R *ii,
               const R *ro, const R *io,
               INT is, INT os, INT vl, INT ivs, INT ovs,
               const planner *plnr)
{
     UNUSED(ri);
     UNUSED(ii);
     UNUSED(ro);
     UNUSED(io);
     UNUSED(vl);
     UNUSED(plnr);

     /* Reject on the first fixed field that disagrees with the request. */
     if (d->is && d->is != is)
          return 0;
     if (d->os && d->os != os)
          return 0;
     if (d->ivs && d->ivs != ivs)
          return 0;
     if (d->ovs && d->ovs != ovs)
          return 0;

     return 1;
}
/*
 * Genus object referenced by the codelet descriptors as &GENUS: the
 * applicability predicate okp followed by the value 1.
 * NOTE(review): the second field is presumably the vector length (1 for
 * scalar code) -- confirm against the kdft_genus definition.
 */
const kdft_genus GENUS = { okp, 1 };
+23
View File
@@ -0,0 +1,23 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Codelets name their genus through the GENUS macro, which expands to the
 * symbol X(dft_n_genus).  X() is defined outside this header; presumably it
 * adds the library's symbol prefix -- confirm. */
#define GENUS X(dft_n_genus)
/* Declaration only; the object itself is defined in a separate source file. */
extern const kdft_genus GENUS;
+1
View File
@@ -0,0 +1 @@
#include "dft/scalar/t.h" /* same stuff, no need to duplicate */
+37
View File
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/codelet-dft.h"
#include "dft/scalar/t.h"
/*
 * Applicability predicate for the scalar twiddle (t) codelets.
 *
 * The descriptor d may fix any of its fields rs, vs and ms to a particular
 * value; a field left at zero accepts every value.  The codelet is usable
 * only if each non-zero field equals the argument of the same name.
 *
 * Returns 1 when the codelet may be used and 0 otherwise.  The array
 * pointers, m, mb, me and plnr are not consulted.
 */
static int okp(const ct_desc *d,
               const R *rio, const R *iio,
               INT rs, INT vs, INT m, INT mb, INT me, INT ms,
               const planner *plnr)
{
     UNUSED(rio);
     UNUSED(iio);
     UNUSED(m);
     UNUSED(mb);
     UNUSED(me);
     UNUSED(plnr);

     /* Reject on the first fixed field that disagrees with the request. */
     if (d->rs && d->rs != rs)
          return 0;
     if (d->vs && d->vs != vs)
          return 0;
     if (d->ms && d->ms != ms)
          return 0;

     return 1;
}
/*
 * Genus object referenced by the twiddle-codelet descriptors as &GENUS: the
 * applicability predicate okp followed by the value 1.
 * NOTE(review): the second field is presumably the vector length (1 for
 * scalar code) -- confirm against the ct_genus definition.
 */
const ct_genus GENUS = { okp, 1 };
+23
View File
@@ -0,0 +1,23 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Twiddle codelets name their genus through the GENUS macro, which expands
 * to the symbol X(dft_t_genus).  X() is defined outside this header;
 * presumably it adds the library's symbol prefix -- confirm. */
#define GENUS X(dft_t_genus)
/* Declaration only; the object itself is defined in a separate source file. */
extern const ct_genus GENUS;
+4
View File
@@ -0,0 +1,4 @@
# Add the top of the source tree to the preprocessor include path.
AM_CPPFLAGS = -I $(top_srcdir)
# Subdirectories that the recursive targets descend into, in this order.
SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256
# Headers and makefile fragments kept in this directory that must be shipped
# in the distribution even though no target here builds from them.
EXTRA_DIST = n1b.h n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h \
t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk
+666
View File
@@ -0,0 +1,666 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
VPATH = @srcdir@
# Shell test meant to be embedded in other snippets.  It fails when MAKELEVEL
# is empty; otherwise it succeeds if MAKE_HOST is set, or if both
# MAKE_VERSION and CURDIR are set, and fails in every remaining case.
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
# Shell snippet that succeeds when make was started with a given
# single-letter option.  The caller sets the shell variable target_option to
# exactly one character; anything else prints an internal error and exits 1.
# The flags examined are $MFLAGS when am__is_gnu_make succeeds, otherwise
# $MAKEFLAGS with backslash-escaped whitespace removed.  While scanning,
# words containing '=' or starting with '--' are ignored, the arguments of
# -I, -O and -l are stripped from the word (or skipped when given as the next
# word), and the word following -d, -E, -D, -m, -J or -T is skipped.  The
# snippet's exit status is that of the final "test $has_opt = yes".
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
# Succeeds when make is running with the 'n' option.
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
# Succeeds when make is running with the 'k' option.
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
subdir = dft/simd
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
SOURCES =
DIST_SOURCES =
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
ctags-recursive dvi-recursive html-recursive info-recursive \
install-data-recursive install-dvi-recursive \
install-exec-recursive install-html-recursive \
install-info-recursive install-pdf-recursive \
install-ps-recursive install-recursive installcheck-recursive \
installdirs-recursive pdf-recursive ps-recursive \
tags-recursive uninstall-recursive
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
distclean-recursive maintainer-clean-recursive
am__recursive_targets = \
$(RECURSIVE_TARGETS) \
$(RECURSIVE_CLEAN_TARGETS) \
$(am__extra_recursive_targets)
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
distdir distdir-am
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
DIST_SUBDIRS = $(SUBDIRS)
am__DIST_COMMON = $(srcdir)/Makefile.in
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
# Shell snippet that rewrites a path so that it is valid from inside another
# directory.  Inputs are the shell variables dir1 and dir2; the result is
# left in reldir (dir1 is consumed, dir2 is modified).  dir1 is walked one
# leading component at a time, with dir0 tracking the directory reached so
# far (starting at `pwd`):
#   - "."  components are ignored;
#   - ".." prepends the last component of dir0 to dir2 and moves dir0 up;
#   - any other component is dropped from the front of dir2 when dir2 starts
#     with the same component, otherwise "../" is prepended to dir2.
# NOTE(review): the net effect appears to be "dir2 expressed relative to
# dir1", which is how the distdir rule uses reldir -- confirm.
am__relativize = \
dir0=`pwd`; \
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
sed_rest='s,^[^/]*/*,,'; \
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
sed_butlast='s,/*[^/]*$$,,'; \
while test -n "$$dir1"; do \
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
if test "$$first" != "."; then \
if test "$$first" = ".."; then \
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
else \
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
if test "$$first2" = "$$first"; then \
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
else \
dir2="../$$dir2"; \
fi; \
dir0="$$dir0"/"$$first"; \
fi; \
fi; \
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
done; \
reldir="$$dir2"
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
AM_CPPFLAGS = -I $(top_srcdir)
SUBDIRS = common sse2 avx avx-128-fma avx2 avx2-128 avx512 kcvi altivec vsx neon generic-simd128 generic-simd256
EXTRA_DIST = n1b.h n1f.h n2b.h n2f.h n2s.h q1b.h q1f.h t1b.h t1bu.h \
t1f.h t1fu.h t2b.h t2f.h t3b.h t3f.h ts.h codlist.mk simd.mk
all: all-recursive
.SUFFIXES:
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/simd/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu dft/simd/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
esac;
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
# This directory's subdirectories are mostly independent; you can cd
# into them and run 'make' without going through this Makefile.
# To change the values of 'make' variables: instead of editing Makefiles,
# (1) if the variable is set in 'config.status', edit 'config.status'
# (which will cause the Makefiles to be regenerated when you run 'make');
# (2) otherwise, pass the desired values on the 'make' command line.
# Shared recipe for every <name>-recursive target.  It strips "-recursive"
# from $@ and runs the resulting target in each listed subdirectory
# (DIST_SUBDIRS for distclean-* and maintainer-clean-*, SUBDIRS otherwise).
# A "." entry is built as <name>-am in the current directory at its position
# in the list; if "." never appears, <name>-am is built here after the loop.
# When make runs with -k a failing subdirectory only sets fail=yes and the
# rule fails at the end; without -k the first failure exits immediately.
$(am__recursive_targets):
@fail=; \
if $(am__make_keepgoing); then \
failcom='fail=yes'; \
else \
failcom='exit 1'; \
fi; \
dot_seen=no; \
target=`echo $@ | sed s/-recursive//`; \
case "$@" in \
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
*) list='$(SUBDIRS)' ;; \
esac; \
for subdir in $$list; do \
echo "Making $$target in $$subdir"; \
if test "$$subdir" = "."; then \
dot_seen=yes; \
local_target="$$target-am"; \
else \
local_target="$$target"; \
fi; \
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|| eval $$failcom; \
done; \
if test "$$dot_seen" = "no"; then \
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
fi; test -z "$$fail"
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-recursive
TAGS: tags
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
include_option=--etags-include; \
empty_fix=.; \
else \
include_option=--include; \
empty_fix=; \
fi; \
list='$(SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
test ! -f $$subdir/TAGS || \
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
fi; \
done; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-recursive
CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-recursive
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) distdir-am
distdir-am: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
$(am__make_dryrun) \
|| test -d "$(distdir)/$$subdir" \
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|| exit 1; \
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
$(am__relativize); \
new_distdir=$$reldir; \
dir1=$$subdir; dir2="$(top_distdir)"; \
$(am__relativize); \
new_top_distdir=$$reldir; \
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
($(am__cd) $$subdir && \
$(MAKE) $(AM_MAKEFLAGS) \
top_distdir="$$new_top_distdir" \
distdir="$$new_distdir" \
am__remove_distdir=: \
am__skip_length_check=: \
am__skip_mode_fix=: \
distdir) \
|| exit 1; \
fi; \
done
check-am: all-am
check: check-recursive
all-am: Makefile
installdirs: installdirs-recursive
installdirs-am:
install: install-recursive
install-exec: install-exec-recursive
install-data: install-data-recursive
uninstall: uninstall-recursive
# Install step for this directory: once all-am is up to date, re-invoke make
# for the install-exec-am and install-data-am targets.  The leading "@" keeps
# the sub-make command line from being echoed.
# The recipe line must begin with a hard TAB, otherwise make rejects the
# file with "missing separator".
install-am: all-am
	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am

# installcheck depends only on its "-recursive" counterpart.
installcheck: installcheck-recursive
# Install with stripping enabled: re-invoke "make install" with
# INSTALL_PROGRAM and install_sh_PROGRAM both replaced by
# $(INSTALL_STRIP_PROGRAM) and INSTALL_STRIP_FLAG set to -s.
# When $(STRIP) is non-empty, its value is additionally handed over as
# STRIPPROG through INSTALL_PROGRAM_ENV.
# The whole recipe is a single shell command (backslash continuations), and
# it must begin with a hard TAB or make stops with "missing separator".
install-strip:
	if test -z '$(STRIP)'; then \
	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
	      install; \
	else \
	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
	fi
# Generic cleanup hooks.  mostlyclean-generic and clean-generic have no
# prerequisites and no recipe in this directory.
mostlyclean-generic:

clean-generic:

# Remove the files named in CONFIG_CLEAN_FILES (when that list is non-empty)
# and, unless srcdir is ".", the files named in CONFIG_CLEAN_VPATH_FILES.
# The leading "-" tells make to ignore a failing command.
# Recipe lines must begin with a hard TAB.
distclean-generic:
	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)

# Only prints a two-line notice; it deletes nothing itself.
maintainer-clean-generic:
	@echo "This command is intended for maintainers to use"
	@echo "it deletes files that may require special tools to rebuild."
# clean depends only on clean-recursive; the work for this directory is the
# clean-am chain (generic, libtool and mostlyclean steps).
clean: clean-recursive

clean-am: clean-generic clean-libtool mostlyclean-am

# After distclean-recursive has finished, also delete the Makefile itself.
# "-" ignores a failure of rm.  The recipe line must begin with a hard TAB.
distclean: distclean-recursive
	-rm -f Makefile

distclean-am: clean-am distclean-generic distclean-tags
# Documentation-format and install hook targets.
# Each public target in this group (dvi, html, info, install-dvi,
# install-html, install-info, install-pdf, install-ps) depends only on the
# "-recursive" target of the same name.
# Every "-am" target in this group is empty: no prerequisites and no recipe.
# install-man is empty as well and has no "-recursive" prerequisite here.
dvi: dvi-recursive
dvi-am:
html: html-recursive
html-am:
info: info-recursive
info-am:
install-data-am:
install-dvi: install-dvi-recursive
install-dvi-am:
install-exec-am:
install-html: install-html-recursive
install-html-am:
install-info: install-info-recursive
install-info-am:
install-man:
install-pdf: install-pdf-recursive
install-pdf-am:
install-ps: install-ps-recursive
install-ps-am:
installcheck-am:
# After maintainer-clean-recursive has finished, also delete the Makefile.
# "-" ignores a failure of rm.  The recipe line must begin with a hard TAB,
# otherwise make rejects the file with "missing separator".
maintainer-clean: maintainer-clean-recursive
	-rm -f Makefile

# Per-directory part: everything distclean-am does, plus the
# maintainer-clean-generic step.
maintainer-clean-am: distclean-am maintainer-clean-generic
# mostlyclean depends only on mostlyclean-recursive; the per-directory part
# (mostlyclean-am) combines the generic and libtool mostlyclean steps.
mostlyclean: mostlyclean-recursive
mostlyclean-am: mostlyclean-generic mostlyclean-libtool
# pdf and ps depend only on their "-recursive" targets; pdf-am, ps-am and
# uninstall-am are empty (no prerequisites, no recipe).
pdf: pdf-recursive
pdf-am:
ps: ps-recursive
ps-am:
uninstall-am:
# Special targets.
# .MAKE lists the recursive targets plus install-am and install-strip, whose
# recipes in this file re-invoke $(MAKE).
# NOTE(review): $(am__recursive_targets) is defined outside this section.
.MAKE: $(am__recursive_targets) install-am install-strip
# .PHONY declares the standard targets as commands rather than files, so a
# file that happens to share one of these names cannot make the target look
# up to date.
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \
check-am clean clean-generic clean-libtool cscopelist-am ctags \
ctags-am distclean distclean-generic distclean-libtool \
distclean-tags distdir dvi dvi-am html html-am info info-am \
install install-am install-data install-data-am install-dvi \
install-dvi-am install-exec install-exec-am install-html \
install-html-am install-info install-info-am install-man \
install-pdf install-pdf-am install-ps install-ps-am \
install-strip installcheck installcheck-am installdirs \
installdirs-am maintainer-clean maintainer-clean-generic \
mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \
ps ps-am tags tags-am uninstall uninstall-am
# .PRECIOUS asks make not to delete Makefile if make is interrupted while
# the rule that rebuilds it is running.
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:
+13
View File
@@ -0,0 +1,13 @@
# Automake input for the AltiVec variant of the DFT SIMD codelet library.
# Compile the sources of this directory with the AltiVec-specific C flags.
AM_CFLAGS = $(ALTIVEC_CFLAGS)
# SIMD support header associated with this directory.
# NOTE(review): presumably consumed by the included simd.mk - confirm.
SIMD_HEADER=simd-support/simd-altivec.h
# Shared fragments from dft/simd.
# NOTE(review): their contents are not visible here; presumably they provide
# the codelet list and the EXTRA_DIST value used below - confirm.
include $(top_srcdir)/dft/simd/codlist.mk
include $(top_srcdir)/dft/simd/simd.mk
# The library is only declared when the HAVE_ALTIVEC automake conditional
# is true.
if HAVE_ALTIVEC
# Every file listed in EXTRA_DIST is treated as a built source and is also
# used as the source list of the convenience (noinst) libtool library.
BUILT_SOURCES = $(EXTRA_DIST)
noinst_LTLIBRARIES = libdft_altivec_codelets.la
libdft_altivec_codelets_la_SOURCES = $(BUILT_SOURCES)
endif
File diff suppressed because it is too large Load Diff
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/codlist.c"
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/genus.c"
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/n1bv_10.c"
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/n1bv_11.c"
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/n1bv_12.c"
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/n1bv_128.c"
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/n1bv_13.c"
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/n1bv_14.c"
+3
View File
@@ -0,0 +1,3 @@
/* Generated automatically. DO NOT EDIT! */
#define SIMD_HEADER "simd-support/simd-altivec.h"
#include "../common/n1bv_15.c"

Some files were not shown because too many files have changed in this diff Show More