Updates
This commit is contained in:
15
fftw-3.3.10/rdft/Makefile.am
Normal file
15
fftw-3.3.10/rdft/Makefile.am
Normal file
@@ -0,0 +1,15 @@
|
||||
# Put the top-level source directory on the preprocessor include path.
# automake adds AM_CPPFLAGS to every C compile command in this directory.
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
# Subdirectories make recurses into from here. "." is not listed, so the
# generated recursion rule builds these first and this directory last.
SUBDIRS = scalar simd
|
||||
|
||||
# Build librdft.la with libtool; the noinst_ prefix means it is never
# installed (an automake "convenience library").
noinst_LTLIBRARIES = librdft.la
|
||||
|
||||
# Helper list of sources (plus the header ct-hc2c.h) that is appended to
# librdft_la_SOURCES through $(RDFT2); it is not a target on its own.
# NOTE(review): presumably the files implementing the "rdft2" problem type,
# going by the file names -- confirm against the sources.
RDFT2 = buffered2.c direct2.c nop2.c rank0-rdft2.c rank-geq2-rdft2.c \
|
||||
plan2.c problem2.c solve2.c vrank-geq1-rdft2.c rdft2-rdft.c \
|
||||
rdft2-tensor-max-index.c rdft2-inplace-strides.c rdft2-strides.c \
|
||||
khc2c.c ct-hc2c.h ct-hc2c.c ct-hc2c-direct.c
|
||||
|
||||
# Everything that goes into librdft.la: the files named here plus the RDFT2
# group. The headers (hc2hc.h, codelet-rdft.h, rdft.h) are listed so they are
# shipped with the distribution; only the .c files are compiled to objects.
librdft_la_SOURCES = hc2hc.h hc2hc.c dft-r2hc.c dht-r2hc.c dht-rader.c \
|
||||
buffered.c codelet-rdft.h conf.c direct-r2r.c direct-r2c.c generic.c \
|
||||
hc2hc-direct.c hc2hc-generic.c khc2hc.c kr2c.c kr2r.c indirect.c nop.c \
|
||||
plan.c problem.c rank0.c rank-geq2.c rdft.h rdft-dht.c solve.c \
|
||||
vrank-geq1.c vrank3-transpose.c $(RDFT2)
|
||||
910
fftw-3.3.10/rdft/Makefile.in
Normal file
910
fftw-3.3.10/rdft/Makefile.in
Normal file
@@ -0,0 +1,910 @@
|
||||
# Makefile.in generated by automake 1.16.3 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
|
||||
VPATH = @srcdir@
|
||||
# Shell snippet used as a condition: succeeds only when the running make
# looks like GNU make. It fails when MAKELEVEL is empty; otherwise it
# succeeds if MAKE_HOST is set, or if both MAKE_VERSION and CURDIR are set;
# anything else fails.
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
# Subshell test: true when the make flags contain the option letter "n"
# (dry run), as decided by am__make_running_with_option.
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
# Same test for the option letter "k" (keep going after errors).
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
subdir = rdft
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
|
||||
$(top_srcdir)/m4/acx_pthread.m4 \
|
||||
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
|
||||
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
|
||||
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_version.m4 \
|
||||
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
|
||||
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
|
||||
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
LTLIBRARIES = $(noinst_LTLIBRARIES)
|
||||
librdft_la_LIBADD =
|
||||
am__objects_1 = buffered2.lo direct2.lo nop2.lo rank0-rdft2.lo \
|
||||
rank-geq2-rdft2.lo plan2.lo problem2.lo solve2.lo \
|
||||
vrank-geq1-rdft2.lo rdft2-rdft.lo rdft2-tensor-max-index.lo \
|
||||
rdft2-inplace-strides.lo rdft2-strides.lo khc2c.lo ct-hc2c.lo \
|
||||
ct-hc2c-direct.lo
|
||||
am_librdft_la_OBJECTS = hc2hc.lo dft-r2hc.lo dht-r2hc.lo dht-rader.lo \
|
||||
buffered.lo conf.lo direct-r2r.lo direct-r2c.lo generic.lo \
|
||||
hc2hc-direct.lo hc2hc-generic.lo khc2hc.lo kr2c.lo kr2r.lo \
|
||||
indirect.lo nop.lo plan.lo problem.lo rank0.lo rank-geq2.lo \
|
||||
rdft-dht.lo solve.lo vrank-geq1.lo vrank3-transpose.lo \
|
||||
$(am__objects_1)
|
||||
librdft_la_OBJECTS = $(am_librdft_la_OBJECTS)
|
||||
AM_V_lt = $(am__v_lt_@AM_V@)
|
||||
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
|
||||
am__v_lt_0 = --silent
|
||||
am__v_lt_1 =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
|
||||
depcomp = $(SHELL) $(top_srcdir)/depcomp
|
||||
am__maybe_remake_depfiles = depfiles
|
||||
am__depfiles_remade = ./$(DEPDIR)/buffered.Plo \
|
||||
./$(DEPDIR)/buffered2.Plo ./$(DEPDIR)/conf.Plo \
|
||||
./$(DEPDIR)/ct-hc2c-direct.Plo ./$(DEPDIR)/ct-hc2c.Plo \
|
||||
./$(DEPDIR)/dft-r2hc.Plo ./$(DEPDIR)/dht-r2hc.Plo \
|
||||
./$(DEPDIR)/dht-rader.Plo ./$(DEPDIR)/direct-r2c.Plo \
|
||||
./$(DEPDIR)/direct-r2r.Plo ./$(DEPDIR)/direct2.Plo \
|
||||
./$(DEPDIR)/generic.Plo ./$(DEPDIR)/hc2hc-direct.Plo \
|
||||
./$(DEPDIR)/hc2hc-generic.Plo ./$(DEPDIR)/hc2hc.Plo \
|
||||
./$(DEPDIR)/indirect.Plo ./$(DEPDIR)/khc2c.Plo \
|
||||
./$(DEPDIR)/khc2hc.Plo ./$(DEPDIR)/kr2c.Plo \
|
||||
./$(DEPDIR)/kr2r.Plo ./$(DEPDIR)/nop.Plo ./$(DEPDIR)/nop2.Plo \
|
||||
./$(DEPDIR)/plan.Plo ./$(DEPDIR)/plan2.Plo \
|
||||
./$(DEPDIR)/problem.Plo ./$(DEPDIR)/problem2.Plo \
|
||||
./$(DEPDIR)/rank-geq2-rdft2.Plo ./$(DEPDIR)/rank-geq2.Plo \
|
||||
./$(DEPDIR)/rank0-rdft2.Plo ./$(DEPDIR)/rank0.Plo \
|
||||
./$(DEPDIR)/rdft-dht.Plo ./$(DEPDIR)/rdft2-inplace-strides.Plo \
|
||||
./$(DEPDIR)/rdft2-rdft.Plo ./$(DEPDIR)/rdft2-strides.Plo \
|
||||
./$(DEPDIR)/rdft2-tensor-max-index.Plo ./$(DEPDIR)/solve.Plo \
|
||||
./$(DEPDIR)/solve2.Plo ./$(DEPDIR)/vrank-geq1-rdft2.Plo \
|
||||
./$(DEPDIR)/vrank-geq1.Plo ./$(DEPDIR)/vrank3-transpose.Plo
|
||||
am__mv = mv -f
|
||||
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
|
||||
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
|
||||
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
|
||||
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
|
||||
$(AM_CFLAGS) $(CFLAGS)
|
||||
AM_V_CC = $(am__v_CC_@AM_V@)
|
||||
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
|
||||
am__v_CC_0 = @echo " CC " $@;
|
||||
am__v_CC_1 =
|
||||
CCLD = $(CC)
|
||||
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
$(AM_LDFLAGS) $(LDFLAGS) -o $@
|
||||
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
|
||||
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
|
||||
am__v_CCLD_0 = @echo " CCLD " $@;
|
||||
am__v_CCLD_1 =
|
||||
SOURCES = $(librdft_la_SOURCES)
|
||||
DIST_SOURCES = $(librdft_la_SOURCES)
|
||||
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
|
||||
ctags-recursive dvi-recursive html-recursive info-recursive \
|
||||
install-data-recursive install-dvi-recursive \
|
||||
install-exec-recursive install-html-recursive \
|
||||
install-info-recursive install-pdf-recursive \
|
||||
install-ps-recursive install-recursive installcheck-recursive \
|
||||
installdirs-recursive pdf-recursive ps-recursive \
|
||||
tags-recursive uninstall-recursive
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
|
||||
distclean-recursive maintainer-clean-recursive
|
||||
am__recursive_targets = \
|
||||
$(RECURSIVE_TARGETS) \
|
||||
$(RECURSIVE_CLEAN_TARGETS) \
|
||||
$(am__extra_recursive_targets)
|
||||
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
|
||||
distdir distdir-am
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
# Read a list of newline-separated strings from the standard input,
|
||||
# and print each of them once, without duplicates. Input order is
|
||||
# *not* preserved.
|
||||
am__uniquify_input = $(AWK) '\
|
||||
BEGIN { nonempty = 0; } \
|
||||
{ items[$$0] = 1; nonempty = 1; } \
|
||||
END { if (nonempty) { for (i in items) print i; }; } \
|
||||
'
|
||||
# Make sure the list of sources is unique. This is necessary because,
|
||||
# e.g., the same source file might be shared among _SOURCES variables
|
||||
# for different programs/libraries.
|
||||
am__define_uniq_tagged_files = \
|
||||
list='$(am__tagged_files)'; \
|
||||
unique=`for i in $$list; do \
|
||||
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
|
||||
done | $(am__uniquify_input)`
|
||||
ETAGS = etags
|
||||
CTAGS = ctags
|
||||
DIST_SUBDIRS = $(SUBDIRS)
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
# Shell fragment that re-expresses dir2 relative to dir1.
# Inputs:  shell variables dir1 and dir2, set by the caller before expansion.
# Output:  shell variable reldir (the final value of dir2).
# dir1 is consumed one leading path component at a time, tracking the
# directory reached so far in dir0 (initially the current directory):
#   "."   is skipped;
#   ".."  prepends the last component of dir0 to dir2 and drops it from dir0;
#   other components are stripped from the front of dir2 when dir2 starts
#         with the same component, otherwise "../" is prepended to dir2;
#         dir0 then descends into that component.
am__relativize = \
|
||||
dir0=`pwd`; \
|
||||
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
|
||||
sed_rest='s,^[^/]*/*,,'; \
|
||||
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
|
||||
sed_butlast='s,/*[^/]*$$,,'; \
|
||||
while test -n "$$dir1"; do \
|
||||
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
|
||||
if test "$$first" != "."; then \
|
||||
if test "$$first" = ".."; then \
|
||||
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
|
||||
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
|
||||
else \
|
||||
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
|
||||
if test "$$first2" = "$$first"; then \
|
||||
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
|
||||
else \
|
||||
dir2="../$$dir2"; \
|
||||
fi; \
|
||||
dir0="$$dir0"/"$$first"; \
|
||||
fi; \
|
||||
fi; \
|
||||
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
|
||||
done; \
|
||||
reldir="$$dir2"
|
||||
ACLOCAL = @ACLOCAL@
|
||||
ALLOCA = @ALLOCA@
|
||||
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AS = @AS@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AVX2_CFLAGS = @AVX2_CFLAGS@
|
||||
AVX512_CFLAGS = @AVX512_CFLAGS@
|
||||
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
|
||||
AVX_CFLAGS = @AVX_CFLAGS@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CHECK_PL_OPTS = @CHECK_PL_OPTS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
|
||||
C_MPI_FINT = @C_MPI_FINT@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
F77 = @F77@
|
||||
FFLAGS = @FFLAGS@
|
||||
FGREP = @FGREP@
|
||||
FLIBS = @FLIBS@
|
||||
GREP = @GREP@
|
||||
INDENT = @INDENT@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
KCVI_CFLAGS = @KCVI_CFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBQUADMATH = @LIBQUADMATH@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
|
||||
MAINT = @MAINT@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
MPICC = @MPICC@
|
||||
MPILIBS = @MPILIBS@
|
||||
MPIRUN = @MPIRUN@
|
||||
NEON_CFLAGS = @NEON_CFLAGS@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OCAMLBUILD = @OCAMLBUILD@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
POW_LIB = @POW_LIB@
|
||||
PRECISION = @PRECISION@
|
||||
PREC_SUFFIX = @PREC_SUFFIX@
|
||||
PTHREAD_CC = @PTHREAD_CC@
|
||||
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
|
||||
PTHREAD_LIBS = @PTHREAD_LIBS@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
|
||||
SHELL = @SHELL@
|
||||
SSE2_CFLAGS = @SSE2_CFLAGS@
|
||||
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
|
||||
STRIP = @STRIP@
|
||||
THREADLIBS = @THREADLIBS@
|
||||
VERSION = @VERSION@
|
||||
VSX_CFLAGS = @VSX_CFLAGS@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
ac_ct_F77 = @ac_ct_F77@
|
||||
acx_pthread_config = @acx_pthread_config@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
runstatedir = @runstatedir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
SUBDIRS = scalar simd
|
||||
noinst_LTLIBRARIES = librdft.la
|
||||
RDFT2 = buffered2.c direct2.c nop2.c rank0-rdft2.c rank-geq2-rdft2.c \
|
||||
plan2.c problem2.c solve2.c vrank-geq1-rdft2.c rdft2-rdft.c \
|
||||
rdft2-tensor-max-index.c rdft2-inplace-strides.c rdft2-strides.c \
|
||||
khc2c.c ct-hc2c.h ct-hc2c.c ct-hc2c-direct.c
|
||||
|
||||
librdft_la_SOURCES = hc2hc.h hc2hc.c dft-r2hc.c dht-r2hc.c dht-rader.c \
|
||||
buffered.c codelet-rdft.h conf.c direct-r2r.c direct-r2c.c generic.c \
|
||||
hc2hc-direct.c hc2hc-generic.c khc2hc.c kr2c.c kr2r.c indirect.c nop.c \
|
||||
plan.c problem.c rank0.c rank-geq2.c rdft.h rdft-dht.c solve.c \
|
||||
vrank-geq1.c vrank3-transpose.c $(RDFT2)
|
||||
|
||||
all: all-recursive
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .lo .o .obj
|
||||
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu rdft/Makefile
|
||||
# Regenerate this Makefile when Makefile.in or config.status is newer.
# "$?" holds the prerequisites that are newer than the target: if
# config.status is among them, delegate to the top-level am--refresh target;
# otherwise echo and run config.status for this one Makefile only, passing
# $(am__maybe_remake_depfiles) as an extra argument.
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
|
||||
esac;
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
# Delete the non-installed libtool libraries, then delete the "so_locations"
# file in every directory that held one of them. The sed script maps each
# library path to its directory ("." for a name without a slash) and appends
# /so_locations; sort -u removes duplicates. The leading "-" on the first
# recipe line tells make to ignore a failure there.
clean-noinstLTLIBRARIES:
|
||||
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
|
||||
@list='$(noinst_LTLIBRARIES)'; \
|
||||
locs=`for p in $$list; do echo $$p; done | \
|
||||
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
|
||||
sort -u`; \
|
||||
test -z "$$locs" || { \
|
||||
echo rm -f $${locs}; \
|
||||
rm -f $${locs}; \
|
||||
}
|
||||
|
||||
# Link librdft.la from its libtool objects using $(LINK) (libtool in link
# mode), followed by $(librdft_la_LIBADD) -- empty in this directory -- and
# $(LIBS).
librdft.la: $(librdft_la_OBJECTS) $(librdft_la_DEPENDENCIES) $(EXTRA_librdft_la_DEPENDENCIES)
|
||||
$(AM_V_CCLD)$(LINK) $(librdft_la_OBJECTS) $(librdft_la_LIBADD) $(LIBS)
|
||||
|
||||
mostlyclean-compile:
|
||||
-rm -f *.$(OBJEXT)
|
||||
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/conf.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct-hc2c-direct.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct-hc2c.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-r2hc.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dht-r2hc.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dht-rader.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct-r2c.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct-r2r.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/generic.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc-direct.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc-generic.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/khc2c.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/khc2hc.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kr2c.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kr2r.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2-rdft2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank0-rdft2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank0.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-dht.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-inplace-strides.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-rdft.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-strides.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-tensor-max-index.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1-rdft2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank3-transpose.Plo@am__quote@ # am--include-marker
|
||||
|
||||
# Create a placeholder for each per-object dependency file: make sure its
# directory exists, write "# dummy" to a temporary "-t" name, then move that
# into place so the final file never appears half-written.
$(am__depfiles_remade):
|
||||
@$(MKDIR_P) $(@D)
|
||||
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
|
||||
|
||||
am--depfiles: $(am__depfiles_remade)
|
||||
|
||||
# Suffix rule: compile a .c file into a plain object (.o) with $(COMPILE).
# The at-sign-delimited prefixes are configure substitutions (automake
# conditionals) that choose which recipe lines take effect:
#   am__fastdepCC_TRUE lines  - compile with -MT/-MD/-MP/-MF so dependencies
#       are written to $(DEPDIR)/<stem>.Tpo, then rename that file to .Po;
#   AMDEP_TRUE + am__fastdepCC_FALSE lines - set source/object/libtool=no and
#       DEPDIR, and prefix the compile with $(CCDEPMODE) $(depcomp);
#   last am__fastdepCC_FALSE line - the compile command itself.
.c.o:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
|
||||
|
||||
# Suffix rule: compile a .c file into a .obj object with $(COMPILE).
# Same three recipe variants as the .c.o rule (selected by the
# am__fastdepCC / AMDEP configure conditionals), except that the source path
# is passed through $(CYGPATH_W) before being handed to the compiler.
.c.obj:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
|
||||
# Suffix rule: compile a .c file into a libtool object (.lo) with
# $(LTCOMPILE), i.e. the compiler run through libtool --mode=compile.
# Same three recipe variants as the .c.o rule, with two differences: the
# dependency file is renamed to <stem>.Plo, and depcomp is told libtool=yes.
.c.lo:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
|
||||
|
||||
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
|
||||
# This directory's subdirectories are mostly independent; you can cd
|
||||
# into them and run 'make' without going through this Makefile.
|
||||
# To change the values of 'make' variables: instead of editing Makefiles,
|
||||
# (1) if the variable is set in 'config.status', edit 'config.status'
|
||||
# (which will cause the Makefiles to be regenerated when you run 'make');
|
||||
# (2) otherwise, pass the desired values on the 'make' command line.
|
||||
# Shared recipe for every "<target>-recursive" goal.
#  1. Strip "-recursive" from the goal name to get the per-directory target.
#  2. Pick the directory list: DIST_SUBDIRS for distclean-* and
#     maintainer-clean-*, SUBDIRS for everything else.
#  3. Run the target in each listed directory; for "." run "<target>-am"
#     here instead of recursing.
#  4. If "." was not in the list, run "<target>-am" here after the loop.
# Failure policy: when make runs with -k a failing subdirectory only sets
# fail=yes and the loop goes on (the recipe then fails at the final test);
# otherwise the recipe exits at the first failure.
$(am__recursive_targets):
|
||||
@fail=; \
|
||||
if $(am__make_keepgoing); then \
|
||||
failcom='fail=yes'; \
|
||||
else \
|
||||
failcom='exit 1'; \
|
||||
fi; \
|
||||
dot_seen=no; \
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
case "$@" in \
|
||||
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
|
||||
*) list='$(SUBDIRS)' ;; \
|
||||
esac; \
|
||||
for subdir in $$list; do \
|
||||
echo "Making $$target in $$subdir"; \
|
||||
if test "$$subdir" = "."; then \
|
||||
dot_seen=yes; \
|
||||
local_target="$$target-am"; \
|
||||
else \
|
||||
local_target="$$target"; \
|
||||
fi; \
|
||||
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|
||||
|| eval $$failcom; \
|
||||
done; \
|
||||
if test "$$dot_seen" = "no"; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
|
||||
fi; test -z "$$fail"
|
||||
|
||||
ID: $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); mkid -fID $$unique
|
||||
tags: tags-recursive
|
||||
TAGS: tags
|
||||
|
||||
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
set x; \
|
||||
here=`pwd`; \
|
||||
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
|
||||
include_option=--etags-include; \
|
||||
empty_fix=.; \
|
||||
else \
|
||||
include_option=--include; \
|
||||
empty_fix=; \
|
||||
fi; \
|
||||
list='$(SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
test ! -f $$subdir/TAGS || \
|
||||
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
|
||||
fi; \
|
||||
done; \
|
||||
$(am__define_uniq_tagged_files); \
|
||||
shift; \
|
||||
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
|
||||
test -n "$$unique" || unique=$$empty_fix; \
|
||||
if test $$# -gt 0; then \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
"$$@" $$unique; \
|
||||
else \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
$$unique; \
|
||||
fi; \
|
||||
fi
|
||||
ctags: ctags-recursive
|
||||
|
||||
CTAGS: ctags
|
||||
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); \
|
||||
test -z "$(CTAGS_ARGS)$$unique" \
|
||||
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
|
||||
$$unique
|
||||
|
||||
GTAGS:
|
||||
here=`$(am__cd) $(top_builddir) && pwd` \
|
||||
&& $(am__cd) $(top_srcdir) \
|
||||
&& gtags -i $(GTAGS_ARGS) "$$here"
|
||||
cscopelist: cscopelist-recursive
|
||||
|
||||
cscopelist-am: $(am__tagged_files)
|
||||
list='$(am__tagged_files)'; \
|
||||
case "$(srcdir)" in \
|
||||
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
|
||||
*) sdir=$(subdir)/$(srcdir) ;; \
|
||||
esac; \
|
||||
for i in $$list; do \
|
||||
if test -f "$$i"; then \
|
||||
echo "$(subdir)/$$i"; \
|
||||
else \
|
||||
echo "$$sdir/$$i"; \
|
||||
fi; \
|
||||
done >> $(top_builddir)/cscope.files
|
||||
|
||||
distclean-tags:
|
||||
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
|
||||
|
||||
distdir: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) distdir-am
|
||||
|
||||
distdir-am: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
$(am__make_dryrun) \
|
||||
|| test -d "$(distdir)/$$subdir" \
|
||||
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|
||||
|| exit 1; \
|
||||
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
|
||||
$(am__relativize); \
|
||||
new_distdir=$$reldir; \
|
||||
dir1=$$subdir; dir2="$(top_distdir)"; \
|
||||
$(am__relativize); \
|
||||
new_top_distdir=$$reldir; \
|
||||
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
|
||||
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
|
||||
($(am__cd) $$subdir && \
|
||||
$(MAKE) $(AM_MAKEFLAGS) \
|
||||
top_distdir="$$new_top_distdir" \
|
||||
distdir="$$new_distdir" \
|
||||
am__remove_distdir=: \
|
||||
am__skip_length_check=: \
|
||||
am__skip_mode_fix=: \
|
||||
distdir) \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: check-recursive
|
||||
all-am: Makefile $(LTLIBRARIES)
|
||||
installdirs: installdirs-recursive
|
||||
installdirs-am:
|
||||
install: install-recursive
|
||||
install-exec: install-exec-recursive
|
||||
install-data: install-data-recursive
|
||||
uninstall: uninstall-recursive
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-recursive
|
||||
# Run "make install" with stripping enabled: INSTALL_PROGRAM and
# install_sh_PROGRAM are overridden with $(INSTALL_STRIP_PROGRAM) and
# INSTALL_STRIP_FLAG is set to -s. When $(STRIP) is non-empty, its value is
# additionally handed to the installer as STRIPPROG through
# INSTALL_PROGRAM_ENV.
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
# Remove the files named in CONFIG_CLEAN_FILES, and -- only when building
# outside the source tree (srcdir is not ".") -- those named in
# CONFIG_CLEAN_VPATH_FILES. Both lists are empty in this directory, so the
# rule currently removes nothing. The leading "-" ignores command failures.
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
# Prints a two-line warning and deletes nothing itself.
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
clean: clean-recursive
|
||||
|
||||
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
|
||||
mostlyclean-am
|
||||
|
||||
distclean: distclean-recursive
|
||||
-rm -f ./$(DEPDIR)/buffered.Plo
|
||||
-rm -f ./$(DEPDIR)/buffered2.Plo
|
||||
-rm -f ./$(DEPDIR)/conf.Plo
|
||||
-rm -f ./$(DEPDIR)/ct-hc2c-direct.Plo
|
||||
-rm -f ./$(DEPDIR)/ct-hc2c.Plo
|
||||
-rm -f ./$(DEPDIR)/dft-r2hc.Plo
|
||||
-rm -f ./$(DEPDIR)/dht-r2hc.Plo
|
||||
-rm -f ./$(DEPDIR)/dht-rader.Plo
|
||||
-rm -f ./$(DEPDIR)/direct-r2c.Plo
|
||||
-rm -f ./$(DEPDIR)/direct-r2r.Plo
|
||||
-rm -f ./$(DEPDIR)/direct2.Plo
|
||||
-rm -f ./$(DEPDIR)/generic.Plo
|
||||
-rm -f ./$(DEPDIR)/hc2hc-direct.Plo
|
||||
-rm -f ./$(DEPDIR)/hc2hc-generic.Plo
|
||||
-rm -f ./$(DEPDIR)/hc2hc.Plo
|
||||
-rm -f ./$(DEPDIR)/indirect.Plo
|
||||
-rm -f ./$(DEPDIR)/khc2c.Plo
|
||||
-rm -f ./$(DEPDIR)/khc2hc.Plo
|
||||
-rm -f ./$(DEPDIR)/kr2c.Plo
|
||||
-rm -f ./$(DEPDIR)/kr2r.Plo
|
||||
-rm -f ./$(DEPDIR)/nop.Plo
|
||||
-rm -f ./$(DEPDIR)/nop2.Plo
|
||||
-rm -f ./$(DEPDIR)/plan.Plo
|
||||
-rm -f ./$(DEPDIR)/plan2.Plo
|
||||
-rm -f ./$(DEPDIR)/problem.Plo
|
||||
-rm -f ./$(DEPDIR)/problem2.Plo
|
||||
-rm -f ./$(DEPDIR)/rank-geq2-rdft2.Plo
|
||||
-rm -f ./$(DEPDIR)/rank-geq2.Plo
|
||||
-rm -f ./$(DEPDIR)/rank0-rdft2.Plo
|
||||
-rm -f ./$(DEPDIR)/rank0.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft-dht.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft2-inplace-strides.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft2-rdft.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft2-strides.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft2-tensor-max-index.Plo
|
||||
-rm -f ./$(DEPDIR)/solve.Plo
|
||||
-rm -f ./$(DEPDIR)/solve2.Plo
|
||||
-rm -f ./$(DEPDIR)/vrank-geq1-rdft2.Plo
|
||||
-rm -f ./$(DEPDIR)/vrank-geq1.Plo
|
||||
-rm -f ./$(DEPDIR)/vrank3-transpose.Plo
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-compile distclean-generic \
|
||||
distclean-tags
|
||||
|
||||
dvi: dvi-recursive
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-recursive
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-recursive
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am:
|
||||
|
||||
install-dvi: install-dvi-recursive
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am:
|
||||
|
||||
install-html: install-html-recursive
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-recursive
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-recursive
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-recursive
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-recursive
|
||||
-rm -f ./$(DEPDIR)/buffered.Plo
|
||||
-rm -f ./$(DEPDIR)/buffered2.Plo
|
||||
-rm -f ./$(DEPDIR)/conf.Plo
|
||||
-rm -f ./$(DEPDIR)/ct-hc2c-direct.Plo
|
||||
-rm -f ./$(DEPDIR)/ct-hc2c.Plo
|
||||
-rm -f ./$(DEPDIR)/dft-r2hc.Plo
|
||||
-rm -f ./$(DEPDIR)/dht-r2hc.Plo
|
||||
-rm -f ./$(DEPDIR)/dht-rader.Plo
|
||||
-rm -f ./$(DEPDIR)/direct-r2c.Plo
|
||||
-rm -f ./$(DEPDIR)/direct-r2r.Plo
|
||||
-rm -f ./$(DEPDIR)/direct2.Plo
|
||||
-rm -f ./$(DEPDIR)/generic.Plo
|
||||
-rm -f ./$(DEPDIR)/hc2hc-direct.Plo
|
||||
-rm -f ./$(DEPDIR)/hc2hc-generic.Plo
|
||||
-rm -f ./$(DEPDIR)/hc2hc.Plo
|
||||
-rm -f ./$(DEPDIR)/indirect.Plo
|
||||
-rm -f ./$(DEPDIR)/khc2c.Plo
|
||||
-rm -f ./$(DEPDIR)/khc2hc.Plo
|
||||
-rm -f ./$(DEPDIR)/kr2c.Plo
|
||||
-rm -f ./$(DEPDIR)/kr2r.Plo
|
||||
-rm -f ./$(DEPDIR)/nop.Plo
|
||||
-rm -f ./$(DEPDIR)/nop2.Plo
|
||||
-rm -f ./$(DEPDIR)/plan.Plo
|
||||
-rm -f ./$(DEPDIR)/plan2.Plo
|
||||
-rm -f ./$(DEPDIR)/problem.Plo
|
||||
-rm -f ./$(DEPDIR)/problem2.Plo
|
||||
-rm -f ./$(DEPDIR)/rank-geq2-rdft2.Plo
|
||||
-rm -f ./$(DEPDIR)/rank-geq2.Plo
|
||||
-rm -f ./$(DEPDIR)/rank0-rdft2.Plo
|
||||
-rm -f ./$(DEPDIR)/rank0.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft-dht.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft2-inplace-strides.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft2-rdft.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft2-strides.Plo
|
||||
-rm -f ./$(DEPDIR)/rdft2-tensor-max-index.Plo
|
||||
-rm -f ./$(DEPDIR)/solve.Plo
|
||||
-rm -f ./$(DEPDIR)/solve2.Plo
|
||||
-rm -f ./$(DEPDIR)/vrank-geq1-rdft2.Plo
|
||||
-rm -f ./$(DEPDIR)/vrank-geq1.Plo
|
||||
-rm -f ./$(DEPDIR)/vrank3-transpose.Plo
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic
|
||||
|
||||
mostlyclean: mostlyclean-recursive
|
||||
|
||||
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool
|
||||
|
||||
pdf: pdf-recursive
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-recursive
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am:
|
||||
|
||||
.MAKE: $(am__recursive_targets) install-am install-strip
|
||||
|
||||
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
|
||||
am--depfiles check check-am clean clean-generic clean-libtool \
|
||||
clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
|
||||
distclean-compile distclean-generic distclean-libtool \
|
||||
distclean-tags distdir dvi dvi-am html html-am info info-am \
|
||||
install install-am install-data install-data-am install-dvi \
|
||||
install-dvi-am install-exec install-exec-am install-html \
|
||||
install-html-am install-info install-info-am install-man \
|
||||
install-pdf install-pdf-am install-ps install-ps-am \
|
||||
install-strip installcheck installcheck-am installdirs \
|
||||
installdirs-am maintainer-clean maintainer-clean-generic \
|
||||
mostlyclean mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
|
||||
uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
337
fftw-3.3.10/rdft/buffered.c
Normal file
337
fftw-3.3.10/rdft/buffered.c
Normal file
@@ -0,0 +1,337 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
size_t maxnbuf_ndx;
|
||||
} S;
|
||||
|
||||
/* One solver is registered per entry of this table; the selected
   entry is handed to X(nbuf) as its third argument. */
static const INT maxnbufs[] = {
     8,
     256
};
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
|
||||
plan *cld, *cldcpy, *cldrest;
|
||||
INT n, vl, nbuf, bufdist;
|
||||
INT ivs_by_nbuf, ovs_by_nbuf;
|
||||
} P;
|
||||
|
||||
/* transform a vector input with the help of bufs */
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld = (plan_rdft *) ego->cld;
|
||||
plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
|
||||
plan_rdft *cldrest;
|
||||
INT i, vl = ego->vl, nbuf = ego->nbuf;
|
||||
INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
|
||||
R *bufs;
|
||||
|
||||
bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
|
||||
|
||||
for (i = nbuf; i <= vl; i += nbuf) {
|
||||
/* transform to bufs: */
|
||||
cld->apply((plan *) cld, I, bufs);
|
||||
I += ivs_by_nbuf;
|
||||
|
||||
/* copy back */
|
||||
cldcpy->apply((plan *) cldcpy, bufs, O);
|
||||
O += ovs_by_nbuf;
|
||||
}
|
||||
|
||||
X(ifree)(bufs);
|
||||
|
||||
/* Do the remaining transforms, if any: */
|
||||
cldrest = (plan_rdft *) ego->cldrest;
|
||||
cldrest->apply((plan *) cldrest, I, O);
|
||||
}
|
||||
|
||||
/* for hc2r problems, copy the input into buffer, and then
|
||||
transform buffer->output, which allows for destruction of the
|
||||
buffer */
|
||||
static void apply_hc2r(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld = (plan_rdft *) ego->cld;
|
||||
plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
|
||||
plan_rdft *cldrest;
|
||||
INT i, vl = ego->vl, nbuf = ego->nbuf;
|
||||
INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
|
||||
R *bufs;
|
||||
|
||||
bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
|
||||
|
||||
for (i = nbuf; i <= vl; i += nbuf) {
|
||||
/* copy input into bufs: */
|
||||
cldcpy->apply((plan *) cldcpy, I, bufs);
|
||||
I += ivs_by_nbuf;
|
||||
|
||||
/* transform to output */
|
||||
cld->apply((plan *) cld, bufs, O);
|
||||
O += ovs_by_nbuf;
|
||||
}
|
||||
|
||||
X(ifree)(bufs);
|
||||
|
||||
/* Do the remaining transforms, if any: */
|
||||
cldrest = (plan_rdft *) ego->cldrest;
|
||||
cldrest->apply((plan *) cldrest, I, O);
|
||||
}
|
||||
|
||||
|
||||
/* Forward the wakefulness change to the three child plans, in the
   order cld, cldcpy, cldrest. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[3];
     int k;

     kids[0] = self->cld;
     kids[1] = self->cldcpy;
     kids[2] = self->cldrest;

     for (k = 0; k < 3; ++k)
          X(plan_awake)(kids[k], wakefulness);
}
|
||||
|
||||
/* Destroy the three child plans, in the order cldrest, cldcpy, cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *kids[3];
     int k;

     kids[0] = self->cldrest;
     kids[1] = self->cldcpy;
     kids[2] = self->cld;

     for (k = 0; k < 3; ++k)
          X(plan_destroy_internal)(kids[k]);
}
|
||||
|
||||
/* Print the plan: n, nbuf, vl, bufdist modulo n, then the three
   child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const INT dist_mod_n = self->bufdist % self->n;

     p->print(p, "(rdft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
              self->n, self->nbuf,
              self->vl, dist_mod_n,
              self->cld, self->cldcpy, self->cldrest);
}
|
||||
|
||||
static int applicable0(const S *ego, const problem *p_, const planner *plnr)
|
||||
{
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
iodim *d = p->sz->dims;
|
||||
|
||||
if (1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->rnk == 1
|
||||
) {
|
||||
INT vl, ivs, ovs;
|
||||
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
|
||||
|
||||
if (X(toobig)(d[0].n) && CONSERVE_MEMORYP(plnr))
|
||||
return 0;
|
||||
|
||||
/* if this solver is redundant, in the sense that a solver
|
||||
of lower index generates the same plan, then prune this
|
||||
solver */
|
||||
if (X(nbuf_redundant)(d[0].n, vl,
|
||||
ego->maxnbuf_ndx,
|
||||
maxnbufs, NELEM(maxnbufs)))
|
||||
return 0;
|
||||
|
||||
if (p->I != p->O) {
|
||||
if (p->kind[0] == HC2R) {
|
||||
/* Allow HC2R problems only if the input is to be
|
||||
preserved. This solver sets NO_DESTROY_INPUT,
|
||||
which prevents infinite loops */
|
||||
return (NO_DESTROY_INPUTP(plnr));
|
||||
} else {
|
||||
/*
|
||||
In principle, the buffered transforms might be useful
|
||||
when working out of place. However, in order to
|
||||
prevent infinite loops in the planner, we require
|
||||
that the output stride of the buffered transforms be
|
||||
greater than 1.
|
||||
*/
|
||||
return (d[0].os > 1);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the problem is in place, the input/output strides must
|
||||
* be the same or the whole thing must fit in the buffer.
|
||||
*/
|
||||
if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
|
||||
return 1;
|
||||
|
||||
if (/* fits into buffer: */
|
||||
((p->vecsz->rnk == 0)
|
||||
||
|
||||
(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
|
||||
maxnbufs[ego->maxnbuf_ndx])
|
||||
== p->vecsz->dims[0].n)))
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_, const planner *plnr)
|
||||
{
|
||||
const problem_rdft *p;
|
||||
|
||||
if (NO_BUFFERINGP(plnr)) return 0;
|
||||
|
||||
if (!applicable0(ego, p_, plnr)) return 0;
|
||||
|
||||
p = (const problem_rdft *) p_;
|
||||
if (p->kind[0] == HC2R) {
|
||||
if (NO_UGLYP(plnr)) {
|
||||
/* UGLY if in-place and too big, since the problem
|
||||
could be solved via transpositions */
|
||||
if (p->I == p->O && X(toobig)(p->sz->dims[0].n))
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
if (NO_UGLYP(plnr)) {
|
||||
if (p->I != p->O) return 0;
|
||||
if (X(toobig)(p->sz->dims[0].n)) return 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Build a buffered plan.  Three children are planned, in this order:
   cld (transform of nbuf vectors to or from the buffer), cldcpy
   (rank-0 copy between the buffer and the user array) and cldrest
   (the vl % nbuf vectors that do not fill a batch).  Returns 0 when
   the solver is not applicable or a child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };

     const S *ego = (const S *)ego_;
     const problem_rdft *p = (const problem_rdft *) p_;
     P *pln;
     plan *cld = (plan *) 0;
     plan *cldcpy = (plan *) 0;
     plan *cldrest = (plan *) 0;
     R *scratch = (R *) 0;
     INT n, vl, ivs, ovs;
     INT nbuf = 0, bufdist;
     INT nbatch, done;
     int backward;
     opcnt per_batch;

     if (!applicable(ego, p_, plnr))
          goto bail;

     n = X(tensor_sz)(p->sz);
     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
     backward = (p->kind[0] == HC2R);

     nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
     bufdist = X(bufdist)(n, vl);
     A(nbuf > 0);

     /* temporary buffer, needed only while planning the children */
     scratch = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);

     if (backward) {
          /* buffer -> output; allow destruction of the buffer */
          cld = X(mkplan_f_d)(plnr,
                              X(mkproblem_rdft_d)(
                                   X(mktensor_1d)(n, 1, p->sz->dims[0].os),
                                   X(mktensor_1d)(nbuf, bufdist, ovs),
                                   scratch, TAINT(p->O, ovs * nbuf), p->kind),
                              0, 0, NO_DESTROY_INPUT);
          if (!cld) goto bail;

          /* copying the input into the buffer is a rank-0 transform: */
          cldcpy = X(mkplan_d)(plnr,
                               X(mkproblem_rdft_0_d)(
                                    X(mktensor_2d)(nbuf, ivs, bufdist,
                                                   n, p->sz->dims[0].is, 1),
                                    TAINT(p->I, ivs * nbuf), scratch));
          if (!cldcpy) goto bail;
     } else {
          /* input -> buffer; allow destruction of the input if the
             problem is in place */
          cld = X(mkplan_f_d)(plnr,
                              X(mkproblem_rdft_d)(
                                   X(mktensor_1d)(n, p->sz->dims[0].is, 1),
                                   X(mktensor_1d)(nbuf, ivs, bufdist),
                                   TAINT(p->I, ivs * nbuf), scratch, p->kind),
                              0, 0, (p->I == p->O) ? NO_DESTROY_INPUT : 0);
          if (!cld) goto bail;

          /* copying back from the buffer is a rank-0 transform: */
          cldcpy = X(mkplan_d)(plnr,
                               X(mkproblem_rdft_0_d)(
                                    X(mktensor_2d)(nbuf, bufdist, ovs,
                                                   n, 1, p->sz->dims[0].os),
                                    scratch, TAINT(p->O, ovs * nbuf)));
          if (!cldcpy) goto bail;
     }

     /* planning is done with the buffer; the apply routines allocate
        the real one */
     X(ifree)(scratch);
     scratch = 0;

     /* the leftover transforms (cldrest) start right after the
        vectors covered by whole batches */
     nbatch = vl / nbuf;
     done = nbuf * nbatch;
     cldrest = X(mkplan_d)(plnr,
                           X(mkproblem_rdft_d)(
                                X(tensor_copy)(p->sz),
                                X(mktensor_1d)(vl % nbuf, ivs, ovs),
                                p->I + ivs * done, p->O + ovs * done,
                                p->kind));
     if (!cldrest) goto bail;

     pln = MKPLAN_RDFT(P, &padt, backward ? apply_hc2r : apply);

     pln->cld = cld;
     pln->cldcpy = cldcpy;
     pln->cldrest = cldrest;
     pln->n = n;
     pln->vl = vl;
     pln->nbuf = nbuf;
     pln->bufdist = bufdist;
     pln->ivs_by_nbuf = ivs * nbuf;
     pln->ovs_by_nbuf = ovs * nbuf;

     /* cost: (cld + cldcpy) once per batch, plus cldrest */
     X(ops_add)(&cld->ops, &cldcpy->ops, &per_batch);
     X(ops_madd)(nbatch, &per_batch, &cldrest->ops, &pln->super.super.ops);

     return &(pln->super.super);

 bail:
     X(ifree0)(scratch);
     X(plan_destroy_internal)(cldrest);
     X(plan_destroy_internal)(cldcpy);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
|
||||
|
||||
static solver *mksolver(size_t maxnbuf_ndx)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->maxnbuf_ndx = maxnbuf_ndx;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register one buffered solver per entry of maxnbufs[], in table
   order. */
void X(rdft_buffered_register)(planner *p)
{
     size_t i = 0;

     while (i < NELEM(maxnbufs)) {
          REGISTER_SOLVER(p, mksolver(i));
          ++i;
     }
}
|
||||
375
fftw-3.3.10/rdft/buffered2.c
Normal file
375
fftw-3.3.10/rdft/buffered2.c
Normal file
@@ -0,0 +1,375 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* buffering of rdft2. We always buffer the complex array */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
size_t maxnbuf_ndx;
|
||||
} S;
|
||||
|
||||
/* One solver is registered per entry of this table; the selected
   entry is handed to X(nbuf) as its third argument. */
static const INT maxnbufs[] = {
     8,
     256
};
|
||||
|
||||
typedef struct {
|
||||
plan_rdft2 super;
|
||||
|
||||
plan *cld, *cldcpy, *cldrest;
|
||||
INT n, vl, nbuf, bufdist;
|
||||
INT ivs_by_nbuf, ovs_by_nbuf;
|
||||
INT ioffset, roffset;
|
||||
} P;
|
||||
|
||||
/* transform a vector input with the help of bufs */
|
||||
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft2 *cld = (plan_rdft2 *) ego->cld;
|
||||
plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
|
||||
INT i, vl = ego->vl, nbuf = ego->nbuf;
|
||||
INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
|
||||
R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
|
||||
R *bufr = bufs + ego->roffset;
|
||||
R *bufi = bufs + ego->ioffset;
|
||||
plan_rdft2 *cldrest;
|
||||
|
||||
for (i = nbuf; i <= vl; i += nbuf) {
|
||||
/* transform to bufs: */
|
||||
cld->apply((plan *) cld, r0, r1, bufr, bufi);
|
||||
r0 += ivs_by_nbuf; r1 += ivs_by_nbuf;
|
||||
|
||||
/* copy back */
|
||||
cldcpy->apply((plan *) cldcpy, bufr, bufi, cr, ci);
|
||||
cr += ovs_by_nbuf; ci += ovs_by_nbuf;
|
||||
}
|
||||
|
||||
X(ifree)(bufs);
|
||||
|
||||
/* Do the remaining transforms, if any: */
|
||||
cldrest = (plan_rdft2 *) ego->cldrest;
|
||||
cldrest->apply((plan *) cldrest, r0, r1, cr, ci);
|
||||
}
|
||||
|
||||
/* for hc2r problems, copy the input into buffer, and then
|
||||
transform buffer->output, which allows for destruction of the
|
||||
buffer */
|
||||
static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft2 *cld = (plan_rdft2 *) ego->cld;
|
||||
plan_dft *cldcpy = (plan_dft *) ego->cldcpy;
|
||||
INT i, vl = ego->vl, nbuf = ego->nbuf;
|
||||
INT ivs_by_nbuf = ego->ivs_by_nbuf, ovs_by_nbuf = ego->ovs_by_nbuf;
|
||||
R *bufs = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
|
||||
R *bufr = bufs + ego->roffset;
|
||||
R *bufi = bufs + ego->ioffset;
|
||||
plan_rdft2 *cldrest;
|
||||
|
||||
for (i = nbuf; i <= vl; i += nbuf) {
|
||||
/* copy input into bufs: */
|
||||
cldcpy->apply((plan *) cldcpy, cr, ci, bufr, bufi);
|
||||
cr += ivs_by_nbuf; ci += ivs_by_nbuf;
|
||||
|
||||
/* transform to output */
|
||||
cld->apply((plan *) cld, r0, r1, bufr, bufi);
|
||||
r0 += ovs_by_nbuf; r1 += ovs_by_nbuf;
|
||||
}
|
||||
|
||||
X(ifree)(bufs);
|
||||
|
||||
/* Do the remaining transforms, if any: */
|
||||
cldrest = (plan_rdft2 *) ego->cldrest;
|
||||
cldrest->apply((plan *) cldrest, r0, r1, cr, ci);
|
||||
}
|
||||
|
||||
|
||||
/* Forward the wakefulness change to the three child plans, in the
   order cld, cldcpy, cldrest. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[3];
     int k;

     kids[0] = self->cld;
     kids[1] = self->cldcpy;
     kids[2] = self->cldrest;

     for (k = 0; k < 3; ++k)
          X(plan_awake)(kids[k], wakefulness);
}
|
||||
|
||||
/* Destroy the three child plans, in the order cldrest, cldcpy, cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *kids[3];
     int k;

     kids[0] = self->cldrest;
     kids[1] = self->cldcpy;
     kids[2] = self->cld;

     for (k = 0; k < 3; ++k)
          X(plan_destroy_internal)(kids[k]);
}
|
||||
|
||||
/* Print the plan: n, nbuf, vl, bufdist modulo n, then the three
   child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const INT dist_mod_n = self->bufdist % self->n;

     p->print(p, "(rdft2-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
              self->n, self->nbuf,
              self->vl, dist_mod_n,
              self->cld, self->cldcpy, self->cldrest);
}
|
||||
|
||||
static int applicable0(const S *ego, const problem *p_, const planner *plnr)
|
||||
{
|
||||
const problem_rdft2 *p = (const problem_rdft2 *) p_;
|
||||
iodim *d = p->sz->dims;
|
||||
|
||||
if (1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->rnk == 1
|
||||
|
||||
/* we assume even n throughout */
|
||||
&& (d[0].n % 2) == 0
|
||||
|
||||
/* and we only consider these two cases */
|
||||
&& (p->kind == R2HC || p->kind == HC2R)
|
||||
|
||||
) {
|
||||
INT vl, ivs, ovs;
|
||||
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
|
||||
|
||||
if (X(toobig)(d[0].n) && CONSERVE_MEMORYP(plnr))
|
||||
return 0;
|
||||
|
||||
/* if this solver is redundant, in the sense that a solver
|
||||
of lower index generates the same plan, then prune this
|
||||
solver */
|
||||
if (X(nbuf_redundant)(d[0].n, vl,
|
||||
ego->maxnbuf_ndx,
|
||||
maxnbufs, NELEM(maxnbufs)))
|
||||
return 0;
|
||||
|
||||
if (p->r0 != p->cr) {
|
||||
if (p->kind == HC2R) {
|
||||
/* Allow HC2R problems only if the input is to be
|
||||
preserved. This solver sets NO_DESTROY_INPUT,
|
||||
which prevents infinite loops */
|
||||
return (NO_DESTROY_INPUTP(plnr));
|
||||
} else {
|
||||
/*
|
||||
In principle, the buffered transforms might be useful
|
||||
when working out of place. However, in order to
|
||||
prevent infinite loops in the planner, we require
|
||||
that the output stride of the buffered transforms be
|
||||
greater than 2.
|
||||
*/
|
||||
return (d[0].os > 2);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the problem is in place, the input/output strides must
|
||||
* be the same or the whole thing must fit in the buffer.
|
||||
*/
|
||||
if (X(rdft2_inplace_strides(p, RNK_MINFTY)))
|
||||
return 1;
|
||||
|
||||
if (/* fits into buffer: */
|
||||
((p->vecsz->rnk == 0)
|
||||
||
|
||||
(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
|
||||
maxnbufs[ego->maxnbuf_ndx])
|
||||
== p->vecsz->dims[0].n)))
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_, const planner *plnr)
|
||||
{
|
||||
const problem_rdft2 *p;
|
||||
|
||||
if (NO_BUFFERINGP(plnr)) return 0;
|
||||
|
||||
if (!applicable0(ego, p_, plnr)) return 0;
|
||||
|
||||
p = (const problem_rdft2 *) p_;
|
||||
if (p->kind == HC2R) {
|
||||
if (NO_UGLYP(plnr)) {
|
||||
/* UGLY if in-place and too big, since the problem
|
||||
could be solved via transpositions */
|
||||
if (p->r0 == p->cr && X(toobig)(p->sz->dims[0].n))
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
if (NO_UGLYP(plnr)) {
|
||||
if (p->r0 != p->cr || X(toobig)(p->sz->dims[0].n))
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Build a buffered rdft2 plan.  Three children are planned, in this
   order: cld (rdft2 of nbuf vectors with the complex side in the
   buffer), cldcpy (rank-0 DFT copying between the buffer and the
   user's complex array) and cldrest (the vl % nbuf vectors that do
   not fill a batch).  Returns 0 when the solver is not applicable or
   a child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft2_solve), awake, print, destroy
     };

     const S *ego = (const S *)ego_;
     const problem_rdft2 *p = (const problem_rdft2 *) p_;
     P *pln;
     plan *cld = (plan *) 0;
     plan *cldcpy = (plan *) 0;
     plan *cldrest = (plan *) 0;
     R *scratch = (R *) 0;
     INT n, vl, ivs, ovs;
     INT nbuf = 0, bufdist;
     INT roffset, ioffset;
     INT nbatch, id, od;
     INT real_skip, cplx_skip;
     int forward;
     opcnt per_batch;

     if (!applicable(ego, p_, plnr))
          goto bail;

     n = X(tensor_sz)(p->sz);
     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);

     nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
     /* complex-side rdft2 stores N+2 real numbers */
     bufdist = X(bufdist)(n + 2, vl);
     A(nbuf > 0);

     /* attempt to keep real and imaginary part in the same order,
        so as to allow optimizations in the copy plan */
     roffset = (p->cr - p->ci > 0) ? (INT)1 : (INT)0;
     ioffset = 1 - roffset;

     /* temporary buffer, needed only while planning the children */
     scratch = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);

     /* input/output displacement of the first leftover vector */
     nbatch = vl / nbuf;
     id = ivs * (nbuf * nbatch);
     od = ovs * (nbuf * nbatch);

     forward = (p->kind == R2HC);

     if (forward) {
          /* real input -> buffer; allow destruction of the input if
             the problem is in place */
          cld = X(mkplan_f_d)(
               plnr,
               X(mkproblem_rdft2_d)(
                    X(mktensor_1d)(n, p->sz->dims[0].is, 2),
                    X(mktensor_1d)(nbuf, ivs, bufdist),
                    TAINT(p->r0, ivs * nbuf), TAINT(p->r1, ivs * nbuf),
                    scratch + roffset, scratch + ioffset, p->kind),
               0, 0, (p->r0 == p->cr) ? NO_DESTROY_INPUT : 0);
          if (!cld) goto bail;

          /* copying back from the buffer is a rank-0 DFT: */
          cldcpy = X(mkplan_d)(
               plnr,
               X(mkproblem_dft_d)(
                    X(mktensor_0d)(),
                    X(mktensor_2d)(nbuf, bufdist, ovs,
                                   n/2+1, 2, p->sz->dims[0].os),
                    scratch + roffset, scratch + ioffset,
                    TAINT(p->cr, ovs * nbuf), TAINT(p->ci, ovs * nbuf) ));
          if (!cldcpy) goto bail;

          /* the real arrays are the input, the complex ones the output */
          real_skip = id;
          cplx_skip = od;
     } else {
          /* buffer -> real output; allow destruction of the buffer */
          cld = X(mkplan_f_d)(
               plnr,
               X(mkproblem_rdft2_d)(
                    X(mktensor_1d)(n, 2, p->sz->dims[0].os),
                    X(mktensor_1d)(nbuf, bufdist, ovs),
                    TAINT(p->r0, ovs * nbuf), TAINT(p->r1, ovs * nbuf),
                    scratch + roffset, scratch + ioffset, p->kind),
               0, 0, NO_DESTROY_INPUT);
          if (!cld) goto bail;

          /* copying input into buffer is a rank-0 DFT: */
          cldcpy = X(mkplan_d)(
               plnr,
               X(mkproblem_dft_d)(
                    X(mktensor_0d)(),
                    X(mktensor_2d)(nbuf, ivs, bufdist,
                                   n/2+1, p->sz->dims[0].is, 2),
                    TAINT(p->cr, ivs * nbuf), TAINT(p->ci, ivs * nbuf),
                    scratch + roffset, scratch + ioffset));
          if (!cldcpy) goto bail;

          /* the complex arrays are the input, the real ones the output */
          real_skip = od;
          cplx_skip = id;
     }

     /* planning is done with the buffer; the apply routines allocate
        the real one */
     X(ifree)(scratch);
     scratch = 0;

     /* the leftover transforms (cldrest) */
     cldrest = X(mkplan_d)(plnr,
                           X(mkproblem_rdft2_d)(
                                X(tensor_copy)(p->sz),
                                X(mktensor_1d)(vl % nbuf, ivs, ovs),
                                p->r0 + real_skip, p->r1 + real_skip,
                                p->cr + cplx_skip, p->ci + cplx_skip,
                                p->kind));
     if (!cldrest) goto bail;

     pln = MKPLAN_RDFT2(P, &padt, forward ? apply_r2hc : apply_hc2r);

     pln->cld = cld;
     pln->cldcpy = cldcpy;
     pln->cldrest = cldrest;
     pln->n = n;
     pln->vl = vl;
     pln->nbuf = nbuf;
     pln->bufdist = bufdist;
     pln->ivs_by_nbuf = ivs * nbuf;
     pln->ovs_by_nbuf = ovs * nbuf;
     pln->roffset = roffset;
     pln->ioffset = ioffset;

     /* cost: (cld + cldcpy) once per batch, plus cldrest */
     X(ops_add)(&cld->ops, &cldcpy->ops, &per_batch);
     X(ops_madd)(nbatch, &per_batch, &cldrest->ops, &pln->super.super.ops);

     return &(pln->super.super);

 bail:
     X(ifree0)(scratch);
     X(plan_destroy_internal)(cldrest);
     X(plan_destroy_internal)(cldcpy);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
|
||||
|
||||
static solver *mksolver(size_t maxnbuf_ndx)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->maxnbuf_ndx = maxnbuf_ndx;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register one buffered rdft2 solver per entry of maxnbufs[], in
   table order. */
void X(rdft2_buffered_register)(planner *p)
{
     size_t i = 0;

     while (i < NELEM(maxnbufs)) {
          REGISTER_SOLVER(p, mksolver(i));
          ++i;
     }
}
|
||||
172
fftw-3.3.10/rdft/codelet-rdft.h
Normal file
172
fftw-3.3.10/rdft/codelet-rdft.h
Normal file
@@ -0,0 +1,172 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* This header file must include every file or define every
|
||||
* type or macro which is required to compile a codelet.
|
||||
*/
|
||||
|
||||
#ifndef __RDFT_CODELET_H__
|
||||
#define __RDFT_CODELET_H__
|
||||
|
||||
#include "kernel/ifftw.h"
|
||||
|
||||
/**************************************************************
|
||||
* types of codelets
|
||||
**************************************************************/
|
||||
|
||||
/* FOOab, with a,b in {0,1}, denotes the FOO transform
|
||||
where a/b say whether the input/output are shifted by
|
||||
half a sample/slot. */
|
||||
|
||||
/* Transform kinds.  The enumerators rely on their declaration order:
   the *_KINDP range tests below compare against the first and last
   member of each group, so the groups must stay contiguous. */
typedef enum {
     /* real to half-complex */
     R2HC00,
     R2HC01,
     R2HC10,
     R2HC11,

     /* half-complex to real */
     HC2R00,
     HC2R01,
     HC2R10,
     HC2R11,

     DHT,

     /* real-even == DCT's */
     REDFT00,
     REDFT01,
     REDFT10,
     REDFT11,

     /* real-odd == DST's */
     RODFT00,
     RODFT01,
     RODFT10,
     RODFT11
} rdft_kind;

/* standard R2HC/HC2R transforms are unshifted */
#define R2HC    R2HC00
#define HC2R    HC2R00

#define R2HCII  R2HC01
#define HC2RIII HC2R10

/* Range tests on rdft_kind values.  The comparison (k) >= R2HC00
   produces a warning under gcc, because checking x >= 0 is
   superfluous for unsigned values; it is needed nevertheless because
   other compilers (e.g. icc) may define the enum to be a signed
   int. */
#define R2HC_KINDP(k) ((k) >= R2HC00 && (k) <= R2HC11) /* uses kr2hc_genus */
#define HC2R_KINDP(k) ((k) >= HC2R00 && (k) <= HC2R11) /* uses khc2r_genus */

/* everything from DHT onwards */
#define R2R_KINDP(k) ((k) >= DHT) /* uses kr2r_genus */

#define REDFT_KINDP(k) ((k) >= REDFT00 && (k) <= REDFT11)
#define RODFT_KINDP(k) ((k) >= RODFT00 && (k) <= RODFT11)
/* either of the two groups above */
#define REODFT_KINDP(k) ((k) >= REDFT00 && (k) <= RODFT11)
|
||||
|
||||
/* codelets with real input (output) and complex output (input) */
|
||||
typedef struct kr2c_desc_s kr2c_desc;
|
||||
|
||||
typedef struct {
|
||||
rdft_kind kind;
|
||||
INT vl;
|
||||
} kr2c_genus;
|
||||
|
||||
struct kr2c_desc_s {
|
||||
INT n; /* size of transform computed */
|
||||
const char *nam;
|
||||
opcnt ops;
|
||||
const kr2c_genus *genus;
|
||||
};
|
||||
|
||||
typedef void (*kr2c) (R *R0, R *R1, R *Cr, R *Ci,
|
||||
stride rs, stride csr, stride csi,
|
||||
INT vl, INT ivs, INT ovs);
|
||||
void X(kr2c_register)(planner *p, kr2c codelet, const kr2c_desc *desc);
|
||||
|
||||
/* half-complex to half-complex DIT/DIF codelets: */
|
||||
typedef struct hc2hc_desc_s hc2hc_desc;
|
||||
|
||||
typedef struct {
|
||||
rdft_kind kind;
|
||||
INT vl;
|
||||
} hc2hc_genus;
|
||||
|
||||
struct hc2hc_desc_s {
|
||||
INT radix;
|
||||
const char *nam;
|
||||
const tw_instr *tw;
|
||||
const hc2hc_genus *genus;
|
||||
opcnt ops;
|
||||
};
|
||||
|
||||
typedef void (*khc2hc) (R *rioarray, R *iioarray, const R *W,
|
||||
stride rs, INT mb, INT me, INT ms);
|
||||
void X(khc2hc_register)(planner *p, khc2hc codelet, const hc2hc_desc *desc);
|
||||
|
||||
/* half-complex to rdft2-complex DIT/DIF codelets: */
|
||||
typedef struct hc2c_desc_s hc2c_desc;
|
||||
|
||||
typedef enum {
|
||||
HC2C_VIA_RDFT,
|
||||
HC2C_VIA_DFT
|
||||
} hc2c_kind;
|
||||
|
||||
typedef struct {
|
||||
int (*okp)(
|
||||
const R *Rp, const R *Ip, const R *Rm, const R *Im,
|
||||
INT rs, INT mb, INT me, INT ms,
|
||||
const planner *plnr);
|
||||
rdft_kind kind;
|
||||
INT vl;
|
||||
} hc2c_genus;
|
||||
|
||||
struct hc2c_desc_s {
|
||||
INT radix;
|
||||
const char *nam;
|
||||
const tw_instr *tw;
|
||||
const hc2c_genus *genus;
|
||||
opcnt ops;
|
||||
};
|
||||
|
||||
typedef void (*khc2c) (R *Rp, R *Ip, R *Rm, R *Im, const R *W,
|
||||
stride rs, INT mb, INT me, INT ms);
|
||||
void X(khc2c_register)(planner *p, khc2c codelet, const hc2c_desc *desc,
|
||||
hc2c_kind hc2ckind);
|
||||
|
||||
/* solver tables for the scalar real<->complex codelets */
extern const solvtab X(solvtab_rdft_r2cf);
extern const solvtab X(solvtab_rdft_r2cb);

/* solver tables for the SIMD codelet sets, one per instruction set */
extern const solvtab X(solvtab_rdft_sse2);
extern const solvtab X(solvtab_rdft_avx);
extern const solvtab X(solvtab_rdft_avx_128_fma);
extern const solvtab X(solvtab_rdft_avx2);
extern const solvtab X(solvtab_rdft_avx2_128);
extern const solvtab X(solvtab_rdft_avx512);
extern const solvtab X(solvtab_rdft_kcvi);
extern const solvtab X(solvtab_rdft_altivec);
extern const solvtab X(solvtab_rdft_vsx);
extern const solvtab X(solvtab_rdft_neon);
extern const solvtab X(solvtab_rdft_generic_simd128);
extern const solvtab X(solvtab_rdft_generic_simd256);
|
||||
|
||||
/* real-input & output DFT-like codelets (DHT, etc.) */
|
||||
typedef struct kr2r_desc_s kr2r_desc;
|
||||
|
||||
typedef struct {
|
||||
INT vl;
|
||||
} kr2r_genus;
|
||||
|
||||
struct kr2r_desc_s {
|
||||
INT n; /* size of transform computed */
|
||||
const char *nam;
|
||||
opcnt ops;
|
||||
const kr2r_genus *genus;
|
||||
rdft_kind kind;
|
||||
};
|
||||
|
||||
typedef void (*kr2r) (const R *I, R *O, stride is, stride os,
|
||||
INT vl, INT ivs, INT ovs);
|
||||
void X(kr2r_register)(planner *p, kr2r codelet, const kr2r_desc *desc);
|
||||
|
||||
extern const solvtab X(solvtab_rdft_r2r);
|
||||
|
||||
#endif /* __RDFT_CODELET_H__ */
|
||||
105
fftw-3.3.10/rdft/conf.c
Normal file
105
fftw-3.3.10/rdft/conf.c
Normal file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
static const solvtab s =
|
||||
{
|
||||
SOLVTAB(X(rdft_indirect_register)),
|
||||
SOLVTAB(X(rdft_rank0_register)),
|
||||
SOLVTAB(X(rdft_vrank3_transpose_register)),
|
||||
SOLVTAB(X(rdft_vrank_geq1_register)),
|
||||
|
||||
SOLVTAB(X(rdft_nop_register)),
|
||||
SOLVTAB(X(rdft_buffered_register)),
|
||||
SOLVTAB(X(rdft_generic_register)),
|
||||
SOLVTAB(X(rdft_rank_geq2_register)),
|
||||
|
||||
SOLVTAB(X(dft_r2hc_register)),
|
||||
|
||||
SOLVTAB(X(rdft_dht_register)),
|
||||
SOLVTAB(X(dht_r2hc_register)),
|
||||
SOLVTAB(X(dht_rader_register)),
|
||||
|
||||
SOLVTAB(X(rdft2_vrank_geq1_register)),
|
||||
SOLVTAB(X(rdft2_nop_register)),
|
||||
SOLVTAB(X(rdft2_rank0_register)),
|
||||
SOLVTAB(X(rdft2_buffered_register)),
|
||||
SOLVTAB(X(rdft2_rank_geq2_register)),
|
||||
SOLVTAB(X(rdft2_rdft_register)),
|
||||
|
||||
SOLVTAB(X(hc2hc_generic_register)),
|
||||
|
||||
SOLVTAB_END
|
||||
};
|
||||
|
||||
/* Run every standard rdft solver table against planner P: first the
   generic table, then the scalar codelet tables, then each SIMD codelet
   table that was compiled in (HAVE_*) and, where a runtime probe exists,
   is reported as available by it. */
void X(rdft_conf_standard)(planner *p)
{
     X(solvtab_exec)(s, p);
     X(solvtab_exec)(X(solvtab_rdft_r2cf), p);
     X(solvtab_exec)(X(solvtab_rdft_r2cb), p);
     X(solvtab_exec)(X(solvtab_rdft_r2r), p);

#if HAVE_SSE2
     if (X(have_simd_sse2)()) {
          X(solvtab_exec)(X(solvtab_rdft_sse2), p);
     }
#endif

#if HAVE_AVX
     if (X(have_simd_avx)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx), p);
     }
#endif

#if HAVE_AVX_128_FMA
     if (X(have_simd_avx_128_fma)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx_128_fma), p);
     }
#endif

#if HAVE_AVX2
     /* the AVX2 build provides two tables, each with its own probe */
     if (X(have_simd_avx2)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx2), p);
     }
     if (X(have_simd_avx2_128)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx2_128), p);
     }
#endif

#if HAVE_AVX512
     if (X(have_simd_avx512)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx512), p);
     }
#endif

#if HAVE_KCVI
     if (X(have_simd_kcvi)()) {
          X(solvtab_exec)(X(solvtab_rdft_kcvi), p);
     }
#endif

#if HAVE_ALTIVEC
     if (X(have_simd_altivec)()) {
          X(solvtab_exec)(X(solvtab_rdft_altivec), p);
     }
#endif

#if HAVE_VSX
     if (X(have_simd_vsx)()) {
          X(solvtab_exec)(X(solvtab_rdft_vsx), p);
     }
#endif

#if HAVE_NEON
     if (X(have_simd_neon)()) {
          X(solvtab_exec)(X(solvtab_rdft_neon), p);
     }
#endif

     /* the generic SIMD tables are registered without a runtime probe */
#if HAVE_GENERIC_SIMD128
     X(solvtab_exec)(X(solvtab_rdft_generic_simd128), p);
#endif

#if HAVE_GENERIC_SIMD256
     X(solvtab_exec)(X(solvtab_rdft_generic_simd256), p);
#endif
}
|
||||
404
fftw-3.3.10/rdft/ct-hc2c-direct.c
Normal file
404
fftw-3.3.10/rdft/ct-hc2c-direct.c
Normal file
@@ -0,0 +1,404 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "ct-hc2c.h"
|
||||
|
||||
typedef struct {
|
||||
hc2c_solver super;
|
||||
const hc2c_desc *desc;
|
||||
int bufferedp;
|
||||
khc2c k;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_hc2c super;
|
||||
khc2c k;
|
||||
plan *cld0, *cldm; /* children for 0th and middle butterflies */
|
||||
INT r, m, v, extra_iter;
|
||||
INT ms, vs;
|
||||
stride rs, brs;
|
||||
twid *td;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
/*************************************************************
|
||||
Nonbuffered code
|
||||
*************************************************************/
|
||||
/* Unbuffered execution: for each of the v vectors, run the child plan
   for the 0th butterfly, the codelet over iterations [1, (m+1)/2), and
   the child plan for the middle butterfly, all in place. */
static void apply(const plan *ego_, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_rdft2 *first = (plan_rdft2 *) self->cld0;
     plan_rdft2 *middle = (plan_rdft2 *) self->cldm;
     const INT m = self->m;
     const INT ms = self->ms;
     const INT vs = self->vs;
     const INT nvec = self->v;
     const INT last = (m - 1) * ms;   /* offset of element m-1 */
     const INT mid = (m / 2) * ms;    /* offset of the middle element */
     INT iv;

     for (iv = 0; iv < nvec; ++iv) {
          first->apply((plan *) first, cr, ci, cr, ci);

          self->k(cr + ms, ci + ms, cr + last, ci + last,
                  self->td->W, self->rs, 1, (m + 1) / 2, ms);

          middle->apply((plan *) middle, cr + mid, ci + mid,
                        cr + mid, ci + mid);

          cr += vs;
          ci += vs;
     }
}
|
||||
|
||||
/* Unbuffered execution with one extra codelet iteration; same overall
   structure as apply(), but the codelet range is split in two calls. */
static void apply_extra_iter(const plan *ego_, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_rdft2 *first = (plan_rdft2 *) self->cld0;
     plan_rdft2 *middle = (plan_rdft2 *) self->cldm;
     const INT m = self->m;
     const INT ms = self->ms;
     const INT vs = self->vs;
     const INT nvec = self->v;
     const INT mm = (m - 1) / 2;
     const INT mid = (m / 2) * ms;
     INT iv;

     for (iv = 0; iv < nvec; ++iv) {
          first->apply((plan *) first, cr, ci, cr, ci);

          /* For 4-way SIMD when (m+1)/2-1 is odd: iterate over an even
             vector length MM-1, then run the last iteration as a
             2-vector with vector stride 0.  The twiddle factors of the
             second half of that last iteration are bogus, but only the
             results of the first half are stored. */
          self->k(cr + ms, ci + ms, cr + (m - 1) * ms, ci + (m - 1) * ms,
                  self->td->W, self->rs, 1, mm, ms);
          self->k(cr + mm * ms, ci + mm * ms,
                  cr + (m - mm) * ms, ci + (m - mm) * ms,
                  self->td->W, self->rs, mm, mm + 2, 0);

          middle->apply((plan *) middle, cr + mid, ci + mid,
                        cr + mid, ci + mid);

          cr += vs;
          ci += vs;
     }
}
|
||||
|
||||
/*************************************************************
|
||||
Buffered code
|
||||
*************************************************************/
|
||||
|
||||
/* should not be 2^k to avoid associativity conflicts */
|
||||
/* Batch size used by the buffered code: RADIX rounded up to a multiple
   of 4, plus 2. */
static INT compute_batchsize(INT radix)
{
     INT rounded = (radix + 3) & -4;   /* round up to multiple of 4 */

     return rounded + 2;
}
|
||||
|
||||
/* Process codelet iterations [mb, me) through the scratch buffer BUFP:
   copy the "plus" and "minus" halves into the buffer, run the codelet
   there with stride 2, and copy the results back.  When EXTRA_ITER is
   nonzero the codelet runs one additional iteration on zeroed scratch
   data whose output is not copied back. */
static void dobatch(const P *ego, R *Rp, R *Ip, R *Rm, R *Im,
                    INT mb, INT me, INT extra_iter, R *bufp)
{
     const INT bstride = WS(ego->brs, 1);
     const INT rstride = WS(ego->rs, 1);
     const INT ms = ego->ms;
     const INT count = me - mb;
     const INT halfr = ego->r / 2;
     R *bufm = bufp + bstride - 2;
     R *rp = Rp + mb * ms;
     R *ip = Ip + mb * ms;
     R *rm = Rm - mb * ms;
     R *im = Im - mb * ms;

     /* gather: the minus half is walked backwards in both arrays */
     X(cpy2d_pair_ci)(rp, ip, bufp, bufp + 1,
                      halfr, rstride, bstride,
                      count, ms, 2);
     X(cpy2d_pair_ci)(rm, im, bufm, bufm + 1,
                      halfr, rstride, bstride,
                      count, -ms, -2);

     if (extra_iter) {
          /* Zero the extra_iter element.  Leaving it uninitialized
             would be acceptable, since its transformed result is
             ignored; zeroing avoids FP exceptions in case somebody is
             trapping them. */
          A(count < compute_batchsize(ego->r));
          X(zero1d_pair)(bufp + 2 * count, bufp + 1 + 2 * count,
                         halfr, bstride);
          X(zero1d_pair)(bufm - 2 * count, bufm + 1 - 2 * count,
                         halfr, bstride);
     }

     ego->k(bufp, bufp + 1, bufm, bufm + 1, ego->td->W,
            ego->brs, mb, me + extra_iter, 2);

     /* scatter the COUNT real iterations back to the arrays */
     X(cpy2d_pair_co)(bufp, bufp + 1, rp, ip,
                      halfr, bstride, rstride,
                      count, 2, ms);
     X(cpy2d_pair_co)(bufm, bufm + 1, rm, im,
                      halfr, bstride, rstride,
                      count, -2, -ms);
}
|
||||
|
||||
/* Buffered execution: like apply(), but the codelet iterations
   [1, (m+1)/2) are carried out in batches through a scratch buffer. */
static void apply_buf(const plan *ego_, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft2 *first = (plan_rdft2 *) ego->cld0;
     plan_rdft2 *middle = (plan_rdft2 *) ego->cldm;
     const INT ms = ego->ms;
     const INT nvec = ego->v;
     const INT batchsz = compute_batchsize(ego->r);
     const INT mb = 1;
     const INT me = (ego->m + 1) / 2;
     size_t bufsz = ego->r * batchsz * 2 * sizeof(R);
     R *buf;
     INT iv;

     BUF_ALLOC(R *, buf, bufsz);

     for (iv = 0; iv < nvec; ++iv) {
          R *Rp = cr;
          R *Ip = ci;
          R *Rm = cr + ego->m * ms;
          R *Im = ci + ego->m * ms;
          INT j = mb;

          first->apply((plan *) first, Rp, Ip, Rp, Ip);

          /* full batches, as long as a non-empty remainder is left */
          while (j + batchsz < me) {
               dobatch(ego, Rp, Ip, Rm, Im, j, j + batchsz, 0, buf);
               j += batchsz;
          }

          /* final batch, which also carries the extra iteration if any */
          dobatch(ego, Rp, Ip, Rm, Im, j, me, ego->extra_iter, buf);

          middle->apply((plan *) middle,
                        Rp + me * ms, Ip + me * ms,
                        Rp + me * ms, Ip + me * ms);

          cr += ego->vs;
          ci += ego->vs;
     }

     BUF_FREE(buf, bufsz);
}
|
||||
|
||||
/*************************************************************
|
||||
common code
|
||||
*************************************************************/
|
||||
/* Propagate the wakefulness change to both children and to the twiddle
   table of this plan. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     const INT n = self->r * self->m;
     const INT niter = (self->m - 1) / 2 + self->extra_iter;

     X(plan_awake)(self->cld0, wakefulness);
     X(plan_awake)(self->cldm, wakefulness);
     X(twiddle_awake)(wakefulness, &self->td, self->slv->desc->tw,
                      n, self->r, niter);
}
|
||||
|
||||
/* Release what the plan owns: the two child plans and the two stride
   objects. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cld0);
     X(plan_destroy_internal)(self->cldm);
     X(stride_destroy)(self->rs);
     X(stride_destroy)(self->brs);
}
|
||||
|
||||
/* Print the plan; the buffered variant additionally reports its batch
   size. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const S *solver = self->slv;
     const hc2c_desc *desc = solver->desc;

     if (solver->bufferedp) {
          p->print(p, "(hc2c-directbuf/%D-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
                   compute_batchsize(self->r),
                   self->r, X(twiddle_length)(self->r, desc->tw),
                   self->extra_iter, self->v, desc->nam,
                   self->cld0, self->cldm);
     } else {
          p->print(p, "(hc2c-direct-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
                   self->r, X(twiddle_length)(self->r, desc->tw),
                   self->extra_iter, self->v, desc->nam,
                   self->cld0, self->cldm);
     }
}
|
||||
|
||||
static int applicable0(const S *ego, rdft_kind kind,
|
||||
INT r, INT rs,
|
||||
INT m, INT ms,
|
||||
INT v, INT vs,
|
||||
const R *cr, const R *ci,
|
||||
const planner *plnr,
|
||||
INT *extra_iter)
|
||||
{
|
||||
const hc2c_desc *e = ego->desc;
|
||||
UNUSED(v);
|
||||
|
||||
return (
|
||||
1
|
||||
&& r == e->radix
|
||||
&& kind == e->genus->kind
|
||||
|
||||
/* first v-loop iteration */
|
||||
&& ((*extra_iter = 0,
|
||||
e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
|
||||
rs, 1, (m+1)/2, ms, plnr))
|
||||
||
|
||||
(*extra_iter = 1,
|
||||
((e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
|
||||
rs, 1, (m-1)/2, ms, plnr))
|
||||
&&
|
||||
(e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
|
||||
rs, (m-1)/2, (m-1)/2 + 2, 0, plnr)))))
|
||||
|
||||
/* subsequent v-loop iterations */
|
||||
&& (cr += vs, ci += vs, 1)
|
||||
|
||||
&& e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
|
||||
rs, 1, (m+1)/2 - *extra_iter, ms, plnr)
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable0_buf(const S *ego, rdft_kind kind,
|
||||
INT r, INT rs,
|
||||
INT m, INT ms,
|
||||
INT v, INT vs,
|
||||
const R *cr, const R *ci,
|
||||
const planner *plnr, INT *extra_iter)
|
||||
{
|
||||
const hc2c_desc *e = ego->desc;
|
||||
INT batchsz, brs;
|
||||
UNUSED(v); UNUSED(rs); UNUSED(ms); UNUSED(vs);
|
||||
|
||||
return (
|
||||
1
|
||||
&& r == e->radix
|
||||
&& kind == e->genus->kind
|
||||
|
||||
/* ignore cr, ci, use buffer */
|
||||
&& (cr = (const R *)0, ci = cr + 1,
|
||||
batchsz = compute_batchsize(r),
|
||||
brs = 4 * batchsz, 1)
|
||||
|
||||
&& e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
|
||||
brs, 1, 1+batchsz, 2, plnr)
|
||||
|
||||
&& ((*extra_iter = 0,
|
||||
e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
|
||||
brs, 1, 1 + (((m-1)/2) % batchsz), 2, plnr))
|
||||
||
|
||||
(*extra_iter = 1,
|
||||
e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
|
||||
brs, 1, 1 + 1 + (((m-1)/2) % batchsz), 2, plnr)))
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, rdft_kind kind,
|
||||
INT r, INT rs,
|
||||
INT m, INT ms,
|
||||
INT v, INT vs,
|
||||
R *cr, R *ci,
|
||||
const planner *plnr, INT *extra_iter)
|
||||
{
|
||||
if (ego->bufferedp) {
|
||||
if (!applicable0_buf(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
|
||||
extra_iter))
|
||||
return 0;
|
||||
} else {
|
||||
if (!applicable0(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
|
||||
extra_iter))
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
|
||||
v, m * r, r))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static plan *mkcldw(const hc2c_solver *ego_, rdft_kind kind,
|
||||
INT r, INT rs,
|
||||
INT m, INT ms,
|
||||
INT v, INT vs,
|
||||
R *cr, R *ci,
|
||||
planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
P *pln;
|
||||
const hc2c_desc *e = ego->desc;
|
||||
plan *cld0 = 0, *cldm = 0;
|
||||
INT imid = (m / 2) * ms;
|
||||
INT extra_iter;
|
||||
|
||||
static const plan_adt padt = {
|
||||
0, awake, print, destroy
|
||||
};
|
||||
|
||||
if (!applicable(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
|
||||
&extra_iter))
|
||||
return (plan *)0;
|
||||
|
||||
cld0 = X(mkplan_d)(
|
||||
plnr,
|
||||
X(mkproblem_rdft2_d)(X(mktensor_1d)(r, rs, rs),
|
||||
X(mktensor_0d)(),
|
||||
TAINT(cr, vs), TAINT(ci, vs),
|
||||
TAINT(cr, vs), TAINT(ci, vs),
|
||||
kind));
|
||||
if (!cld0) goto nada;
|
||||
|
||||
cldm = X(mkplan_d)(
|
||||
plnr,
|
||||
X(mkproblem_rdft2_d)(((m % 2) ?
|
||||
X(mktensor_0d)() : X(mktensor_1d)(r, rs, rs) ),
|
||||
X(mktensor_0d)(),
|
||||
TAINT(cr + imid, vs), TAINT(ci + imid, vs),
|
||||
TAINT(cr + imid, vs), TAINT(ci + imid, vs),
|
||||
kind == R2HC ? R2HCII : HC2RIII));
|
||||
if (!cldm) goto nada;
|
||||
|
||||
if (ego->bufferedp)
|
||||
pln = MKPLAN_HC2C(P, &padt, apply_buf);
|
||||
else
|
||||
pln = MKPLAN_HC2C(P, &padt, extra_iter ? apply_extra_iter : apply);
|
||||
|
||||
pln->k = ego->k;
|
||||
pln->td = 0;
|
||||
pln->r = r; pln->rs = X(mkstride)(r, rs);
|
||||
pln->m = m; pln->ms = ms;
|
||||
pln->v = v; pln->vs = vs;
|
||||
pln->slv = ego;
|
||||
pln->brs = X(mkstride)(r, 4 * compute_batchsize(r));
|
||||
pln->cld0 = cld0;
|
||||
pln->cldm = cldm;
|
||||
pln->extra_iter = extra_iter;
|
||||
|
||||
X(ops_zero)(&pln->super.super.ops);
|
||||
X(ops_madd2)(v * (((m - 1) / 2) / e->genus->vl),
|
||||
&e->ops, &pln->super.super.ops);
|
||||
X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
|
||||
X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
|
||||
|
||||
if (ego->bufferedp)
|
||||
pln->super.super.ops.other += 4 * r * m * v;
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld0);
|
||||
X(plan_destroy_internal)(cldm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void regone(planner *plnr, khc2c codelet,
|
||||
const hc2c_desc *desc,
|
||||
hc2c_kind hc2ckind,
|
||||
int bufferedp)
|
||||
{
|
||||
S *slv = (S *)X(mksolver_hc2c)(sizeof(S), desc->radix, hc2ckind, mkcldw);
|
||||
slv->k = codelet;
|
||||
slv->desc = desc;
|
||||
slv->bufferedp = bufferedp;
|
||||
REGISTER_SOLVER(plnr, &(slv->super.super));
|
||||
}
|
||||
|
||||
void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet,
|
||||
const hc2c_desc *desc,
|
||||
hc2c_kind hc2ckind)
|
||||
{
|
||||
regone(plnr, codelet, desc, hc2ckind, /* bufferedp */0);
|
||||
regone(plnr, codelet, desc, hc2ckind, /* bufferedp */1);
|
||||
}
|
||||
296
fftw-3.3.10/rdft/ct-hc2c.c
Normal file
296
fftw-3.3.10/rdft/ct-hc2c.c
Normal file
@@ -0,0 +1,296 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ct-hc2c.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
plan_rdft2 super;
|
||||
plan *cld;
|
||||
plan *cldw;
|
||||
INT r;
|
||||
} P;
|
||||
|
||||
/* DIT via rdft: run the child rdft plan from r0 into cr, then the
   hc2c plan in place on (cr, ci).  r1 is not used. */
static void apply_dit(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_rdft *child = (plan_rdft *) self->cld;
     plan_hc2c *twiddle = (plan_hc2c *) self->cldw;
     UNUSED(r1);

     child->apply(self->cld, r0, cr);
     twiddle->apply(self->cldw, cr, ci);
}
|
||||
|
||||
/* DIF via rdft: run the hc2c plan in place on (cr, ci), then the child
   rdft plan from cr into r0.  r1 is not used. */
static void apply_dif(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_hc2c *twiddle = (plan_hc2c *) self->cldw;
     plan_rdft *child = (plan_rdft *) self->cld;
     UNUSED(r1);

     twiddle->apply(self->cldw, cr, ci);
     child->apply(self->cld, cr, r0);
}
|
||||
|
||||
/* DIT via dft: run the child dft plan from (r0, r1) into (cr, ci),
   then the hc2c plan in place on (cr, ci). */
static void apply_dit_dft(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_dft *child = (plan_dft *) self->cld;
     plan_hc2c *twiddle = (plan_hc2c *) self->cldw;

     child->apply(self->cld, r0, r1, cr, ci);
     twiddle->apply(self->cldw, cr, ci);
}
|
||||
|
||||
/* DIF via dft: run the hc2c plan in place on (cr, ci), then the child
   dft plan.  Note the argument order of the child call: (ci, cr) is
   passed as input and (r1, r0) as output. */
static void apply_dif_dft(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_hc2c *twiddle = (plan_hc2c *) self->cldw;
     plan_dft *child = (plan_dft *) self->cld;

     twiddle->apply(self->cldw, cr, ci);
     child->apply(self->cld, ci, cr, r1, r0);
}
|
||||
|
||||
/* Forward the wakefulness change to both child plans. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld, wakefulness);
     X(plan_awake)(self->cldw, wakefulness);
}
|
||||
|
||||
/* Destroy both child plans (twiddle plan first). */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cldw);
     X(plan_destroy_internal)(self->cld);
}
|
||||
|
||||
/* Print the plan as "dit" or "dif" (derived from the installed apply
   function), followed by the radix and both children. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const char *flavor;

     if (self->super.apply == apply_dit ||
         self->super.apply == apply_dit_dft)
          flavor = "dit";
     else
          flavor = "dif";

     p->print(p, "(rdft2-ct-%s/%D%(%p%)%(%p%))",
              flavor, self->r, self->cldw, self->cld);
}
|
||||
|
||||
/* Basic applicability: a rank-1 transform with vector rank <= 1, of
   kind R2HC or HC2R, for which a radix r with 0 < r < n can be chosen. */
static int applicable0(const hc2c_solver *ego, const problem *p_, planner *plnr)
{
     const problem_rdft2 *p = (const problem_rdft2 *) p_;
     INT r;

     if (p->sz->rnk != 1)
          return 0;
     if (p->vecsz->rnk > 1)
          return 0;

     if (p->kind == R2HC) {
          /* the problem is R2HC, which is solved by DIT */
     } else if (p->kind == HC2R &&
                (p->r0 == p->cr || !NO_DESTROY_INPUTP(plnr))) {
          /* the problem is HC2R, which is solved by DIF; DIF destroys
             the input, so it must be in place or destruction allowed */
     } else {
          return 0;
     }

     r = X(choose_radix)(ego->r, p->sz->dims[0].n);
     if (!(r > 0))
          return 0;

     return p->sz->dims[0].n > r;
}
|
||||
|
||||
static int hc2c_applicable(const hc2c_solver *ego, const problem *p_,
|
||||
planner *plnr)
|
||||
{
|
||||
const problem_rdft2 *p;
|
||||
|
||||
if (!applicable0(ego, p_, plnr))
|
||||
return 0;
|
||||
|
||||
p = (const problem_rdft2 *) p_;
|
||||
|
||||
return (0
|
||||
|| p->vecsz->rnk == 0
|
||||
|| !NO_VRECURSEP(plnr)
|
||||
);
|
||||
}
|
||||
|
||||
/* Build an rdft2 plan of size n = r * m out of a twiddle (hc2c) child
   obtained from the solver's mkcldw hook and a child rdft or dft plan,
   depending on the solver's hc2ckind.  R2HC problems get a DIT plan,
   HC2R problems a DIF plan.  Returns a null plan on failure. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft2_solve), awake, print, destroy
     };

     const hc2c_solver *ego = (const hc2c_solver *) ego_;
     const problem_rdft2 *p;
     P *pln = 0;
     plan *cld = 0, *cldw = 0;
     INT n, r, m, v, ivs, ovs;
     iodim *d;

     if (!hc2c_applicable(ego, p_, plnr))
          return (plan *) 0;

     p = (const problem_rdft2 *) p_;
     d = p->sz->dims;
     n = d[0].n;
     r = X(choose_radix)(ego->r, n);
     A((r % 2) == 0);
     m = n / r;

     X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);

     switch (p->kind) {
         case R2HC:
              /* DIT: the twiddle step works on the output arrays */
              cldw = ego->mkcldw(ego, R2HC,
                                 r, m * d[0].os,
                                 m, d[0].os,
                                 v, ovs,
                                 p->cr, p->ci, plnr);
              if (!cldw) goto nada;

              if (ego->hc2ckind == HC2C_VIA_RDFT) {
                   cld = X(mkplan_d)(
                        plnr,
                        X(mkproblem_rdft_1_d)(
                             X(mktensor_1d)(m, (r/2)*d[0].is, d[0].os),
                             X(mktensor_3d)(
                                  2, p->r1 - p->r0, p->ci - p->cr,
                                  r / 2, d[0].is, m * d[0].os,
                                  v, ivs, ovs),
                             p->r0, p->cr, R2HC)
                        );
                   if (!cld) goto nada;

                   pln = MKPLAN_RDFT2(P, &padt, apply_dit);
              } else if (ego->hc2ckind == HC2C_VIA_DFT) {
                   cld = X(mkplan_d)(
                        plnr,
                        X(mkproblem_dft_d)(
                             X(mktensor_1d)(m, (r/2)*d[0].is, d[0].os),
                             X(mktensor_2d)(
                                  r / 2, d[0].is, m * d[0].os,
                                  v, ivs, ovs),
                             p->r0, p->r1, p->cr, p->ci)
                        );
                   if (!cld) goto nada;

                   pln = MKPLAN_RDFT2(P, &padt, apply_dit_dft);
              }
              break;

         case HC2R:
              /* DIF: the twiddle step works on the input arrays */
              cldw = ego->mkcldw(ego, HC2R,
                                 r, m * d[0].is,
                                 m, d[0].is,
                                 v, ivs,
                                 p->cr, p->ci, plnr);
              if (!cldw) goto nada;

              if (ego->hc2ckind == HC2C_VIA_RDFT) {
                   cld = X(mkplan_d)(
                        plnr,
                        X(mkproblem_rdft_1_d)(
                             X(mktensor_1d)(m, d[0].is, (r/2)*d[0].os),
                             X(mktensor_3d)(
                                  2, p->ci - p->cr, p->r1 - p->r0,
                                  r / 2, m * d[0].is, d[0].os,
                                  v, ivs, ovs),
                             p->cr, p->r0, HC2R)
                        );
                   if (!cld) goto nada;

                   pln = MKPLAN_RDFT2(P, &padt, apply_dif);
              } else if (ego->hc2ckind == HC2C_VIA_DFT) {
                   cld = X(mkplan_d)(
                        plnr,
                        X(mkproblem_dft_d)(
                             X(mktensor_1d)(m, d[0].is, (r/2)*d[0].os),
                             X(mktensor_2d)(
                                  r / 2, m * d[0].is, d[0].os,
                                  v, ivs, ovs),
                             p->ci, p->cr, p->r1, p->r0)
                        );
                   if (!cld) goto nada;

                   pln = MKPLAN_RDFT2(P, &padt, apply_dif_dft);
              }
              break;

         default:
              A(0);
     }

     pln->cld = cld;
     pln->cldw = cldw;
     pln->r = r;
     X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);

     /* inherit could_prune_now_p attribute from cldw */
     pln->super.super.could_prune_now_p = cldw->could_prune_now_p;

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cldw);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
|
||||
|
||||
hc2c_solver *X(mksolver_hc2c)(size_t size, INT r,
|
||||
hc2c_kind hc2ckind,
|
||||
hc2c_mkinferior mkcldw)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
|
||||
hc2c_solver *slv = (hc2c_solver *)X(mksolver)(size, &sadt);
|
||||
slv->r = r;
|
||||
slv->hc2ckind = hc2ckind;
|
||||
slv->mkcldw = mkcldw;
|
||||
return slv;
|
||||
}
|
||||
|
||||
/* Allocate a plan of SIZE bytes through X(mkplan) and install APPLY as
   its hc2c executor. */
plan *X(mkplan_hc2c)(size_t size, const plan_adt *adt, hc2capply apply)
{
     plan_hc2c *pln = (plan_hc2c *) X(mkplan)(size, adt);

     pln->apply = apply;

     return &(pln->super);
}
|
||||
57
fftw-3.3.10/rdft/ct-hc2c.h
Normal file
57
fftw-3.3.10/rdft/ct-hc2c.h
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef void (*hc2capply) (const plan *ego, R *cr, R *ci);
|
||||
typedef struct hc2c_solver_s hc2c_solver;
|
||||
typedef plan *(*hc2c_mkinferior)(const hc2c_solver *ego, rdft_kind kind,
|
||||
INT r, INT rs,
|
||||
INT m, INT ms,
|
||||
INT v, INT vs,
|
||||
R *cr, R *ci,
|
||||
planner *plnr);
|
||||
|
||||
typedef struct {
|
||||
plan super;
|
||||
hc2capply apply;
|
||||
} plan_hc2c;
|
||||
|
||||
extern plan *X(mkplan_hc2c)(size_t size, const plan_adt *adt,
|
||||
hc2capply apply);
|
||||
|
||||
#define MKPLAN_HC2C(type, adt, apply) \
|
||||
(type *)X(mkplan_hc2c)(sizeof(type), adt, apply)
|
||||
|
||||
struct hc2c_solver_s {
|
||||
solver super;
|
||||
INT r;
|
||||
|
||||
hc2c_mkinferior mkcldw;
|
||||
hc2c_kind hc2ckind;
|
||||
};
|
||||
|
||||
hc2c_solver *X(mksolver_hc2c)(size_t size, INT r,
|
||||
hc2c_kind hc2ckind,
|
||||
hc2c_mkinferior mkcldw);
|
||||
|
||||
void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet,
|
||||
const hc2c_desc *desc,
|
||||
hc2c_kind hc2ckind);
|
||||
194
fftw-3.3.10/rdft/dft-r2hc.c
Normal file
194
fftw-3.3.10/rdft/dft-r2hc.c
Normal file
@@ -0,0 +1,194 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* Compute the complex DFT by combining R2HC RDFTs on the real
|
||||
and imaginary parts. This could be useful for people just wanting
|
||||
to link to the real codelets and not the complex ones. It could
|
||||
also even be faster than the complex algorithms for split (as opposed
|
||||
to interleaved) real/imag complex data. */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
plan *cld;
|
||||
INT ishift, oshift;
|
||||
INT os;
|
||||
INT n;
|
||||
} P;
|
||||
|
||||
/* Execute the plan: run the child rdft plan on the (shifted) input and
   output pointers, then recombine mirrored output pairs (i, n-i) of the
   ro/io arrays in place.  ii is not used. */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     plan_rdft *child = (plan_rdft *) self->cld;
     INT n, os, lo, hi;

     UNUSED(ii);

     /* transform vector of real & imag parts: */
     child->apply((plan *) child, ri + self->ishift, ro + self->oshift);

     n = self->n;
     if (!(n > 1))
          return;

     os = self->os;
     for (lo = 1, hi = n - 1; lo < (n + 1)/2; ++lo, --hi) {
          /* load all four values before any of them is overwritten */
          E rop = ro[os * lo];
          E iop = io[os * lo];
          E rom = ro[os * hi];
          E iom = io[os * hi];

          ro[os * lo] = rop - iom;
          io[os * lo] = iop + rom;
          ro[os * hi] = rop + iom;
          io[os * hi] = iop - rom;
     }
}
|
||||
|
||||
/* Forward the wakefulness change to the child plan. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;
     plan *child = pln->cld;

     X(plan_awake)(child, wakefulness);
}
|
||||
|
||||
/* Destroy the child plan owned by this plan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;
     plan *child = pln->cld;

     X(plan_destroy_internal)(child);
}
|
||||
|
||||
/* Print this plan: its name, the length n, and the nested child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     INT len = pln->n;
     plan *child = pln->cld;

     p->print(p, "(dft-r2hc-%D%(%p%))", len, child);
}
|
||||
|
||||
|
||||
static int applicable0(const problem *p_)
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
return ((p->sz->rnk == 1 && p->vecsz->rnk == 0)
|
||||
|| (p->sz->rnk == 0 && FINITE_RNK(p->vecsz->rnk))
|
||||
);
|
||||
}
|
||||
|
||||
/* Return nonzero when the distance between pointers r and i (in elements)
   is at least n * |s|. */
static int splitp(R *r, R *i, INT n, INT s)
{
     INT abs_stride = (s > 0 ? s : 0-s);

     if (r > i)
          return ((r - i) >= n * abs_stride);
     return ((i - r) >= n * abs_stride);
}
|
||||
|
||||
static int applicable(const problem *p_, const planner *plnr)
|
||||
{
|
||||
if (!applicable0(p_)) return 0;
|
||||
|
||||
{
|
||||
const problem_dft *p = (const problem_dft *) p_;
|
||||
|
||||
/* rank-0 problems are always OK */
|
||||
if (p->sz->rnk == 0) return 1;
|
||||
|
||||
/* this solver is ok for split arrays */
|
||||
if (p->sz->rnk == 1 &&
|
||||
splitp(p->ri, p->ii, p->sz->dims[0].n, p->sz->dims[0].is) &&
|
||||
splitp(p->ro, p->io, p->sz->dims[0].n, p->sz->dims[0].os))
|
||||
return 1;
|
||||
|
||||
return !(NO_DFT_R2HCP(plnr));
|
||||
}
|
||||
}
|
||||
|
||||
/* Build a plan for problem p_, or return a null plan when the solver is
   not applicable or no child plan could be created.

   The child is an R2HC problem over the original size tensor, with a
   length-2 vector dimension (strides ii - ri and io - ro) prepended to
   the original vector tensor.  Negative input strides in that vector
   are flipped, and the base pointers are shifted to compensate. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(dft_solve), awake, print, destroy
     };
     const problem_dft *p;
     tensor *ri_vec, *cld_vec;
     plan *cld;
     P *pln;
     INT ishift = 0, oshift = 0;
     INT half;
     int d;

     UNUSED(ego_);
     if (!applicable(p_, plnr))
          return (plan *)0;

     p = (const problem_dft *) p_;

     ri_vec = X(mktensor_1d)(2, p->ii - p->ri, p->io - p->ro);
     cld_vec = X(tensor_append)(ri_vec, p->vecsz);

     /* make all istrides > 0 */
     for (d = 0; d < cld_vec->rnk; ++d) {
          INT nm1;

          if (!(cld_vec->dims[d].is < 0))
               continue;

          nm1 = cld_vec->dims[d].n - 1;
          cld_vec->dims[d].is = -cld_vec->dims[d].is;
          ishift -= nm1 * cld_vec->dims[d].is;
          cld_vec->dims[d].os = -cld_vec->dims[d].os;
          oshift -= nm1 * cld_vec->dims[d].os;
     }

     cld = X(mkplan_d)(plnr,
                       X(mkproblem_rdft_1)(p->sz, cld_vec,
                                           p->ri + ishift,
                                           p->ro + oshift, R2HC));
     X(tensor_destroy2)(ri_vec, cld_vec);
     if (!cld)
          return (plan *)0;

     pln = MKPLAN_DFT(P, &padt, apply);

     if (p->sz->rnk == 0) {
          pln->n = 1;
          pln->os = 0;
     } else {
          pln->n = p->sz->dims[0].n;
          pln->os = p->sz->dims[0].os;
     }
     pln->ishift = ishift;
     pln->oshift = oshift;
     pln->cld = cld;

     /* cost: the child's operations plus the combining loop in apply() */
     half = (pln->n - 1)/2;
     pln->super.super.ops = cld->ops;
     pln->super.super.ops.other += 8 * half;
     pln->super.super.ops.add += 4 * half;
     pln->super.super.ops.other += 1; /* estimator hack for nop plans */

     return &(pln->super.super);
}
|
||||
|
||||
/* constructor */
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register this file's solver with planner p. */
void X(dft_r2hc_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
|
||||
144
fftw-3.3.10/rdft/dht-r2hc.c
Normal file
144
fftw-3.3.10/rdft/dht-r2hc.c
Normal file
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* Solve a DHT problem (Discrete Hartley Transform) via post-processing
|
||||
of an R2HC problem. */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
plan *cld;
|
||||
INT os;
|
||||
INT n;
|
||||
} P;
|
||||
|
||||
/* Execute the plan: run the child plan from I to O, then replace each
   pair O[os*lo], O[os*hi] (lo + hi == n, lo < hi) by its sum and
   difference.  FFT_SIGN selects which element receives the difference. */
static void apply(const plan *ego_, R *I, R *O)
{
     const P *pln = (const P *) ego_;
     plan_rdft *child = (plan_rdft *) pln->cld;
     INT ostr = pln->os;
     INT len = pln->n;
     INT lo, hi;

     child->apply((plan *) child, I, O);

     for (lo = 1, hi = len - 1; lo < hi; ++lo, --hi) {
          E a = O[ostr * lo];
          E b = O[ostr * hi];
#if FFT_SIGN == -1
          O[ostr * lo] = a - b;
          O[ostr * hi] = a + b;
#else
          O[ostr * lo] = a + b;
          O[ostr * hi] = a - b;
#endif
     }
}
|
||||
|
||||
/* Forward the wakefulness change to the child plan. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;
     plan *child = pln->cld;

     X(plan_awake)(child, wakefulness);
}
|
||||
|
||||
/* Destroy the child plan owned by this plan. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;
     plan *child = pln->cld;

     X(plan_destroy_internal)(child);
}
|
||||
|
||||
/* Print this plan: its name, the length n, and the nested child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     INT len = pln->n;
     plan *child = pln->cld;

     p->print(p, "(dht-r2hc-%D%(%p%))", len, child);
}
|
||||
|
||||
static int applicable0(const problem *p_, const planner *plnr)
|
||||
{
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
return (1
|
||||
&& !NO_DHT_R2HCP(plnr)
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk == 0
|
||||
&& p->kind[0] == DHT
|
||||
);
|
||||
}
|
||||
|
||||
/* Full applicability test: rejected when the planner's NO_SLOWP flag is
   set, otherwise decided by applicable0().  The solver argument is
   unused. */
static int applicable(const solver *ego, const problem *p, const planner *plnr)
{
     UNUSED(ego);

     if (NO_SLOWP(plnr))
          return 0;

     return (applicable0(p, plnr) != 0);
}
|
||||
|
||||
/* Build a plan for problem p_, or return a null plan when the solver is
   not applicable or no child plan could be created.  The child solves an
   R2HC problem with the same tensors and arrays as p_. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };
     const problem_rdft *p;
     plan *cld;
     P *pln;
     INT half;

     if (!applicable(ego_, p_, plnr))
          return (plan *)0;

     p = (const problem_rdft *) p_;

     /* NO_DHT_R2HC stops infinite loops with rdft-dht.c */
     cld = X(mkplan_f_d)(plnr,
                         X(mkproblem_rdft_1)(p->sz, p->vecsz,
                                             p->I, p->O, R2HC),
                         NO_DHT_R2HC, 0, 0);
     if (!cld)
          return (plan *)0;

     pln = MKPLAN_RDFT(P, &padt, apply);
     pln->n = p->sz->dims[0].n;
     pln->os = p->sz->dims[0].os;
     pln->cld = cld;

     /* cost: the child's operations plus the post-processing loop */
     half = (pln->n - 1)/2;
     pln->super.super.ops = cld->ops;
     pln->super.super.ops.other += 4 * half;
     pln->super.super.ops.add += 2 * half;

     return &(pln->super.super);
}
|
||||
|
||||
/* constructor */
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register this file's solver with planner p. */
void X(dht_r2hc_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
|
||||
386
fftw-3.3.10/rdft/dht-rader.c
Normal file
386
fftw-3.3.10/rdft/dht-rader.c
Normal file
@@ -0,0 +1,386 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/*
|
||||
* Compute DHTs of prime sizes using Rader's trick: turn them
|
||||
* into convolutions of size n - 1, which we then perform via a pair
|
||||
* of FFTs. (We can then do prime real FFTs via rdft-dht.c.)
|
||||
*
|
||||
* Optionally (determined by the "pad" field of the solver), we can
|
||||
* perform the (cyclic) convolution by zero-padding to a size
|
||||
* >= 2*(n-1) - 1. This is advantageous if n-1 has large prime factors.
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int pad;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
|
||||
plan *cld1, *cld2;
|
||||
R *omega;
|
||||
INT n, npad, g, ginv;
|
||||
INT is, os;
|
||||
plan *cld_omega;
|
||||
} P;
|
||||
|
||||
static rader_tl *omegas = 0;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
/* If R2HC_ONLY_CONV is 1, we use a trick to perform the convolution
|
||||
purely in terms of R2HC transforms, as opposed to R2HC followed by HC2R.
|
||||
This requires a few more operations, but allows us to share the same
|
||||
plan/codelets for both Rader children. */
|
||||
#define R2HC_ONLY_CONV 1
|
||||
|
||||
/* Execute the plan on input I and output O.

   Steps: gather the input at indices g^k mod n into a temporary buffer
   (zero-padded up to npad), transform it in place with cld1, multiply
   pointwise by the omega table, transform again with cld2, and scatter
   the result to the output at indices ginv^k mod n.  O[0] receives
   I[0] plus the first element of the first transform. */
static void apply(const plan *ego_, R *I, R *O)
{
     const P *pln = (const P *) ego_;
     const INT n = pln->n;       /* prime */
     const INT npad = pln->npad; /* == n - 1 for unpadded Rader; always even */
     const INT is = pln->is;
     plan_rdft *first, *second;
     INT os, k, gpower, g;
     R *buf, *omega;
     R r0;

     buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);

     /* First, permute the input, storing in buf: */
     g = pln->g;
     gpower = 1;
     for (k = 0; k < n - 1; ++k) {
          buf[k] = I[gpower * is];
          gpower = MULMOD(gpower, g, n);
     }
     /* gpower == g^(n-1) mod n == 1 */

     A(n - 1 <= npad);
     for (k = n - 1; k < npad; ++k) /* optionally, zero-pad convolution */
          buf[k] = 0;

     os = pln->os;

     /* compute RDFT of buf, storing in buf (i.e., in-place): */
     first = (plan_rdft *) pln->cld1;
     first->apply((plan *) first, buf, buf);

     /* set output DC component: */
     r0 = I[0];
     O[0] = r0 + buf[0];

     /* now, multiply by omega: */
     omega = pln->omega;
     buf[0] *= omega[0];
     for (k = 1; k < npad/2; ++k) {
          E rW = omega[k];
          E iW = omega[npad - k];
          E rB = buf[k];
          E iB = buf[npad - k];
          E a = rW * rB - iW * iB;
          E b = rW * iB + iW * rB;
#if R2HC_ONLY_CONV
          buf[k] = a + b;
          buf[npad - k] = a - b;
#else
          buf[k] = a;
          buf[npad - k] = b;
#endif
     }
     /* Nyquist component: */
     A(k + k == npad); /* since npad is even */
     buf[k] *= omega[k];

     /* this will add input[0] to all of the outputs after the ifft */
     buf[0] += r0;

     /* inverse FFT: */
     second = (plan_rdft *) pln->cld2;
     second->apply((plan *) second, buf, buf);

     /* do inverse permutation to unshuffle the output: */
     A(gpower == 1);
#if R2HC_ONLY_CONV
     O[os] = buf[0];
     gpower = g = pln->ginv;
     A(npad == n - 1 || npad/2 >= n - 1);
     if (npad == n - 1) {
          k = 1;
          while (k < npad/2) {
               O[gpower * os] = buf[k] + buf[npad - k];
               ++k;
               gpower = MULMOD(gpower, g, n);
          }

          /* middle element */
          O[gpower * os] = buf[k];
          ++k;
          gpower = MULMOD(gpower, g, n);

          while (k < npad) {
               O[gpower * os] = buf[npad - k] - buf[k];
               ++k;
               gpower = MULMOD(gpower, g, n);
          }
     } else {
          k = 1;
          while (k < n - 1) {
               O[gpower * os] = buf[k] + buf[npad - k];
               ++k;
               gpower = MULMOD(gpower, g, n);
          }
     }
#else
     g = pln->ginv;
     k = 0;
     while (k < n - 1) {
          O[gpower * os] = buf[k];
          ++k;
          gpower = MULMOD(gpower, g, n);
     }
#endif
     A(gpower == 1);

     X(ifree)(buf);
}
|
||||
|
||||
static R *mkomega(enum wakefulness wakefulness,
|
||||
plan *p_, INT n, INT npad, INT ginv)
|
||||
{
|
||||
plan_rdft *p = (plan_rdft *) p_;
|
||||
R *omega;
|
||||
INT i, gpower;
|
||||
trigreal scale;
|
||||
triggen *t;
|
||||
|
||||
if ((omega = X(rader_tl_find)(n, npad + 1, ginv, omegas)))
|
||||
return omega;
|
||||
|
||||
omega = (R *)MALLOC(sizeof(R) * npad, TWIDDLES);
|
||||
|
||||
scale = npad; /* normalization for convolution */
|
||||
|
||||
t = X(mktriggen)(wakefulness, n);
|
||||
for (i = 0, gpower = 1; i < n-1; ++i, gpower = MULMOD(gpower, ginv, n)) {
|
||||
trigreal w[2];
|
||||
t->cexpl(t, gpower, w);
|
||||
omega[i] = (w[0] + w[1]) / scale;
|
||||
}
|
||||
X(triggen_destroy)(t);
|
||||
A(gpower == 1);
|
||||
|
||||
A(npad == n - 1 || npad >= 2*(n - 1) - 1);
|
||||
|
||||
for (; i < npad; ++i)
|
||||
omega[i] = K(0.0);
|
||||
if (npad > n - 1)
|
||||
for (i = 1; i < n-1; ++i)
|
||||
omega[npad - i] = omega[n - 1 - i];
|
||||
|
||||
p->apply(p_, omega, omega);
|
||||
|
||||
X(rader_tl_insert)(n, npad + 1, ginv, omega, &omegas);
|
||||
return omega;
|
||||
}
|
||||
|
||||
/* Hand table omega back to the omegas list via rader_tl_delete. */
static void free_omega(R *omega)
{
     rader_tl **list = &omegas;

     X(rader_tl_delete)(omega, list);
}
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
/* Change the wakefulness of the plan and its three children.  Going
   SLEEPY releases the omega table; any other state recomputes g and
   ginv and builds the table with mkomega(). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cld1, wakefulness);
     X(plan_awake)(pln->cld2, wakefulness);
     X(plan_awake)(pln->cld_omega, wakefulness);

     if (wakefulness == SLEEPY) {
          free_omega(pln->omega);
          pln->omega = 0;
          return;
     }

     pln->g = X(find_generator)(pln->n);
     pln->ginv = X(power_mod)(pln->g, pln->n - 2, pln->n);
     A(MULMOD(pln->g, pln->ginv, pln->n) == 1);

     A(!pln->omega);
     pln->omega = mkomega(wakefulness,
                          pln->cld_omega, pln->n, pln->npad, pln->ginv);
}
|
||||
|
||||
/* Destroy the three child plans, in the order cld_omega, cld2, cld1. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;
     plan *omega_plan = pln->cld_omega;
     plan *second = pln->cld2;
     plan *first = pln->cld1;

     X(plan_destroy_internal)(omega_plan);
     X(plan_destroy_internal)(second);
     X(plan_destroy_internal)(first);
}
|
||||
|
||||
/* Print this plan: sizes, strides and cld1, followed by cld2 and
   cld_omega when they are distinct from the plans already printed. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     plan *first = pln->cld1;
     plan *second = pln->cld2;
     plan *omega_plan = pln->cld_omega;

     p->print(p, "(dht-rader-%D/%D%ois=%oos=%(%p%)",
              pln->n, pln->npad, pln->is, pln->os, first);

     if (second != first)
          p->print(p, "%(%p%)", second);

     if (omega_plan != first && omega_plan != second)
          p->print(p, "%(%p%)", omega_plan);

     p->putchr(p, ')');
}
|
||||
|
||||
static int applicable(const solver *ego, const problem *p_, const planner *plnr)
|
||||
{
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
UNUSED(ego);
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk == 0
|
||||
&& p->kind[0] == DHT
|
||||
&& X(is_prime)(p->sz->dims[0].n)
|
||||
&& p->sz->dims[0].n > 2
|
||||
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
|
||||
/* proclaim the solver SLOW if p-1 is not easily
|
||||
factorizable. Unlike in the complex case where
|
||||
Bluestein can solve the problem, in the DHT case we
|
||||
may have no other choice */
|
||||
&& CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
|
||||
);
|
||||
}
|
||||
|
||||
/* Return the smallest even size >= minsz accepted by factors_into for
   the prime list {2, 3, 5}. */
static INT choose_transform_size(INT minsz)
{
     static const INT primes[] = { 2, 3, 5, 0 };
     INT sz = minsz;

     for (;;) {
          if (X(factors_into)(sz, primes) && !(sz % 2))
               return sz;
          ++sz;
     }
}
|
||||
|
||||
/* Build a plan for problem p_, or return null when the solver is not
   applicable or any of the three child plans cannot be created.

   All children are in-place, unit-stride, length-npad transforms planned
   against a scratch buffer that is freed before returning.  npad is
   n - 1, or a padded size when the solver's pad flag is set. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };
     const S *slv = (const S *) ego_;
     const problem_rdft *p = (const problem_rdft *) p_;
     plan *cld1 = (plan *) 0;
     plan *cld2 = (plan *) 0;
     plan *cld_omega = (plan *) 0;
     R *buf = (R *) 0;
     problem *cldp;
     P *pln;
     INT n, npad;
     INT is, os;

     if (!applicable(ego_, p_, plnr))
          return (plan *) 0;

     n = p->sz->dims[0].n;
     is = p->sz->dims[0].is;
     os = p->sz->dims[0].os;

     npad = n - 1;
     if (slv->pad)
          npad = choose_transform_size(2 * (n - 1) - 1);

     /* initial allocation for the purpose of planning */
     buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);

     cld1 = X(mkplan_f_d)(plnr,
                          X(mkproblem_rdft_1_d)(X(mktensor_1d)(npad, 1, 1),
                                                X(mktensor_1d)(1, 0, 0),
                                                buf, buf,
                                                R2HC),
                          NO_SLOW, 0, 0);
     if (!cld1)
          goto nada;

     cldp =
          X(mkproblem_rdft_1_d)(
               X(mktensor_1d)(npad, 1, 1),
               X(mktensor_1d)(1, 0, 0),
               buf, buf,
#if R2HC_ONLY_CONV
               R2HC
#else
               HC2R
#endif
               );
     cld2 = X(mkplan_f_d)(plnr, cldp, NO_SLOW, 0, 0);
     if (!cld2)
          goto nada;

     /* plan for omega */
     cld_omega = X(mkplan_f_d)(plnr,
                               X(mkproblem_rdft_1_d)(
                                    X(mktensor_1d)(npad, 1, 1),
                                    X(mktensor_1d)(1, 0, 0),
                                    buf, buf, R2HC),
                               NO_SLOW, ESTIMATE, 0);
     if (!cld_omega)
          goto nada;

     /* deallocate buffers; let awake() or apply() allocate them for real */
     X(ifree)(buf);
     buf = 0;

     pln = MKPLAN_RDFT(P, &padt, apply);
     pln->cld1 = cld1;
     pln->cld2 = cld2;
     pln->cld_omega = cld_omega;
     pln->omega = 0;
     pln->n = n;
     pln->npad = npad;
     pln->is = is;
     pln->os = os;

     /* cost estimate: both transforms plus the work done in apply() */
     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
     pln->super.super.ops.other += (npad/2-1)*6 + npad + n + (n-1) * slv->pad;
     pln->super.super.ops.add += (npad/2-1)*2 + 2 + (n-1) * slv->pad;
     pln->super.super.ops.mul += (npad/2-1)*4 + 2 + slv->pad;
#if R2HC_ONLY_CONV
     pln->super.super.ops.other += n-2 - slv->pad;
     pln->super.super.ops.add += (npad/2-1)*2 + (n-2) - slv->pad;
#endif

     return &(pln->super.super);

 nada:
     /* failure: release the scratch buffer and any children built so far */
     X(ifree0)(buf);
     X(plan_destroy_internal)(cld_omega);
     X(plan_destroy_internal)(cld2);
     X(plan_destroy_internal)(cld1);
     return 0;
}
|
||||
|
||||
/* constructors */
|
||||
|
||||
static solver *mksolver(int pad)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->pad = pad;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register two solvers with planner p: first the unpadded variant
   (pad == 0), then the padded one (pad == 1). */
void X(dht_rader_register)(planner *p)
{
     int pad;

     for (pad = 0; pad <= 1; ++pad)
          REGISTER_SOLVER(p, mksolver(pad));
}
|
||||
341
fftw-3.3.10/rdft/direct-r2c.c
Normal file
341
fftw-3.3.10/rdft/direct-r2c.c
Normal file
@@ -0,0 +1,341 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* direct RDFT solver, using r2c codelets */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
const kr2c_desc *desc;
|
||||
kr2c k;
|
||||
int bufferedp;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
|
||||
stride rs, csr, csi;
|
||||
stride brs, bcsr, bcsi;
|
||||
INT n, vl, rs0, ivs, ovs, ioffset, bioffset;
|
||||
kr2c k;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
/*************************************************************
|
||||
Nonbuffered code
|
||||
*************************************************************/
|
||||
/* Unbuffered R2HC: call the codelet with I on the real side and O on the
   halfcomplex side. */
static void apply_r2hc(const plan *ego_, R *I, R *O)
{
     const P *pln = (const P *) ego_;
     R *in1 = I + pln->rs0;
     R *out1 = O + pln->ioffset;
     ASSERT_ALIGNED_DOUBLE;

     pln->k(I, in1, O, out1,
            pln->rs, pln->csr, pln->csi,
            pln->vl, pln->ivs, pln->ovs);
}
|
||||
|
||||
/* Unbuffered HC2R: call the codelet with O on the real side and I on the
   halfcomplex side. */
static void apply_hc2r(const plan *ego_, R *I, R *O)
{
     const P *pln = (const P *) ego_;
     R *out1 = O + pln->rs0;
     R *in1 = I + pln->ioffset;
     ASSERT_ALIGNED_DOUBLE;

     pln->k(O, out1, I, in1,
            pln->rs, pln->csr, pln->csi,
            pln->vl, pln->ivs, pln->ovs);
}
|
||||
|
||||
/*************************************************************
|
||||
Buffered code
|
||||
*************************************************************/
|
||||
/* should not be 2^k to avoid associativity conflicts */
|
||||
/* Batch size for a given radix: the radix rounded up to a multiple of 4,
   plus 2. */
static INT compute_batchsize(INT radix)
{
     INT rounded = (radix + 3) & -4; /* round up to multiple of 4 */

     return (rounded + 2);
}
|
||||
|
||||
/* Process one batch of batchsz R2HC transforms.  The input is copied
   into buf; the codelet then writes either straight to O or back into
   buf followed by a copy to O, depending on how the output strides
   compare. */
static void dobatch_r2hc(const P *ego, R *I, R *O, R *buf, INT batchsz)
{
     /* hack: the unit stride of bcsr is used as the buffer row stride */
     R *buf1 = buf + WS(ego->bcsr /* hack */, 1);

     X(cpy2d_ci)(I, buf,
                 ego->n, ego->rs0, WS(ego->bcsr /* hack */, 1),
                 batchsz, ego->ivs, 1, 1);

     if (!(IABS(WS(ego->csr, 1)) < IABS(ego->ovs))) {
          /* transform to buffer and copy back */
          ego->k(buf, buf1,
                 buf, buf + ego->bioffset,
                 ego->brs, ego->bcsr, ego->bcsi,
                 batchsz, 1, 1);
          X(cpy2d_co)(buf, O,
                      ego->n, WS(ego->bcsr, 1), WS(ego->csr, 1),
                      batchsz, 1, ego->ovs, 1);
          return;
     }

     /* transform directly to output */
     ego->k(buf, buf1,
            O, O + ego->ioffset,
            ego->brs, ego->csr, ego->csi,
            batchsz, 1, ego->ovs);
}
|
||||
|
||||
/* Process one batch of batchsz HC2R transforms.  The codelet reads
   either straight from I or from a copy of the input placed in buf,
   depending on how the input strides compare; the real result in buf is
   then copied to O. */
static void dobatch_hc2r(const P *ego, R *I, R *O, R *buf, INT batchsz)
{
     /* hack: the unit stride of bcsr is used as the buffer row stride */
     R *buf1 = buf + WS(ego->bcsr /* hack */, 1);

     if (!(IABS(WS(ego->csr, 1)) < IABS(ego->ivs))) {
          /* copy into buffer and transform in place */
          X(cpy2d_ci)(I, buf,
                      ego->n, WS(ego->csr, 1), WS(ego->bcsr, 1),
                      batchsz, ego->ivs, 1, 1);
          ego->k(buf, buf1,
                 buf, buf + ego->bioffset,
                 ego->brs, ego->bcsr, ego->bcsi,
                 batchsz, 1, 1);
     } else {
          /* transform directly from input */
          ego->k(buf, buf1,
                 I, I + ego->ioffset,
                 ego->brs, ego->csr, ego->csi,
                 batchsz, ego->ivs, 1);
     }

     X(cpy2d_co)(buf, O,
                 ego->n, WS(ego->bcsr /* hack */, 1), ego->rs0,
                 batchsz, 1, ego->ovs, 1);
}
|
||||
|
||||
static void iterate(const P *ego, R *I, R *O,
|
||||
void (*dobatch)(const P *ego, R *I, R *O,
|
||||
R *buf, INT batchsz))
|
||||
{
|
||||
R *buf;
|
||||
INT vl = ego->vl;
|
||||
INT n = ego->n;
|
||||
INT i;
|
||||
INT batchsz = compute_batchsize(n);
|
||||
size_t bufsz = n * batchsz * sizeof(R);
|
||||
|
||||
BUF_ALLOC(R *, buf, bufsz);
|
||||
|
||||
for (i = 0; i < vl - batchsz; i += batchsz) {
|
||||
dobatch(ego, I, O, buf, batchsz);
|
||||
I += batchsz * ego->ivs;
|
||||
O += batchsz * ego->ovs;
|
||||
}
|
||||
dobatch(ego, I, O, buf, vl - i);
|
||||
|
||||
BUF_FREE(buf, bufsz);
|
||||
}
|
||||
|
||||
/* Buffered R2HC: batch the work through dobatch_r2hc. */
static void apply_buf_r2hc(const plan *ego_, R *I, R *O)
{
     const P *pln = (const P *) ego_;

     iterate(pln, I, O, dobatch_r2hc);
}
|
||||
|
||||
/* Buffered HC2R: batch the work through dobatch_hc2r. */
static void apply_buf_hc2r(const plan *ego_, R *I, R *O)
{
     const P *pln = (const P *) ego_;

     iterate(pln, I, O, dobatch_hc2r);
}
|
||||
|
||||
/* Release the six stride objects created by mkplan(). */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     /* strides for the user arrays */
     X(stride_destroy)(pln->rs);
     X(stride_destroy)(pln->csr);
     X(stride_destroy)(pln->csi);

     /* strides for the batch buffer */
     X(stride_destroy)(pln->brs);
     X(stride_destroy)(pln->bcsr);
     X(stride_destroy)(pln->bcsi);
}
|
||||
|
||||
/* Print this plan.  The buffered form also reports the unit stride of
   bcsr, which mkplan() builds from the batch size. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const S *slv = pln->slv;
     const kr2c_desc *desc = slv->desc;

     if (!slv->bufferedp) {
          p->print(p, "(rdft-%s-direct-r2c-%D%v \"%s\")",
                   X(rdft_kind_str)(desc->genus->kind), pln->n,
                   pln->vl, desc->nam);
          return;
     }

     p->print(p, "(rdft-%s-directbuf/%D-r2c-%D%v \"%s\")",
              X(rdft_kind_str)(desc->genus->kind),
              /* hack */ WS(pln->bcsr, 1), pln->n,
              pln->vl, desc->nam);
}
|
||||
|
||||
/* Offset of the second halfcomplex pointer: s * sz for the R2HC and HC2R
   kinds, s * (sz - 1) for every other kind. */
static INT ioffset(rdft_kind kind, INT sz, INT s)
{
     if (kind == R2HC || kind == HC2R)
          return (s * sz);

     return (s * (sz - 1));
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const kr2c_desc *desc = ego->desc;
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
INT vl, ivs, ovs;
|
||||
|
||||
return (
|
||||
1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->dims[0].n == desc->n
|
||||
&& p->kind[0] == desc->genus->kind
|
||||
|
||||
/* check strides etc */
|
||||
&& X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
|
||||
|
||||
&& (0
|
||||
/* can operate out-of-place */
|
||||
|| p->I != p->O
|
||||
|
||||
/* computing one transform */
|
||||
|| vl == 1
|
||||
|
||||
/* can operate in-place as long as strides are the same */
|
||||
|| X(tensor_inplace_strides2)(p->sz, p->vecsz)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable_buf(const solver *ego_, const problem *p_)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const kr2c_desc *desc = ego->desc;
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
INT vl, ivs, ovs, batchsz;
|
||||
|
||||
return (
|
||||
1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->dims[0].n == desc->n
|
||||
&& p->kind[0] == desc->genus->kind
|
||||
|
||||
/* check strides etc */
|
||||
&& X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
|
||||
|
||||
&& (batchsz = compute_batchsize(desc->n), 1)
|
||||
|
||||
&& (0
|
||||
/* can operate out-of-place */
|
||||
|| p->I != p->O
|
||||
|
||||
/* can operate in-place as long as strides are the same */
|
||||
|| X(tensor_inplace_strides2)(p->sz, p->vecsz)
|
||||
|
||||
/* can do it if the problem fits in the buffer, no matter
|
||||
what the strides are */
|
||||
|| vl <= batchsz
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/* Build a plan for problem p_, or return a null plan when the solver
   (buffered or not) is not applicable.  The planner argument is unused.

   For R2HC kinds the real stride is the input stride and the halfcomplex
   stride is the output stride; for the other kinds the roles swap. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), X(null_awake), print, destroy
     };
     const S *slv = (const S *) ego_;
     const problem_rdft *p;
     P *pln;
     INT rs, cs, b, n;
     int ok;

     UNUSED(plnr);

     if (slv->bufferedp)
          ok = applicable_buf(ego_, p_);
     else
          ok = applicable(ego_, p_);
     if (!ok)
          return (plan *)0;

     p = (const problem_rdft *) p_;

     if (R2HC_KINDP(p->kind[0])) {
          rs = p->sz->dims[0].is;
          cs = p->sz->dims[0].os;
          pln = MKPLAN_RDFT(P, &padt,
                            slv->bufferedp ? apply_buf_r2hc : apply_r2hc);
     } else {
          rs = p->sz->dims[0].os;
          cs = p->sz->dims[0].is;
          pln = MKPLAN_RDFT(P, &padt,
                            slv->bufferedp ? apply_buf_hc2r : apply_hc2r);
     }

     n = p->sz->dims[0].n;

     pln->k = slv->k;
     pln->n = n;

     /* strides and offset for the user arrays */
     pln->rs0 = rs;
     pln->rs = X(mkstride)(n, 2 * rs);
     pln->csr = X(mkstride)(n, cs);
     pln->csi = X(mkstride)(n, -cs);
     pln->ioffset = ioffset(p->kind[0], n, cs);

     /* strides and offset for the batch buffer */
     b = compute_batchsize(n);
     pln->brs = X(mkstride)(n, 2 * b);
     pln->bcsr = X(mkstride)(n, b);
     pln->bcsi = X(mkstride)(n, -b);
     pln->bioffset = ioffset(p->kind[0], n, b);

     X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);

     pln->slv = slv;

     /* cost: codelet ops per vector chunk, plus copies when buffered */
     X(ops_zero)(&pln->super.super.ops);
     X(ops_madd2)(pln->vl / slv->desc->genus->vl,
                  &slv->desc->ops,
                  &pln->super.super.ops);
     if (slv->bufferedp)
          pln->super.super.ops.other += 2 * n * pln->vl;

     pln->super.super.could_prune_now_p = !slv->bufferedp;

     return &(pln->super.super);
}
|
||||
|
||||
/* constructor */
|
||||
/* Allocate a PROBLEM_RDFT solver of type S and store the codelet, its
   descriptor, and the buffered flag. */
static solver *mksolver(kr2c k, const kr2c_desc *desc, int bufferedp)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->k = k;
     self->desc = desc;
     self->bufferedp = bufferedp;

     return &(self->super);
}
|
||||
|
||||
/* Create the unbuffered solver for codelet k (bufferedp == 0). */
solver *X(mksolver_rdft_r2c_direct)(kr2c k, const kr2c_desc *desc)
{
     solver *slv = mksolver(k, desc, 0);

     return slv;
}
|
||||
|
||||
/* Create the buffered solver for codelet k (bufferedp == 1). */
solver *X(mksolver_rdft_r2c_directbuf)(kr2c k, const kr2c_desc *desc)
{
     solver *slv = mksolver(k, desc, 1);

     return slv;
}
|
||||
145
fftw-3.3.10/rdft/direct-r2r.c
Normal file
145
fftw-3.3.10/rdft/direct-r2r.c
Normal file
@@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* direct RDFT solver, using r2r codelets */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
const kr2r_desc *desc;
|
||||
kr2r k;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
|
||||
INT vl, ivs, ovs;
|
||||
stride is, os;
|
||||
kr2r k;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
/* Execute the plan: a single codelet call covers the whole vector loop. */
static void apply(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;

     ASSERT_ALIGNED_DOUBLE;
     self->k(I, O,
             self->is, self->os,
             self->vl, self->ivs, self->ovs);
}
|
||||
|
||||
/* Release the two stride objects created in mkplan(). */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(stride_destroy)(self->is);
     X(stride_destroy)(self->os);
}
|
||||
|
||||
/* Print the plan as: kind string, codelet size, vector length, codelet name. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const S *slv = self->slv;

     p->print(p, "(rdft-%s-direct-r2r-%D%v \"%s\")",
              X(rdft_kind_str)(slv->desc->kind),
              slv->desc->n,
              self->vl,
              slv->desc->nam);
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
INT vl;
|
||||
INT ivs, ovs;
|
||||
|
||||
return (
|
||||
1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->dims[0].n == ego->desc->n
|
||||
&& p->kind[0] == ego->desc->kind
|
||||
|
||||
/* check strides etc */
|
||||
&& X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
|
||||
|
||||
&& (0
|
||||
/* can operate out-of-place */
|
||||
|| p->I != p->O
|
||||
|
||||
/* computing one transform */
|
||||
|| vl == 1
|
||||
|
||||
/* can operate in-place as long as strides are the same */
|
||||
|| X(tensor_inplace_strides2)(p->sz, p->vecsz)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Create a direct r2r plan for problem P_, or return a null plan when the
 * solver is not applicable.  The planner argument is unused.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), X(null_awake), print, destroy
     };
     const S *self = (const S *) ego_;
     const problem_rdft *prb;
     iodim *dim;
     P *pln;

     UNUSED(plnr);

     if (!applicable(ego_, p_))
          return (plan *)0;

     prb = (const problem_rdft *) p_;
     dim = prb->sz->dims;

     pln = MKPLAN_RDFT(P, &padt, apply);

     pln->k = self->k;

     /* strides of the single transform dimension */
     pln->is = X(mkstride)(dim->n, dim->is);
     pln->os = X(mkstride)(dim->n, dim->os);

     /* vector loop; already validated by applicable() */
     X(tensor_tornk1)(prb->vecsz, &pln->vl, &pln->ivs, &pln->ovs);

     pln->slv = self;

     /* cost: codelet ops scaled by vl / genus->vl */
     X(ops_zero)(&pln->super.super.ops);
     X(ops_madd2)(pln->vl / self->desc->genus->vl,
                  &self->desc->ops,
                  &pln->super.super.ops);

     pln->super.super.could_prune_now_p = 1;

     return &(pln->super.super);
}
|
||||
|
||||
/* constructor */
|
||||
/*
 * Allocate a direct r2r solver (PROBLEM_RDFT, plan factory mkplan) that
 * wraps codelet K described by DESC.
 */
solver *X(mksolver_rdft_r2r_direct)(kr2r k, const kr2r_desc *desc)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->desc = desc;
     self->k = k;

     return &(self->super);
}
|
||||
|
||||
171
fftw-3.3.10/rdft/direct2.c
Normal file
171
fftw-3.3.10/rdft/direct2.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* direct RDFT2 R2HC/HC2R solver, if we have a codelet */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
const kr2c_desc *desc;
|
||||
kr2c k;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft2 super;
|
||||
|
||||
stride rs, cs;
|
||||
INT vl;
|
||||
INT ivs, ovs;
|
||||
kr2c k;
|
||||
const S *slv;
|
||||
INT ilast;
|
||||
} P;
|
||||
|
||||
/*
 * Execute the plan with a single codelet call.  The complex stride object
 * is passed twice, once for each of the cr and ci arrays.
 */
static void apply(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;

     ASSERT_ALIGNED_DOUBLE;
     self->k(r0, r1, cr, ci,
             self->rs, self->cs, self->cs,
             self->vl, self->ivs, self->ovs);
}
|
||||
|
||||
/*
 * Variant of apply() selected by mkplan() when the problem kind is R2HC:
 * after the codelet call, store zero into ci[0] and ci[ilast] of every
 * output vector.
 */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     INT v, howmany = self->vl, ostep = self->ovs;

     ASSERT_ALIGNED_DOUBLE;
     self->k(r0, r1, cr, ci,
             self->rs, self->cs, self->cs,
             howmany, self->ivs, ostep);

     v = 0;
     while (v < howmany) {
          ci[0] = ci[self->ilast] = 0;
          ci += ostep;
          ++v;
     }
}
|
||||
|
||||
/* Release the two stride objects created in mkplan(). */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(stride_destroy)(self->rs);
     X(stride_destroy)(self->cs);
}
|
||||
|
||||
/* Print the plan as: genus kind string, codelet size, vector length, codelet name. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const S *slv = self->slv;

     p->print(p, "(rdft2-%s-direct-%D%v \"%s\")",
              X(rdft_kind_str)(slv->desc->genus->kind),
              slv->desc->n,
              self->vl,
              slv->desc->nam);
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const kr2c_desc *desc = ego->desc;
|
||||
const problem_rdft2 *p = (const problem_rdft2 *) p_;
|
||||
INT vl;
|
||||
INT ivs, ovs;
|
||||
|
||||
return (
|
||||
1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->dims[0].n == desc->n
|
||||
&& p->kind == desc->genus->kind
|
||||
|
||||
/* check strides etc */
|
||||
&& X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs)
|
||||
|
||||
&& (0
|
||||
/* can operate out-of-place */
|
||||
|| p->r0 != p->cr
|
||||
|
||||
/*
|
||||
* can compute one transform in-place, no matter
|
||||
* what the strides are.
|
||||
*/
|
||||
|| p->vecsz->rnk == 0
|
||||
|
||||
/* can operate in-place as long as strides are the same */
|
||||
|| X(rdft2_inplace_strides)(p, RNK_MINFTY)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Create a direct rdft2 plan for problem P_, or return a null plan when
 * the solver is not applicable.  The planner argument is unused.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft2_solve), X(null_awake), print, destroy
     };
     const S *self = (const S *) ego_;
     const problem_rdft2 *prb;
     iodim *dim;
     int r2hc_kindp;
     P *pln;

     UNUSED(plnr);

     if (!applicable(ego_, p_))
          return (plan *)0;

     prb = (const problem_rdft2 *) p_;

     r2hc_kindp = R2HC_KINDP(prb->kind);
     A(r2hc_kindp || HC2R_KINDP(prb->kind));

     /* plain R2HC gets the variant that also zeroes two ci entries */
     if (prb->kind == R2HC)
          pln = MKPLAN_RDFT2(P, &padt, apply_r2hc);
     else
          pln = MKPLAN_RDFT2(P, &padt, apply);

     dim = prb->sz->dims;

     pln->k = self->k;

     /* real data is the input for r2hc kinds and the output otherwise */
     if (r2hc_kindp) {
          pln->rs = X(mkstride)(dim->n, dim->is);
          pln->cs = X(mkstride)(dim->n, dim->os);
     } else {
          pln->rs = X(mkstride)(dim->n, dim->os);
          pln->cs = X(mkstride)(dim->n, dim->is);
     }

     X(tensor_tornk1)(prb->vecsz, &pln->vl, &pln->ivs, &pln->ovs);

     /* Nyquist freq., if any */
     if (dim->n % 2)
          pln->ilast = 0;
     else
          pln->ilast = (dim->n/2) * dim->os;

     pln->slv = self;

     /* cost: codelet ops scaled by vl / genus->vl */
     X(ops_zero)(&pln->super.super.ops);
     X(ops_madd2)(pln->vl / self->desc->genus->vl,
                  &self->desc->ops,
                  &pln->super.super.ops);
     if (prb->kind == R2HC)
          pln->super.super.ops.other += 2 * pln->vl; /* + 2 stores */

     pln->super.super.could_prune_now_p = 1;

     return &(pln->super.super);
}
|
||||
|
||||
/* constructor */
|
||||
/*
 * Allocate a direct rdft2 solver (PROBLEM_RDFT2, plan factory mkplan)
 * that wraps codelet K described by DESC.
 */
solver *X(mksolver_rdft2_direct)(kr2c k, const kr2c_desc *desc)
{
     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->desc = desc;
     self->k = k;

     return &(self->super);
}
|
||||
232
fftw-3.3.10/rdft/generic.c
Normal file
232
fftw-3.3.10/rdft/generic.c
Normal file
@@ -0,0 +1,232 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
rdft_kind kind;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
twid *td;
|
||||
INT n, is, os;
|
||||
rdft_kind kind;
|
||||
} P;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
/*
 * Accumulate two dot products over the buffer X laid out as
 * x[0], then pairs (x[2k-1], x[2k]) for k = 1 .. while 2k < n:
 *   *or0 = x[0] + sum x[2k-1] * w[2(k-1)]
 *   *oi1 =        sum x[2k]   * w[2(k-1)+1]
 */
static void cdot_r2hc(INT n, const E *x, const R *w, R *or0, R *oi1)
{
     E re = x[0];
     E im = 0;
     INT k;

     ++x;
     for (k = 1; k + k < n; ++k, x += 2, w += 2) {
          re += x[0] * w[0];
          im += x[1] * w[1];
     }

     *or0 = re;
     *oi1 = im;
}
|
||||
|
||||
/*
 * Fill buffer O from the strided input XR and store the sum of all
 * processed terms in *PR.
 *
 * o[0] receives xr[0]; for each i with 2i < n, o[2i-1] receives
 * xr[i] + xr[n-i] and o[2i] receives their difference, whose sign
 * depends on FFT_SIGN.  *PR is xr[0] plus all the pair sums.
 */
static void hartley_r2hc(INT n, const R *xr, INT xs, E *o, R *pr)
{
     E total;
     INT i;

     total = xr[0];
     o[0] = total;
     ++o;

     for (i = 1; i + i < n; ++i, o += 2) {
          R lo = xr[i * xs];
          R hi = xr[(n - i) * xs];

          o[0] = lo + hi;
          total += o[0];
#if FFT_SIGN == -1
          o[1] = hi - lo;
#else
          o[1] = lo - hi;
#endif
     }

     *pr = total;
}
|
||||
|
||||
/*
 * Generic R2HC transform of size n.
 *
 * hartley_r2hc() fills a temporary buffer from I and writes O[0]; then
 * for each i with 2i < n one cdot_r2hc() call produces O[i*os] and
 * O[(n-i)*os], consuming n-1 twiddle entries per call.
 */
static void apply_r2hc(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     INT k;
     INT n = self->n, os = self->os;
     const R *tw = self->td->W;
     E *buf;
     size_t bufsz = n * sizeof(E);

     BUF_ALLOC(E *, buf, bufsz);

     hartley_r2hc(n, I, self->is, buf, O);

     for (k = 1; k + k < n; ++k, tw += n - 1)
          cdot_r2hc(n, buf, tw, O + k * os, O + (n - k) * os);

     BUF_FREE(buf, bufsz);
}
|
||||
|
||||
|
||||
/*
 * Accumulate the same two dot products as cdot_r2hc(), then emit their
 * difference and sum; which output gets which depends on FFT_SIGN.
 */
static void cdot_hc2r(INT n, const E *x, const R *w, R *or0, R *or1)
{
     E re = x[0];
     E im = 0;
     INT k;

     ++x;
     for (k = 1; k + k < n; ++k, x += 2, w += 2) {
          re += x[0] * w[0];
          im += x[1] * w[1];
     }

#if FFT_SIGN == -1
     *or0 = re - im;
     *or1 = re + im;
#else
     *or0 = re + im;
     *or1 = re - im;
#endif
}
|
||||
|
||||
/*
 * Fill buffer O from the strided input X and store a running sum in *PR.
 *
 * o[0] receives x[0]; for each i with 2i < n, o[2i-1] receives
 * 2*x[i] (computed as x + x) and o[2i] receives 2*x[n-i].
 * *PR is x[0] plus the sum of all the o[2i-1] values.
 */
static void hartley_hc2r(INT n, const R *x, INT xs, E *o, R *pr)
{
     E total;
     INT i;

     total = x[0];
     o[0] = total;
     ++o;

     for (i = 1; i + i < n; ++i, o += 2) {
          o[0] = x[i * xs] + x[i * xs];
          total += o[0];
          o[1] = x[(n - i) * xs] + x[(n - i) * xs];
     }

     *pr = total;
}
|
||||
|
||||
/*
 * Generic HC2R transform of size n; same structure as apply_r2hc() but
 * built from hartley_hc2r() and cdot_hc2r().
 */
static void apply_hc2r(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     INT k;
     INT n = self->n, os = self->os;
     const R *tw = self->td->W;
     E *buf;
     size_t bufsz = n * sizeof(E);

     BUF_ALLOC(E *, buf, bufsz);

     hartley_hc2r(n, I, self->is, buf, O);

     for (k = 1; k + k < n; ++k, tw += n - 1)
          cdot_hc2r(n, buf, tw, O + k * os, O + (n - k) * os);

     BUF_FREE(buf, bufsz);
}
|
||||
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
/*
 * Create or release the plan's twiddle table according to WAKEFULNESS,
 * using the two-instruction TW_HALF program below with arguments
 * (n, n, (n-1)/2).
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     static const tw_instr half_tw[] = {
          { TW_HALF, 1, 0 },
          { TW_NEXT, 1, 0 }
     };
     P *self = (P *) ego_;

     X(twiddle_awake)(wakefulness, &self->td, half_tw,
                      self->n, self->n, (self->n - 1) / 2);
}
|
||||
|
||||
/* Print the plan as "(rdft-generic-<r2hc|hc2r>-<n>)". */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const char *dir;

     if (self->kind == R2HC)
          dir = "r2hc";
     else
          dir = "hc2r";

     p->print(p, "(rdft-generic-%s-%D)", dir, self->n);
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk == 0
|
||||
&& (p->sz->dims[0].n % 2) == 1
|
||||
&& CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
|
||||
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
|
||||
&& X(is_prime)(p->sz->dims[0].n)
|
||||
&& p->kind[0] == ego->kind
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Create a generic rdft plan for problem P_, or return a null plan when
 * the solver is not applicable.  The twiddle table is left null here and
 * is built later by awake().
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, X(plan_null_destroy)
     };
     const S *self = (const S *)ego_;
     const problem_rdft *prb;
     P *pln;
     INT n;

     if (!applicable(self, p_, plnr))
          return (plan *)0;

     prb = (const problem_rdft *) p_;

     if (R2HC_KINDP(prb->kind[0]))
          pln = MKPLAN_RDFT(P, &padt, apply_r2hc);
     else
          pln = MKPLAN_RDFT(P, &padt, apply_hc2r);

     n = prb->sz->dims[0].n;
     pln->n = n;
     pln->is = prb->sz->dims[0].is;
     pln->os = prb->sz->dims[0].os;
     pln->td = 0;
     pln->kind = self->kind;

     /* cost estimate */
     pln->super.super.ops.add = (n-1) * 2.5;
     pln->super.super.ops.mul = 0;
     pln->super.super.ops.fma = 0.5 * (n-1) * (n-1) ;
#if 0 /* these are nice pipelined sequential loads and should cost nothing */
     pln->super.super.ops.other = (n-1)*(2 + 1 + (n-1));  /* approximate */
#endif

     return &(pln->super.super);
}
|
||||
|
||||
/* Allocate a generic rdft solver (PROBLEM_RDFT, mkplan) for the given KIND. */
static solver *mksolver(rdft_kind kind)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->kind = kind;

     return &(self->super);
}
|
||||
|
||||
/* Register the generic solver with planner P: first for R2HC, then for HC2R. */
void X(rdft_generic_register)(planner *p)
{
     solver *fwd = mksolver(R2HC);
     REGISTER_SOLVER(p, fwd);

     {
          solver *bwd = mksolver(HC2R);
          REGISTER_SOLVER(p, bwd);
     }
}
|
||||
279
fftw-3.3.10/rdft/hc2hc-direct.c
Normal file
279
fftw-3.3.10/rdft/hc2hc-direct.c
Normal file
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/hc2hc.h"
|
||||
|
||||
typedef struct {
|
||||
hc2hc_solver super;
|
||||
const hc2hc_desc *desc;
|
||||
khc2hc k;
|
||||
int bufferedp;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_hc2hc super;
|
||||
khc2hc k;
|
||||
plan *cld0, *cldm; /* children for 0th and middle butterflies */
|
||||
INT r, m, v;
|
||||
INT ms, vs, mb, me;
|
||||
stride rs, brs;
|
||||
twid *td;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
/*************************************************************
|
||||
Nonbuffered code
|
||||
*************************************************************/
|
||||
/*
 * Non-buffered execution.  For each of the v vectors, in place:
 * run the cld0 child at IO, run the twiddle codelet over [mb, me),
 * then run the cldm child at offset (m/2) * ms.
 */
static void apply(const plan *ego_, R *IO)
{
     const P *self = (const P *) ego_;
     plan_rdft *first = (plan_rdft *) self->cld0;
     plan_rdft *mid = (plan_rdft *) self->cldm;
     INT iv, m = self->m, v = self->v;
     INT mb = self->mb, me = self->me;
     INT ms = self->ms, vs = self->vs;

     iv = 0;
     while (iv < v) {
          first->apply((plan *) first, IO, IO);

          self->k(IO + ms * mb, IO + (m - mb) * ms,
                  self->td->W, self->rs, mb, me, ms);

          mid->apply((plan *) mid, IO + (m/2) * ms, IO + (m/2) * ms);

          ++iv;
          IO += vs;
     }
}
|
||||
|
||||
/*************************************************************
|
||||
Buffered code
|
||||
*************************************************************/
|
||||
|
||||
/* should not be 2^k to avoid associativity conflicts */
|
||||
/*
 * Batch size for the buffered code path: RADIX rounded up to a multiple
 * of 4, plus 2.
 */
static INT compute_batchsize(INT radix)
{
     /* round up to multiple of 4 */
     INT rounded = radix + 3;
     rounded &= -4;

     return rounded + 2;
}
|
||||
|
||||
/*
 * Process the index range [mb, me) through the batch buffer BUFP:
 * copy the two data streams (starting at IOp + mb*ms going up and at
 * IOm - mb*ms going down) into the buffer, run the codelet on the
 * buffer with unit m-stride, and copy both streams back.
 */
static void dobatch(const P *ego, R *IOp, R *IOm,
                    INT mb, INT me, R *bufp)
{
     INT bstride = WS(ego->brs, 1);
     INT rstride = WS(ego->rs, 1);
     INT radix = ego->r;
     INT ms = ego->ms;
     INT count = me - mb;
     R *bufm = bufp + bstride - 1;

     /* gather */
     X(cpy2d_ci)(IOp + mb * ms, bufp, radix, rstride, bstride, count, ms, 1, 1);
     X(cpy2d_ci)(IOm - mb * ms, bufm, radix, rstride, bstride, count, -ms, -1, 1);

     ego->k(bufp, bufm, ego->td->W, ego->brs, mb, me, 1);

     /* scatter */
     X(cpy2d_co)(bufp, IOp + mb * ms, radix, bstride, rstride, count, 1, ms, 1);
     X(cpy2d_co)(bufm, IOm - mb * ms, radix, bstride, rstride, count, -1, -ms, 1);
}
|
||||
|
||||
/*
 * Buffered execution.  Same sequence as apply(), but the twiddle step
 * walks [mb, me) in chunks of compute_batchsize(r) through dobatch(),
 * with a final call covering the remainder.
 */
static void apply_buf(const plan *ego_, R *IO)
{
     const P *self = (const P *) ego_;
     plan_rdft *first = (plan_rdft *) self->cld0;
     plan_rdft *mid = (plan_rdft *) self->cldm;
     INT iv, j, m = self->m, v = self->v, r = self->r;
     INT mb = self->mb, me = self->me, ms = self->ms;
     INT batchsz = compute_batchsize(r);
     R *buf;
     size_t bufsz = r * batchsz * 2 * sizeof(R);

     BUF_ALLOC(R *, buf, bufsz);

     for (iv = 0; iv < v; ++iv, IO += self->vs) {
          R *IOp = IO;
          R *IOm = IO + m * ms;

          first->apply((plan *) first, IO, IO);

          /* full batches */
          j = mb;
          while (j + batchsz < me) {
               dobatch(self, IOp, IOm, j, j + batchsz, buf);
               j += batchsz;
          }

          /* remainder */
          dobatch(self, IOp, IOm, j, me, buf);

          mid->apply((plan *) mid, IO + ms * (m/2), IO + ms * (m/2));
     }

     BUF_FREE(buf, bufsz);
}
|
||||
|
||||
/*
 * Propagate WAKEFULNESS to both child plans, then create or release the
 * twiddle table from the codelet's twiddle program with arguments
 * (r * m, r, (m - 1) / 2).
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     const tw_instr *prog;

     X(plan_awake)(self->cld0, wakefulness);
     X(plan_awake)(self->cldm, wakefulness);

     prog = self->slv->desc->tw;
     X(twiddle_awake)(wakefulness, &self->td, prog,
                      self->r * self->m, self->r, (self->m - 1) / 2);
}
|
||||
|
||||
/* Destroy both child plans and both stride objects owned by the plan. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cld0);
     X(plan_destroy_internal)(self->cldm);

     X(stride_destroy)(self->rs);
     X(stride_destroy)(self->brs);
}
|
||||
|
||||
/*
 * Print the plan.  The buffered form additionally reports the batch
 * size; both forms report radix, twiddle length, vector count, codelet
 * name and the two child plans.
 */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const S *slv = self->slv;
     const hc2hc_desc *e = slv->desc;

     if (!slv->bufferedp) {
          p->print(p, "(hc2hc-direct-%D/%D%v \"%s\"%(%p%)%(%p%))",
                   self->r, X(twiddle_length)(self->r, e->tw), self->v, e->nam,
                   self->cld0, self->cldm);
     } else {
          INT batchsz = compute_batchsize(self->r);

          p->print(p, "(hc2hc-directbuf/%D-%D/%D%v \"%s\"%(%p%)%(%p%))",
                   batchsz, self->r, X(twiddle_length)(self->r, e->tw),
                   self->v, e->nam, self->cld0, self->cldm);
     }
}
|
||||
|
||||
/* Return 1 when radix R and KIND both match the solver's codelet descriptor. */
static int applicable0(const S *ego, rdft_kind kind, INT r)
{
     const hc2hc_desc *e = ego->desc;

     if (r != e->radix)
          return 0;
     if (kind != e->genus->kind)
          return 0;

     return 1;
}
|
||||
|
||||
static int applicable(const S *ego, rdft_kind kind, INT r, INT m, INT v,
|
||||
const planner *plnr)
|
||||
{
|
||||
if (!applicable0(ego, kind, r))
|
||||
return 0;
|
||||
|
||||
if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
|
||||
v, m * r, r))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Range predicates used by mkcldw():
 *   CLDMP is true when 2 * (mstart + mcount) == m + 2; mkcldw() then gives
 *         the cldm child a size-r transform and excludes one index from
 *         the end of the codelet range.
 *   CLD0P is true when mstart == 0; mkcldw() then gives the cld0 child a
 *         size-r transform and excludes index 0 from the codelet range.
 */
#define CLDMP(m, mstart, mcount) (2 * ((mstart) + (mcount)) == (m) + 2)
|
||||
#define CLD0P(mstart) ((mstart) == 0)
|
||||
|
||||
static plan *mkcldw(const hc2hc_solver *ego_,
|
||||
rdft_kind kind, INT r, INT m, INT ms, INT v, INT vs,
|
||||
INT mstart, INT mcount,
|
||||
R *IO, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
P *pln;
|
||||
const hc2hc_desc *e = ego->desc;
|
||||
plan *cld0 = 0, *cldm = 0;
|
||||
INT imid = (m / 2) * ms;
|
||||
INT rs = m * ms;
|
||||
|
||||
static const plan_adt padt = {
|
||||
0, awake, print, destroy
|
||||
};
|
||||
|
||||
if (!applicable(ego, kind, r, m, v, plnr))
|
||||
return (plan *)0;
|
||||
|
||||
cld0 = X(mkplan_d)(
|
||||
plnr,
|
||||
X(mkproblem_rdft_1_d)((CLD0P(mstart) ?
|
||||
X(mktensor_1d)(r, rs, rs) : X(mktensor_0d)()),
|
||||
X(mktensor_0d)(),
|
||||
TAINT(IO, vs), TAINT(IO, vs),
|
||||
kind));
|
||||
if (!cld0) goto nada;
|
||||
|
||||
cldm = X(mkplan_d)(
|
||||
plnr,
|
||||
X(mkproblem_rdft_1_d)((CLDMP(m, mstart, mcount) ?
|
||||
X(mktensor_1d)(r, rs, rs) : X(mktensor_0d)()),
|
||||
X(mktensor_0d)(),
|
||||
TAINT(IO + imid, vs), TAINT(IO + imid, vs),
|
||||
kind == R2HC ? R2HCII : HC2RIII));
|
||||
if (!cldm) goto nada;
|
||||
|
||||
pln = MKPLAN_HC2HC(P, &padt, ego->bufferedp ? apply_buf : apply);
|
||||
|
||||
pln->k = ego->k;
|
||||
pln->td = 0;
|
||||
pln->r = r; pln->rs = X(mkstride)(r, rs);
|
||||
pln->m = m; pln->ms = ms;
|
||||
pln->v = v; pln->vs = vs;
|
||||
pln->slv = ego;
|
||||
pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
|
||||
pln->cld0 = cld0;
|
||||
pln->cldm = cldm;
|
||||
pln->mb = mstart + CLD0P(mstart);
|
||||
pln->me = mstart + mcount - CLDMP(m, mstart, mcount);
|
||||
|
||||
X(ops_zero)(&pln->super.super.ops);
|
||||
X(ops_madd2)(v * ((pln->me - pln->mb) / e->genus->vl),
|
||||
&e->ops, &pln->super.super.ops);
|
||||
X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
|
||||
X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
|
||||
|
||||
if (ego->bufferedp)
|
||||
pln->super.super.ops.other += 4 * r * (pln->me - pln->mb) * v;
|
||||
|
||||
pln->super.super.could_prune_now_p =
|
||||
(!ego->bufferedp && r >= 5 && r < 64 && m >= r);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld0);
|
||||
X(plan_destroy_internal)(cldm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Register one direct hc2hc solver for CODELET with the given BUFFEREDP
 * setting.  If the mksolver_hc2hc_hook pointer is set, a second solver
 * created through the hook is registered with the same settings.
 */
static void regone(planner *plnr, khc2hc codelet, const hc2hc_desc *desc,
                   int bufferedp)
{
     S *self;

     self = (S *)X(mksolver_hc2hc)(sizeof(S), desc->radix, mkcldw);
     self->bufferedp = bufferedp;
     self->desc = desc;
     self->k = codelet;
     REGISTER_SOLVER(plnr, &(self->super.super));

     if (!X(mksolver_hc2hc_hook))
          return;

     self = (S *)X(mksolver_hc2hc_hook)(sizeof(S), desc->radix, mkcldw);
     self->bufferedp = bufferedp;
     self->desc = desc;
     self->k = codelet;
     REGISTER_SOLVER(plnr, &(self->super.super));
}
|
||||
|
||||
void X(regsolver_hc2hc_direct)(planner *plnr, khc2hc codelet,
|
||||
const hc2hc_desc *desc)
|
||||
{
|
||||
regone(plnr, codelet, desc, /* bufferedp */0);
|
||||
regone(plnr, codelet, desc, /* bufferedp */1);
|
||||
}
|
||||
322
fftw-3.3.10/rdft/hc2hc-generic.c
Normal file
322
fftw-3.3.10/rdft/hc2hc-generic.c
Normal file
@@ -0,0 +1,322 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* express a hc2hc problem in terms of rdft + multiplication by
|
||||
twiddle factors */
|
||||
|
||||
#include "rdft/hc2hc.h"
|
||||
|
||||
/* This solver keeps no state of its own, so S is just the base hc2hc solver type. */
typedef hc2hc_solver S;
|
||||
|
||||
typedef struct {
|
||||
plan_hc2hc super;
|
||||
|
||||
INT r, m, s, vl, vs, mstart1, mcount1;
|
||||
plan *cld0;
|
||||
plan *cld;
|
||||
twid *td;
|
||||
} P;
|
||||
|
||||
|
||||
/**************************************************************/
|
||||
/*
 * Create or release the plan's twiddle table according to WAKEFULNESS.
 */
static void mktwiddle(P *ego, enum wakefulness wakefulness)
{
     static const tw_instr tw[] = { { TW_HALF, 0, 0 }, { TW_NEXT, 1, 0 } };
     twid **slot = &ego->td;

     /* note that R and M are swapped, to allow for sequential
        access both to data and twiddles */
     X(twiddle_awake)(wakefulness, slot, tw,
                      ego->r * ego->m, ego->m, ego->r);
}
|
||||
|
||||
/*
 * Multiply the data in place by twiddle factors.
 *
 * For each vector, each k in [1, r) and each of the mcount1 indices
 * starting at mstart1, the pair (*pr, *pi) is treated as a complex value
 * and multiplied by (W[0], sign * W[1]).  pr walks upward and pi walks
 * downward through sub-array k.
 */
static void bytwiddle(const P *ego, R *IO, R sign)
{
     INT iv, j, k;
     INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
     INT ms = m * s;
     INT mstart1 = ego->mstart1, mcount1 = ego->mcount1;
     /* twiddle entries of a row that this plan does not consume */
     INT wskip = 2 * ((m-1)/2 - mcount1);

     for (iv = 0; iv < vl; ++iv, IO += vs) {
          const R *W = ego->td->W;

          A(m % 2 == 1);

          /* skip the k = 0 row and the entries before mstart1 */
          W += (m - 1) + 2*(mstart1-1);

          for (k = 1; k < r; ++k) {
               /* pr := IO + (j + mstart1) * s + k * ms */
               R *pr = IO + mstart1 * s + k * ms;

               /* pi := IO + (m - j - mstart1) * s + k * ms */
               R *pi = IO - mstart1 * s + (k + 1) * ms;

               j = 0;
               while (j < mcount1) {
                    E xr = *pr;
                    E xi = *pi;
                    E wr = W[0];
                    E wi = sign * W[1];

                    *pr = xr * wr - xi * wi;
                    *pi = xi * wr + xr * wi;

                    W += 2;
                    ++j;
                    pr += s;
                    pi -= s;
               }

               W += wskip;
          }
     }
}
|
||||
|
||||
/*
 * For each k with 2k < r and each j in [jstart, jend), exchange the
 * element at offset (m - j) * s of sub-array k with the element at the
 * same offset of sub-array r - 1 - k.
 */
static void swapri(R *IO, INT r, INT m, INT s, INT jstart, INT jend)
{
     INT ms = m * s;
     INT js = jstart * s;
     INT k;

     for (k = 0; k + k < r; ++k) {
          /* pr := IO + (m - j) * s + k * ms */
          R *pr = IO + (k + 1) * ms - js;
          /* pi := IO + (m - j) * s + (r - 1 - k) * ms */
          R *pi = IO + (r - k) * ms - js;
          INT j = jstart;

          while (j < jend) {
               R tmp = *pr;
               *pr = *pi;
               *pi = tmp;

               j += 1;
               pr -= s;
               pi -= s;
          }
     }
}
|
||||
|
||||
/*
 * Post-processing step of apply_dit().  For every vector, combine four
 * elements at a time (offsets j*s and ms - j*s in sub-arrays k and r-k)
 * with sums and differences, then call swapri() on the same index range.
 */
static void reorder_dit(const P *ego, R *IO)
{
     INT iv, k;
     INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
     INT ms = m * s;
     INT mstart1 = ego->mstart1, mend1 = mstart1 + ego->mcount1;

     for (iv = 0; iv < vl; ++iv, IO += vs) {
          for (k = 1; k + k < r; ++k) {
               R *p0 = IO + k * ms;
               R *p1 = IO + (r - k) * ms;
               INT j;

               for (j = mstart1; j < mend1; ++j) {
                    R *a = &p0[j * s];
                    R *b = &p1[ms - j * s];
                    R *c = &p1[j * s];
                    R *d = &p0[ms - j * s];
                    E rp, ip, im, rm;

                    /* read all four values before overwriting any */
                    rp = *a;
                    im = *b;
                    rm = *c;
                    ip = *d;

                    *a = rp - im;
                    *b = rp + im;
                    *c = rm - ip;
                    *d = ip + rm;
               }
          }

          swapri(IO, r, m, s, mstart1, mend1);
     }
}
|
||||
|
||||
/*
 * Pre-processing step of apply_dif().  For every vector, call swapri()
 * first, then combine four elements at a time (offsets j*s and
 * ms - j*s in sub-arrays k and r-k), each scaled by 0.5, with sums and
 * differences.
 */
static void reorder_dif(const P *ego, R *IO)
{
     INT iv, k;
     INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
     INT ms = m * s;
     INT mstart1 = ego->mstart1, mend1 = mstart1 + ego->mcount1;

     for (iv = 0; iv < vl; ++iv, IO += vs) {
          swapri(IO, r, m, s, mstart1, mend1);

          for (k = 1; k + k < r; ++k) {
               R *p0 = IO + k * ms;
               R *p1 = IO + (r - k) * ms;
               const R half = K(0.5);
               INT j;

               for (j = mstart1; j < mend1; ++j) {
                    R *a = &p0[j * s];
                    R *b = &p1[ms - j * s];
                    R *c = &p1[j * s];
                    R *d = &p0[ms - j * s];
                    E rp, ip, im, rm;

                    /* read (and halve) all four values before overwriting any */
                    rp = half * *a;
                    im = half * *b;
                    rm = half * *c;
                    ip = half * *d;

                    *a = rp + im;
                    *b = im - rp;
                    *c = rm + ip;
                    *d = ip - rm;
               }
          }
     }
}
|
||||
|
||||
/*
 * Return 1 when KIND is R2HC or HC2R, both M and R are odd, and the
 * planner does not have NO_SLOW set; else 0.
 */
static int applicable(rdft_kind kind, INT r, INT m, const planner *plnr)
{
     if (kind != R2HC && kind != HC2R)
          return 0;
     if (!(m % 2))
          return 0;
     if (!(r % 2))
          return 0;
     if (NO_SLOWP(plnr))
          return 0;

     return 1;
}
|
||||
|
||||
/**************************************************************/
|
||||
|
||||
/*
 * Execution path installed for kind == R2HC: twiddle multiplication
 * (sign -1), then the cld0 child at IO, then the cld child at offset
 * mstart1 * s, then reorder_dit().
 */
static void apply_dit(const plan *ego_, R *IO)
{
     const P *self = (const P *) ego_;
     plan_rdft *dc, *rest;
     INT start;

     bytwiddle(self, IO, K(-1.0));

     dc = (plan_rdft *) self->cld0;
     dc->apply(self->cld0, IO, IO);

     start = self->mstart1 * self->s;
     rest = (plan_rdft *) self->cld;
     rest->apply(self->cld, IO + start, IO + start);

     reorder_dit(self, IO);
}
|
||||
|
||||
/*
 * Execution path installed for kind != R2HC: reorder_dif(), then the
 * cld0 child at IO, then the cld child at offset mstart1 * s, then
 * twiddle multiplication (sign +1).
 */
static void apply_dif(const plan *ego_, R *IO)
{
     const P *self = (const P *) ego_;
     plan_rdft *dc, *rest;
     INT start;

     reorder_dif(self, IO);

     dc = (plan_rdft *) self->cld0;
     dc->apply(self->cld0, IO, IO);

     start = self->mstart1 * self->s;
     rest = (plan_rdft *) self->cld;
     rest->apply(self->cld, IO + start, IO + start);

     bytwiddle(self, IO, K(1.0));
}
|
||||
|
||||
|
||||
/* Propagate WAKEFULNESS to both child plans, then to the twiddle table. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld0, wakefulness);
     X(plan_awake)(self->cld, wakefulness);

     mktwiddle(self, wakefulness);
}
|
||||
|
||||
/* Destroy the two child plans, cld first and then cld0. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cld);
     X(plan_destroy_internal)(self->cld0);
}
|
||||
|
||||
/*
 * Print the plan: "dit" or "dif" (decided by which apply function is
 * installed), then r, m, the vector length and both child plans.
 */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const char *variant;

     if (self->super.apply == apply_dit)
          variant = "dit";
     else
          variant = "dif";

     p->print(p, "(hc2hc-generic-%s-%D-%D%v%(%p%)%(%p%))",
              variant,
              self->r, self->m, self->vl, self->cld0, self->cld);
}
|
||||
|
||||
static plan *mkcldw(const hc2hc_solver *ego_,
|
||||
rdft_kind kind, INT r, INT m, INT s, INT vl, INT vs,
|
||||
INT mstart, INT mcount,
|
||||
R *IO, planner *plnr)
|
||||
{
|
||||
P *pln;
|
||||
plan *cld0 = 0, *cld = 0;
|
||||
INT mstart1, mcount1, mstride;
|
||||
|
||||
static const plan_adt padt = {
|
||||
0, awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego_);
|
||||
|
||||
A(mstart >= 0 && mcount > 0 && mstart + mcount <= (m+2)/2);
|
||||
|
||||
if (!applicable(kind, r, m, plnr))
|
||||
return (plan *)0;
|
||||
|
||||
A(m % 2);
|
||||
mstart1 = mstart + (mstart == 0);
|
||||
mcount1 = mcount - (mstart == 0);
|
||||
mstride = m - (mstart + mcount - 1) - mstart1;
|
||||
|
||||
/* 0th (DC) transform (vl of these), if mstart == 0 */
|
||||
cld0 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft_1_d)(
|
||||
mstart == 0 ? X(mktensor_1d)(r, m * s, m * s)
|
||||
: X(mktensor_0d)(),
|
||||
X(mktensor_1d)(vl, vs, vs),
|
||||
IO, IO, kind)
|
||||
);
|
||||
if (!cld0) goto nada;
|
||||
|
||||
/* twiddle transforms: there are 2 x mcount1 x vl of these
|
||||
(where 2 corresponds to the real and imaginary parts) ...
|
||||
the 2 x mcount1 loops are combined if mstart=0 and mcount=(m+2)/2. */
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft_1_d)(
|
||||
X(mktensor_1d)(r, m * s, m * s),
|
||||
X(mktensor_3d)(2, mstride * s, mstride * s,
|
||||
mcount1, s, s,
|
||||
vl, vs, vs),
|
||||
IO + s * mstart1, IO + s * mstart1, kind)
|
||||
);
|
||||
if (!cld) goto nada;
|
||||
|
||||
pln = MKPLAN_HC2HC(P, &padt, (kind == R2HC) ? apply_dit : apply_dif);
|
||||
pln->cld = cld;
|
||||
pln->cld0 = cld0;
|
||||
pln->r = r;
|
||||
pln->m = m;
|
||||
pln->s = s;
|
||||
pln->vl = vl;
|
||||
pln->vs = vs;
|
||||
pln->td = 0;
|
||||
pln->mstart1 = mstart1;
|
||||
pln->mcount1 = mcount1;
|
||||
|
||||
{
|
||||
double n0 = 0.5 * (r - 1) * (2 * mcount1) * vl;
|
||||
pln->super.super.ops = cld->ops;
|
||||
pln->super.super.ops.mul += (kind == R2HC ? 5.0 : 7.0) * n0;
|
||||
pln->super.super.ops.add += 4.0 * n0;
|
||||
pln->super.super.ops.other += 11.0 * n0;
|
||||
}
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld);
|
||||
X(plan_destroy_internal)(cld0);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
/* Register a solver built around mkcldw for radix argument r.  When the
   mksolver_hc2hc hook pointer is set, a second solver created through
   the hook is registered as well. */
static void regsolver(planner *plnr, INT r)
{
     S *made;

     made = (S *) X(mksolver_hc2hc)(sizeof(S), r, mkcldw);
     REGISTER_SOLVER(plnr, &(made->super));

     if (!X(mksolver_hc2hc_hook))
          return;

     made = (S *) X(mksolver_hc2hc_hook)(sizeof(S), r, mkcldw);
     REGISTER_SOLVER(plnr, &(made->super));
}
|
||||
|
||||
/* Public entry point: register the generic hc2hc solver with planner p,
   passing 0 as the radix argument. */
void X(hc2hc_generic_register)(planner *p)
{
     regsolver(p, 0);
}
|
||||
214
fftw-3.3.10/rdft/hc2hc.c
Normal file
214
fftw-3.3.10/rdft/hc2hc.c
Normal file
@@ -0,0 +1,214 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/hc2hc.h"
|
||||
|
||||
/* Optional alternate solver constructor with the same parameter list as
   mksolver_hc2hc.  It is 0 (unset) by default. */
hc2hc_solver *(*X(mksolver_hc2hc_hook))(size_t, INT, hc2hc_mkinferior) = 0;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
plan *cld;
|
||||
plan *cldw;
|
||||
INT r;
|
||||
} P;
|
||||
|
||||
/* DIT order: run the rdft child from I to O, then the hc2hc child in
   place on O. */
static void apply_dit(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     plan_rdft *child = (plan_rdft *) self->cld;
     plan_hc2hc *twiddle = (plan_hc2hc *) self->cldw;

     child->apply(self->cld, I, O);
     twiddle->apply(self->cldw, O);
}
|
||||
|
||||
/* DIF order: run the hc2hc child in place on I (so the input is
   overwritten), then the rdft child from I to O. */
static void apply_dif(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     plan_hc2hc *twiddle = (plan_hc2hc *) self->cldw;
     plan_rdft *child = (plan_rdft *) self->cld;

     twiddle->apply(self->cldw, I);
     child->apply(self->cld, I, O);
}
|
||||
|
||||
/* Forward the wakefulness state to both children: cld first, then cldw. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld, wakefulness);
     X(plan_awake)(self->cldw, wakefulness);
}
|
||||
|
||||
/* Destroy both children: cldw first, then cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cldw);
     X(plan_destroy_internal)(self->cld);
}
|
||||
|
||||
/* Print this plan as "(rdft-ct-<dit|dif>/r(cldw)(cld))". */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     /* the variant is recovered from which apply routine was installed */
     const char *variant = (self->super.apply == apply_dit) ? "dit" : "dif";

     p->print(p, "(rdft-ct-%s/%D%(%p%)%(%p%))",
              variant, self->r, self->cldw, self->cld);
}
|
||||
|
||||
/* Basic applicability test.  Returns 1 when the problem is a rank-1
   transform with vector rank <= 1, of kind R2HC or (input-destroying)
   HC2R, and choose_radix yields a positive radix smaller than the
   transform size; returns 0 otherwise. */
static int applicable0(const hc2hc_solver *ego, const problem *p_, planner *plnr)
{
     const problem_rdft *p = (const problem_rdft *) p_;
     INT radix;

     if (p->sz->rnk != 1)
          return 0;
     if (p->vecsz->rnk > 1)
          return 0;

     if (p->kind[0] == R2HC) {
          /* the problem is R2HC, which is solved by DIT */
     } else if (p->kind[0] == HC2R) {
          /* the problem is HC2R, which is solved by DIF; DIF destroys
             the input, so it must be in place or destruction allowed */
          if (p->I != p->O && NO_DESTROY_INPUTP(plnr))
               return 0;
     } else {
          return 0;
     }

     radix = X(choose_radix)(ego->r, p->sz->dims[0].n);
     if (radix <= 0)
          return 0;

     return p->sz->dims[0].n > radix;
}
|
||||
|
||||
/* Full applicability test: applicable0 must hold, and in addition the
   problem must either have vector rank 0 or the planner must not have
   NO_VRECURSEP set.  Returns 1 or 0. */
int X(hc2hc_applicable)(const hc2hc_solver *ego, const problem *p_, planner *plnr)
{
     const problem_rdft *p;

     if (!applicable0(ego, p_, plnr))
          return 0;

     p = (const problem_rdft *) p_;
     if (p->vecsz->rnk == 0)
          return 1;

     return !NO_VRECURSEP(plnr);
}
|
||||
|
||||
/* Build a plan for a rank-1 R2HC or HC2R problem by splitting the size
   n into r * m, where r comes from choose_radix.

   R2HC: the twiddle child (cldw) is created on the output array and the
         rdft child (cld) maps I to O; the plan uses apply_dit.
   HC2R: the twiddle child is created on the input array and the rdft
         child maps I to O; the plan uses apply_dif.

   Returns 0 if the solver is not applicable or either child cannot be
   planned; any child already created is destroyed on that path. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const hc2hc_solver *ego = (const hc2hc_solver *) ego_;
     const problem_rdft *p;
     P *pln = 0;
     plan *cld = 0, *cldw = 0;
     INT n, r, m, v, ivs, ovs;
     iodim *d;

     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };

     if (NO_NONTHREADEDP(plnr) || !X(hc2hc_applicable)(ego, p_, plnr))
          return (plan *) 0;

     p = (const problem_rdft *) p_;
     d = p->sz->dims;
     n = d[0].n;
     r = X(choose_radix)(ego->r, n);
     m = n / r;

     /* collapse the vector loop into a single (v, ivs, ovs) triple */
     X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);

     switch (p->kind[0]) {
         case R2HC:
              cldw = ego->mkcldw(ego,
                                 R2HC, r, m, d[0].os, v, ovs, 0, (m+2)/2,
                                 p->O, plnr);
              if (!cldw) goto nada;

              cld = X(mkplan_d)(plnr,
                                X(mkproblem_rdft_d)(
                                     X(mktensor_1d)(m, r * d[0].is, d[0].os),
                                     X(mktensor_2d)(r, d[0].is, m * d[0].os,
                                                    v, ivs, ovs),
                                     p->I, p->O, p->kind)
                   );
              if (!cld) goto nada;

              pln = MKPLAN_RDFT(P, &padt, apply_dit);
              break;

         case HC2R:
              cldw = ego->mkcldw(ego,
                                 HC2R, r, m, d[0].is, v, ivs, 0, (m+2)/2,
                                 p->I, plnr);
              if (!cldw) goto nada;

              cld = X(mkplan_d)(plnr,
                                X(mkproblem_rdft_d)(
                                     X(mktensor_1d)(m, d[0].is, r * d[0].os),
                                     X(mktensor_2d)(r, m * d[0].is, d[0].os,
                                                    v, ivs, ovs),
                                     p->I, p->O, p->kind)
                   );
              if (!cld) goto nada;

              pln = MKPLAN_RDFT(P, &padt, apply_dif);
              break;

         default:
              /* applicable0 only admits R2HC and HC2R, so this is not
                 expected to be reached.  Fail cleanly rather than fall
                 through with pln == 0 if the assertion is compiled out. */
              A(0);
              goto nada;
     }

     pln->cld = cld;
     pln->cldw = cldw;
     pln->r = r;
     X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);

     /* inherit could_prune_now_p attribute from cldw */
     pln->super.super.could_prune_now_p = cldw->could_prune_now_p;

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cldw);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
|
||||
|
||||
/* Allocate a solver object of `size` bytes bound to this file's mkplan
   and record the radix argument r and the twiddle-plan constructor
   mkcldw in it. */
hc2hc_solver *X(mksolver_hc2hc)(size_t size, INT r, hc2hc_mkinferior mkcldw)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     hc2hc_solver *made;

     made = (hc2hc_solver *) X(mksolver)(size, &sadt);
     made->mkcldw = mkcldw;
     made->r = r;

     return made;
}
|
||||
|
||||
/* Allocate a plan of `size` bytes with method table adt, store the
   hc2hc apply routine in it, and return it as a generic plan pointer. */
plan *X(mkplan_hc2hc)(size_t size, const plan_adt *adt, hc2hcapply apply)
{
     plan_hc2hc *made = (plan_hc2hc *) X(mkplan)(size, adt);

     made->apply = apply;
     return &(made->super);
}
|
||||
54
fftw-3.3.10/rdft/hc2hc.h
Normal file
54
fftw-3.3.10/rdft/hc2hc.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef FFTW_RDFT_HC2HC_H
#define FFTW_RDFT_HC2HC_H

#include "rdft/rdft.h"

/* In-place apply routine of an hc2hc plan: operates on the array IO. */
typedef void (*hc2hcapply) (const plan *ego, R *IO);

typedef struct hc2hc_solver_s hc2hc_solver;

/* Constructor for the twiddle child plan used by an hc2hc solver.
   Returns the new plan, or 0 if none could be created. */
typedef plan *(*hc2hc_mkinferior)(const hc2hc_solver *ego,
                                  rdft_kind kind, INT r, INT m, INT s,
                                  INT vl, INT vs, INT mstart, INT mcount,
                                  R *IO, planner *plnr);

/* Base type of every hc2hc plan: a generic plan plus its apply routine. */
typedef struct {
     plan super;
     hc2hcapply apply;
} plan_hc2hc;

extern plan *X(mkplan_hc2hc)(size_t size, const plan_adt *adt,
                             hc2hcapply apply);

/* Allocate an hc2hc plan of the given concrete type.  The expansion is
   fully parenthesized so that it can be used inside larger expressions. */
#define MKPLAN_HC2HC(type, adt, apply) \
  ((type *)X(mkplan_hc2hc)(sizeof(type), adt, apply))

/* Solver object: a generic solver, a radix argument, and the constructor
   for the twiddle child plan. */
struct hc2hc_solver_s {
     solver super;
     INT r;

     hc2hc_mkinferior mkcldw;
};

hc2hc_solver *X(mksolver_hc2hc)(size_t size, INT r, hc2hc_mkinferior mkcldw);

/* Optional alternate solver constructor; 0 when unset. */
extern hc2hc_solver *(*X(mksolver_hc2hc_hook))(size_t, INT, hc2hc_mkinferior);

void X(regsolver_hc2hc_direct)(planner *plnr, khc2hc codelet,
                               const hc2hc_desc *desc);

int X(hc2hc_applicable)(const hc2hc_solver *, const problem *, planner *);

#endif /* FFTW_RDFT_HC2HC_H */
|
||||
234
fftw-3.3.10/rdft/indirect.c
Normal file
234
fftw-3.3.10/rdft/indirect.c
Normal file
@@ -0,0 +1,234 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/* solvers/plans for vectors of small RDFT's that cannot be done
|
||||
in-place directly. Use a rank-0 plan to rearrange the data
|
||||
before or after the transform. Can also change an out-of-place
|
||||
plan into a copy + in-place (where the in-place transform
|
||||
is e.g. unit stride). */
|
||||
|
||||
/* FIXME: merge with rank-geq2.c(?), since this is just a special case
|
||||
of a rank split where the first/second transform has rank 0. */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef problem *(*mkcld_t) (const problem_rdft *p);
|
||||
|
||||
typedef struct {
|
||||
rdftapply apply;
|
||||
problem *(*mkcld)(const problem_rdft *p);
|
||||
const char *nam;
|
||||
} ndrct_adt;
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
const ndrct_adt *adt;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
plan *cldcpy, *cld;
|
||||
const S *slv;
|
||||
} P;
|
||||
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* first rearrange, then transform */
|
||||
/* Copy I to O with the copy child, then run the transform child in
   place on O. */
static void apply_before(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     plan_rdft *copier = (plan_rdft *) self->cldcpy;
     plan_rdft *xform = (plan_rdft *) self->cld;

     copier->apply(self->cldcpy, I, O);
     xform->apply(self->cld, O, O);
}
|
||||
|
||||
static problem *mkcld_before(const problem_rdft *p)
|
||||
{
|
||||
return X(mkproblem_rdft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_OS),
|
||||
X(tensor_copy_inplace)(p->vecsz, INPLACE_OS),
|
||||
p->O, p->O, p->kind);
|
||||
}
|
||||
|
||||
static const ndrct_adt adt_before =
|
||||
{
|
||||
apply_before, mkcld_before, "rdft-indirect-before"
|
||||
};
|
||||
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* first transform, then rearrange */
|
||||
|
||||
/* Run the transform child in place on I (overwriting the input), then
   copy I to O with the copy child. */
static void apply_after(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     plan_rdft *xform = (plan_rdft *) self->cld;
     plan_rdft *copier = (plan_rdft *) self->cldcpy;

     xform->apply(self->cld, I, I);
     copier->apply(self->cldcpy, I, O);
}
|
||||
|
||||
static problem *mkcld_after(const problem_rdft *p)
|
||||
{
|
||||
return X(mkproblem_rdft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_IS),
|
||||
X(tensor_copy_inplace)(p->vecsz, INPLACE_IS),
|
||||
p->I, p->I, p->kind);
|
||||
}
|
||||
|
||||
static const ndrct_adt adt_after =
|
||||
{
|
||||
apply_after, mkcld_after, "rdft-indirect-after"
|
||||
};
|
||||
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* Destroy both children: the transform child first, then the copy child. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cld);
     X(plan_destroy_internal)(self->cldcpy);
}
|
||||
|
||||
/* Forward the wakefulness state to the copy child, then to the
   transform child. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cldcpy, wakefulness);
     X(plan_awake)(self->cld, wakefulness);
}
|
||||
|
||||
/* Print this plan as "(<variant name>(cld)(cldcpy))", taking the name
   from the owning solver's method table. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;

     p->print(p, "(%s%(%p%)%(%p%))",
              self->slv->adt->nam, self->cld, self->cldcpy);
}
|
||||
|
||||
static int applicable0(const solver *ego_, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
return (1
|
||||
&& FINITE_RNK(p->vecsz->rnk)
|
||||
|
||||
/* problem must be a nontrivial transform, not just a copy */
|
||||
&& p->sz->rnk > 0
|
||||
|
||||
&& (0
|
||||
|
||||
/* problem must be in-place & require some
|
||||
rearrangement of the data */
|
||||
|| (p->I == p->O
|
||||
&& !(X(tensor_inplace_strides2)(p->sz, p->vecsz)))
|
||||
|
||||
/* or problem must be out of place, transforming
|
||||
from stride 1/2 to bigger stride, for apply_after */
|
||||
|| (p->I != p->O && ego->adt->apply == apply_after
|
||||
&& !NO_DESTROY_INPUTP(plnr)
|
||||
&& X(tensor_min_istride)(p->sz) <= 2
|
||||
&& X(tensor_min_ostride)(p->sz) > 2)
|
||||
|
||||
/* or problem must be out of place, transforming
|
||||
to stride 1/2 from bigger stride, for apply_before */
|
||||
|| (p->I != p->O && ego->adt->apply == apply_before
|
||||
&& X(tensor_min_ostride)(p->sz) <= 2
|
||||
&& X(tensor_min_istride)(p->sz) > 2)
|
||||
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
if (!applicable0(ego_, p_, plnr)) return 0;
|
||||
|
||||
if (NO_INDIRECT_OP_P(plnr)) {
|
||||
const problem_rdft *p = (const problem_rdft *)p_;
|
||||
if (p->I != p->O) return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Build an indirect plan: a rank-0 copy from I to O over the combined
   (vecsz, sz) tensor, plus the in-place transform produced by the
   variant's mkcld, planned with NO_BUFFERING.  Returns 0 if the solver
   is not applicable or either child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };

     const S *self = (const S *) ego_;
     const problem_rdft *p = (const problem_rdft *) p_;
     plan *cld = 0, *cldcpy = 0;

     if (!applicable(ego_, p_, plnr))
          return (plan *) 0;

     /* the copy child is planned first, then the transform child */
     cldcpy = X(mkplan_d)(plnr,
                          X(mkproblem_rdft_0_d)(
                               X(tensor_append)(p->vecsz, p->sz),
                               p->I, p->O));
     if (cldcpy) {
          cld = X(mkplan_f_d)(plnr, self->adt->mkcld(p), NO_BUFFERING, 0, 0);
          if (cld) {
               P *pln = MKPLAN_RDFT(P, &padt, self->adt->apply);

               pln->cld = cld;
               pln->cldcpy = cldcpy;
               pln->slv = self;
               X(ops_add)(&cld->ops, &cldcpy->ops, &pln->super.super.ops);

               return &(pln->super.super);
          }
     }

     /* failure: release whatever was created */
     X(plan_destroy_internal)(cld);
     X(plan_destroy_internal)(cldcpy);
     return (plan *)0;
}
|
||||
|
||||
static solver *mksolver(const ndrct_adt *adt)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->adt = adt;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register one solver per variant with planner p: "before" first, then
   "after". */
void X(rdft_indirect_register)(planner *p)
{
     static const ndrct_adt *const adts[] = {
          &adt_before, &adt_after
     };
     unsigned idx = 0;

     while (idx < sizeof(adts) / sizeof(adts[0])) {
          REGISTER_SOLVER(p, mksolver(adts[idx]));
          ++idx;
     }
}
|
||||
28
fftw-3.3.10/rdft/khc2c.c
Normal file
28
fftw-3.3.10/rdft/khc2c.c
Normal file
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "ct-hc2c.h"
|
||||
|
||||
void X(khc2c_register)(planner *p, khc2c codelet, const hc2c_desc *desc,
|
||||
hc2c_kind hc2ckind)
|
||||
{
|
||||
X(regsolver_hc2c_direct)(p, codelet, desc, hc2ckind);
|
||||
}
|
||||
27
fftw-3.3.10/rdft/khc2hc.c
Normal file
27
fftw-3.3.10/rdft/khc2hc.c
Normal file
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/hc2hc.h"
|
||||
|
||||
/* Register an hc2hc codelet with planner p by forwarding all arguments
   unchanged to regsolver_hc2hc_direct. */
void X(khc2hc_register)(planner *p, khc2hc codelet, const hc2hc_desc *desc)
{
     X(regsolver_hc2hc_direct)(p, codelet, desc);
}
|
||||
29
fftw-3.3.10/rdft/kr2c.c
Normal file
29
fftw-3.3.10/rdft/kr2c.c
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* Register three solvers built from one r2c codelet, in this order:
   rdft r2c direct, rdft r2c directbuf, rdft2 direct. */
void X(kr2c_register)(planner *p, kr2c codelet, const kr2c_desc *desc)
{
     REGISTER_SOLVER(p, X(mksolver_rdft_r2c_direct)(codelet, desc));

     REGISTER_SOLVER(p, X(mksolver_rdft_r2c_directbuf)(codelet, desc));

     REGISTER_SOLVER(p, X(mksolver_rdft2_direct)(codelet, desc));
}
|
||||
27
fftw-3.3.10/rdft/kr2r.c
Normal file
27
fftw-3.3.10/rdft/kr2r.c
Normal file
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* Register the direct r2r solver built from the given codelet and
   descriptor with planner p. */
void X(kr2r_register)(planner *p, kr2r codelet, const kr2r_desc *desc)
{
     REGISTER_SOLVER(p, X(mksolver_rdft_r2r_direct)(codelet, desc));
}
|
||||
82
fftw-3.3.10/rdft/nop.c
Normal file
82
fftw-3.3.10/rdft/nop.c
Normal file
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* plans for vrank -infty RDFTs (nothing to do) */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* No-op apply routine: there is nothing to compute, so every argument
   is deliberately ignored. */
static void apply(const plan *ego_, R *I, R *O)
{
     UNUSED(ego_);
     UNUSED(I);
     UNUSED(O);
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_)
|
||||
{
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
UNUSED(ego_);
|
||||
return 0
|
||||
/* case 1 : -infty vector rank */
|
||||
|| (p->vecsz->rnk == RNK_MINFTY)
|
||||
|
||||
/* case 2 : rank-0 in-place rdft */
|
||||
|| (1
|
||||
&& p->sz->rnk == 0
|
||||
&& FINITE_RNK(p->vecsz->rnk)
|
||||
&& p->O == p->I
|
||||
&& X(tensor_inplace_strides)(p->vecsz)
|
||||
);
|
||||
}
|
||||
|
||||
/* Print the fixed description "(rdft-nop)"; the plan itself is unused. */
static void print(const plan *ego, printer *p)
{
     UNUSED(ego);

     p->print(p, "(rdft-nop)");
}
|
||||
|
||||
/* Create a no-op rdft plan with zero operation counts, or return 0 when
   the problem is not applicable.  The planner is not consulted. */
static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), X(null_awake), print, X(plan_null_destroy)
     };
     plan_rdft *made;

     UNUSED(plnr);

     if (!applicable(ego, p))
          return (plan *) 0;

     made = MKPLAN_RDFT(plan_rdft, &padt, apply);
     X(ops_zero)(&made->super.ops);

     return &(made->super);
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
|
||||
return MKSOLVER(solver, &sadt);
|
||||
}
|
||||
|
||||
/* Public entry point: register the rdft no-op solver with planner p. */
void X(rdft_nop_register)(planner *p)
{
     REGISTER_SOLVER(p, mksolver());
}
|
||||
90
fftw-3.3.10/rdft/nop2.c
Normal file
90
fftw-3.3.10/rdft/nop2.c
Normal file
@@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* plans for vrank -infty RDFT2s (nothing to do), as well as in-place
|
||||
rank-0 HC2R. Note that in-place rank-0 R2HC is *not* a no-op, because
|
||||
we have to set the imaginary parts of the output to zero. */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* No-op apply routine: there is nothing to compute, so every argument
   is deliberately ignored. */
static void apply(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     UNUSED(ego_);
     UNUSED(r0);
     UNUSED(r1);
     UNUSED(cr);
     UNUSED(ci);
}
|
||||
|
||||
static int applicable(const solver *ego_, const problem *p_)
|
||||
{
|
||||
const problem_rdft2 *p = (const problem_rdft2 *) p_;
|
||||
UNUSED(ego_);
|
||||
|
||||
return(0
|
||||
/* case 1 : -infty vector rank */
|
||||
|| (p->vecsz->rnk == RNK_MINFTY)
|
||||
|
||||
/* case 2 : rank-0 in-place rdft, except that
|
||||
R2HC is not a no-op because it sets the imaginary
|
||||
part to 0 */
|
||||
|| (1
|
||||
&& p->kind != R2HC
|
||||
&& p->sz->rnk == 0
|
||||
&& FINITE_RNK(p->vecsz->rnk)
|
||||
&& (p->r0 == p->cr)
|
||||
&& X(rdft2_inplace_strides)(p, RNK_MINFTY)
|
||||
));
|
||||
}
|
||||
|
||||
/* Print the fixed description "(rdft2-nop)"; the plan itself is unused. */
static void print(const plan *ego, printer *p)
{
     UNUSED(ego);

     p->print(p, "(rdft2-nop)");
}
|
||||
|
||||
/* Create a no-op rdft2 plan with zero operation counts, or return 0
   when the problem is not applicable.  The planner is not consulted. */
static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft2_solve), X(null_awake), print, X(plan_null_destroy)
     };
     plan_rdft2 *made;

     UNUSED(plnr);

     if (!applicable(ego, p))
          return (plan *) 0;

     made = MKPLAN_RDFT2(plan_rdft2, &padt, apply);
     X(ops_zero)(&made->super.ops);

     return &(made->super);
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
|
||||
return MKSOLVER(solver, &sadt);
|
||||
}
|
||||
|
||||
/* Public entry point: register the rdft2 no-op solver with planner p. */
void X(rdft2_nop_register)(planner *p)
{
     REGISTER_SOLVER(p, mksolver());
}
|
||||
32
fftw-3.3.10/rdft/plan.c
Normal file
32
fftw-3.3.10/rdft/plan.c
Normal file
@@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* Allocate a plan of `size` bytes with method table adt, store the rdft
   apply routine in it, and return it as a generic plan pointer. */
plan *X(mkplan_rdft)(size_t size, const plan_adt *adt, rdftapply apply)
{
     plan_rdft *made = (plan_rdft *) X(mkplan)(size, adt);

     made->apply = apply;
     return &(made->super);
}
|
||||
32
fftw-3.3.10/rdft/plan2.c
Normal file
32
fftw-3.3.10/rdft/plan2.c
Normal file
@@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* Allocate a plan of `size' bytes through X(mkplan) and record `apply'
   as its RDFT2 apply routine.  Returns a pointer to the embedded
   generic plan. */
plan *X(mkplan_rdft2)(size_t size, const plan_adt *adt, rdft2apply apply)
{
     plan_rdft2 *pln = (plan_rdft2 *) X(mkplan)(size, adt);

     pln->apply = apply;
     return &pln->super;
}
|
||||
238
fftw-3.3.10/rdft/problem.c
Normal file
238
fftw-3.3.10/rdft/problem.c
Normal file
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
#include <stddef.h>
|
||||
|
||||
/* Release an RDFT problem: the separately allocated kind array (only
   when neither struct hack is in use), both tensors, and finally the
   problem itself. */
static void destroy(problem *ego_)
{
     problem_rdft *prb = (problem_rdft *) ego_;

#if !(defined(STRUCT_HACK_C99) || defined(STRUCT_HACK_KR))
     X(ifree0)(prb->kind);
#endif
     X(tensor_destroy2)(prb->vecsz, prb->sz);
     X(ifree)(ego_);
}
|
||||
|
||||
/* Feed the first `rnk' entries of `kind' into the md5 state `m',
   in order. */
static void kind_hash(md5 *m, const rdft_kind *kind, int rnk)
{
     int idx = 0;

     while (idx < rnk) {
          X(md5int)(m, kind[idx]);
          ++idx;
     }
}
|
||||
|
||||
/* Hash an RDFT problem into `m': a tag string, whether I and O are the
   same pointer, the per-dimension kinds, the alignment of both arrays,
   and the two tensors. */
static void hash(const problem *p_, md5 *m)
{
     const problem_rdft *prb = (const problem_rdft *) p_;
     const int same_array = (prb->I == prb->O);

     X(md5puts)(m, "rdft");
     X(md5int)(m, same_array);
     kind_hash(m, prb->kind, prb->sz->rnk);
     X(md5int)(m, X(ialignment_of)(prb->I));
     X(md5int)(m, X(ialignment_of)(prb->O));
     X(tensor_md5)(m, prb->sz);
     X(tensor_md5)(m, prb->vecsz);
}
|
||||
|
||||
/* Store K(0.0) into every element of I addressed by the `rnk'
   dimensions in `dims', using each dimension's n and is fields.
   A rank of RNK_MINFTY (or any negative rank) touches nothing; rank 0
   touches only I[0]. */
static void recur(const iodim *dims, int rnk, R *I)
{
     INT k, n, is;

     if (rnk == RNK_MINFTY)
          return;

     if (rnk == 0) {
          I[0] = K(0.0);
          return;
     }

     if (rnk < 0)
          return;

     n = dims[0].n;
     is = dims[0].is;

     if (rnk == 1) {
          /* innermost dimension handled directly instead of recursing
             down to rank 0: redundant but faster */
          for (k = 0; k < n; ++k)
               I[k * is] = K(0.0);
          return;
     }

     for (k = 0; k < n; ++k)
          recur(dims + 1, rnk - 1, I + k * is);
}
|
||||
|
||||
/* Zero every element of I that is addressed by tensor `sz' (input
   strides are used). */
void X(rdft_zerotens)(tensor *sz, R *I)
{
     const int rnk = sz->rnk;

     recur(sz->dims, rnk, I);
}
|
||||
|
||||
#define KSTR_LEN 8
|
||||
|
||||
/* Return the printable name of an rdft_kind.  The table is indexed
   directly by the numeric value of `kind', so its order must match the
   enumeration order of rdft_kind.  The returned string has static
   storage duration. */
const char *X(rdft_kind_str)(rdft_kind kind)
{
     static const char names[][KSTR_LEN] = {
          "r2hc", "r2hc01", "r2hc10", "r2hc11",
          "hc2r", "hc2r01", "hc2r10", "hc2r11",
          "dht",
          "redft00", "redft01", "redft10", "redft11",
          "rodft00", "rodft01", "rodft10", "rodft11"
     };

     /* each row is exactly KSTR_LEN bytes wide */
     A(kind >= 0 && kind < sizeof(names) / sizeof(names[0]));
     return names[kind];
}
|
||||
|
||||
/* Print an RDFT problem as
   "(rdft <alignment of I> <O - I> <sz> <vecsz> <kind>...)",
   with one numeric kind per dimension of sz. */
static void print(const problem *ego_, printer *p)
{
     const problem_rdft *prb = (const problem_rdft *) ego_;
     int d = 0;

     p->print(p, "(rdft %d %D %T %T",
              X(ialignment_of)(prb->I),
              (INT)(prb->O - prb->I),
              prb->sz,
              prb->vecsz);

     while (d < prb->sz->rnk) {
          p->print(p, " %d", (int)prb->kind[d]);
          ++d;
     }

     p->print(p, ")");
}
|
||||
|
||||
static void zero(const problem *ego_)
|
||||
{
|
||||
const problem_rdft *ego = (const problem_rdft *) ego_;
|
||||
tensor *sz = X(tensor_append)(ego->vecsz, ego->sz);
|
||||
X(rdft_zerotens)(sz, UNTAINT(ego->I));
|
||||
X(tensor_destroy)(sz);
|
||||
}
|
||||
|
||||
static const problem_adt padt =
|
||||
{
|
||||
PROBLEM_RDFT,
|
||||
hash,
|
||||
zero,
|
||||
print,
|
||||
destroy
|
||||
};
|
||||
|
||||
/* Dimensions of size 1 that are not REDFT/RODFT are no-ops and can be
|
||||
eliminated. REDFT/RODFT unit dimensions often have factors of 2.0
|
||||
and suchlike from normalization and phases, although in principle
|
||||
these constant factors from different dimensions could be combined. */
|
||||
static int nontrivial(const iodim *d, rdft_kind kind)
|
||||
{
|
||||
return (d->n > 1 || kind == R2HC11 || kind == HC2R11
|
||||
|| (REODFT_KINDP(kind) && kind != REDFT01 && kind != RODFT01));
|
||||
}
|
||||
|
||||
/* Build an RDFT problem from transform tensor `sz', vector tensor
   `vecsz', arrays I/O and one rdft_kind per dimension of sz.

   The input tensors are not consumed.  Trivial dimensions (see
   nontrivial()) are dropped, the remaining dimensions are sorted with
   X(dimcmp) while keeping each kind attached to its dimension, and
   several size-2 kinds are canonicalized to R2HC.  An in-place problem
   for which X(tensor_inplace_locations) fails yields the "unsolvable"
   problem instead. */
problem *X(mkproblem_rdft)(const tensor *sz, const tensor *vecsz,
                           R *I, R *O, const rdft_kind *kind)
{
     problem_rdft *prb;
     int nkept;          /* number of dimensions that survive */
     int src, a, b;

     A(X(tensor_kosherp)(sz));
     A(X(tensor_kosherp)(vecsz));
     A(FINITE_RNK(sz->rnk));

     /* same underlying array: make both pointers identical */
     if (UNTAINT(I) == UNTAINT(O))
          I = O = JOIN_TAINT(I, O);

     if (I == O && !X(tensor_inplace_locations)(sz, vecsz))
          return X(mkproblem_unsolvable)();

     /* first pass: count the dimensions that will be kept */
     nkept = 0;
     for (src = 0; src < sz->rnk; ++src) {
          A(sz->dims[src].n > 0);
          if (nontrivial(sz->dims + src, kind[src]))
               ++nkept;
     }

     /* allocate the problem; where the kind array lives depends on
        which struct-hack variant (if any) is configured */
#if defined(STRUCT_HACK_KR)
     prb = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft)
                                         + sizeof(rdft_kind)
                                         * (nkept > 0 ? nkept - 1u : 0u), &padt);
#elif defined(STRUCT_HACK_C99)
     prb = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft)
                                         + sizeof(rdft_kind) * (unsigned)nkept, &padt);
#else
     prb = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft), &padt);
     prb->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * (unsigned)nkept, PROBLEMS);
#endif

     /* do compression and sorting as in X(tensor_compress), but take
        transform kind into account (sigh) */
     prb->sz = X(mktensor)(nkept);

     /* second pass: copy the surviving dimensions and their kinds */
     nkept = 0;
     for (src = 0; src < sz->rnk; ++src) {
          if (!nontrivial(sz->dims + src, kind[src]))
               continue;
          prb->kind[nkept] = kind[src];
          prb->sz->dims[nkept] = sz->dims[src];
          ++nkept;
     }

     /* exchange sort on dims, moving kinds in lock-step */
     for (a = 0; a + 1 < nkept; ++a) {
          for (b = a + 1; b < nkept; ++b) {
               iodim dtmp;
               rdft_kind ktmp;

               if (X(dimcmp)(prb->sz->dims + a, prb->sz->dims + b) <= 0)
                    continue;

               dtmp = prb->sz->dims[a];
               prb->sz->dims[a] = prb->sz->dims[b];
               prb->sz->dims[b] = dtmp;

               ktmp = prb->kind[a];
               prb->kind[a] = prb->kind[b];
               prb->kind[b] = ktmp;
          }
     }

     /* size-2 transforms are equivalent: canonicalize to R2HC */
     for (a = 0; a < nkept; ++a) {
          if (prb->sz->dims[a].n != 2)
               continue;
          if (prb->kind[a] == REDFT00
              || prb->kind[a] == DHT
              || prb->kind[a] == HC2R)
               prb->kind[a] = R2HC;
     }

     prb->vecsz = X(tensor_compress_contiguous)(vecsz);
     prb->I = I;
     prb->O = O;

     A(FINITE_RNK(prb->sz->rnk));

     return &prb->super;
}
|
||||
|
||||
/* Same as X(mkproblem_rdft), but also destroy input tensors. */
|
||||
problem *X(mkproblem_rdft_d)(tensor *sz, tensor *vecsz,
|
||||
R *I, R *O, const rdft_kind *kind)
|
||||
{
|
||||
problem *p = X(mkproblem_rdft)(sz, vecsz, I, O, kind);
|
||||
X(tensor_destroy2)(vecsz, sz);
|
||||
return p;
|
||||
}
|
||||
|
||||
/* As above, but for rnk <= 1 only and takes a scalar kind parameter */
|
||||
problem *X(mkproblem_rdft_1)(const tensor *sz, const tensor *vecsz,
|
||||
R *I, R *O, rdft_kind kind)
|
||||
{
|
||||
A(sz->rnk <= 1);
|
||||
return X(mkproblem_rdft)(sz, vecsz, I, O, &kind);
|
||||
}
|
||||
|
||||
/* Variant of X(mkproblem_rdft_d) for sz->rnk <= 1 that takes a single
   kind by value; `sz' and `vecsz' are destroyed by the callee. */
problem *X(mkproblem_rdft_1_d)(tensor *sz, tensor *vecsz,
                               R *I, R *O, rdft_kind kind)
{
     const rdft_kind *one_kind = &kind;

     A(sz->rnk <= 1);
     return X(mkproblem_rdft_d)(sz, vecsz, I, O, one_kind);
}
|
||||
|
||||
/* create a zero-dimensional problem */
|
||||
problem *X(mkproblem_rdft_0_d)(tensor *vecsz, R *I, R *O)
|
||||
{
|
||||
return X(mkproblem_rdft_d)(X(mktensor_0d)(), vecsz, I, O,
|
||||
(const rdft_kind *)0);
|
||||
}
|
||||
224
fftw-3.3.10/rdft/problem2.c
Normal file
224
fftw-3.3.10/rdft/problem2.c
Normal file
@@ -0,0 +1,224 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "dft/dft.h"
|
||||
#include "rdft/rdft.h"
|
||||
#include <stddef.h>
|
||||
|
||||
/* Release an RDFT2 problem: both tensors, then the problem itself. */
static void destroy(problem *ego_)
{
     problem_rdft2 *prb = (problem_rdft2 *) ego_;

     X(tensor_destroy2)(prb->vecsz, prb->sz);
     X(ifree)(ego_);
}
|
||||
|
||||
/* Hash an RDFT2 problem into `m': a tag string, whether r0 and cr are
   the same pointer, the r1-r0 and ci-cr offsets, the alignment of all
   four arrays, the kind, and the two tensors. */
static void hash(const problem *p_, md5 *m)
{
     const problem_rdft2 *prb = (const problem_rdft2 *) p_;
     const int same_array = (prb->r0 == prb->cr);

     X(md5puts)(m, "rdft2");
     X(md5int)(m, same_array);

     /* relative placement of the paired arrays */
     X(md5INT)(m, prb->r1 - prb->r0);
     X(md5INT)(m, prb->ci - prb->cr);

     X(md5int)(m, X(ialignment_of)(prb->r0));
     X(md5int)(m, X(ialignment_of)(prb->r1));
     X(md5int)(m, X(ialignment_of)(prb->cr));
     X(md5int)(m, X(ialignment_of)(prb->ci));

     X(md5int)(m, prb->kind);
     X(tensor_md5)(m, prb->sz);
     X(tensor_md5)(m, prb->vecsz);
}
|
||||
|
||||
/* Print an RDFT2 problem as "(rdft2 <cr == r0> <kind> <sz> <vecsz>)". */
static void print(const problem *ego_, printer *p)
{
     const problem_rdft2 *prb = (const problem_rdft2 *) ego_;
     const int same_array = (int)(prb->cr == prb->r0);
     const int kind_num = (int)(prb->kind);

     p->print(p, "(rdft2 %d %d %T %T)",
              same_array, kind_num, prb->sz, prb->vecsz);
}
|
||||
|
||||
/* Zero the real data addressed by `dims'/`rnk', where the innermost
   dimension is split across two arrays: I0 and I1 are advanced
   together by the input stride, one element of each per pair, with a
   final lone I0 element when n is odd.  A rank of RNK_MINFTY (or any
   negative rank) touches nothing; rank 0 touches only I0[0]. */
static void recur(const iodim *dims, int rnk, R *I0, R *I1)
{
     INT k, n, is;

     if (rnk == RNK_MINFTY)
          return;

     if (rnk == 0) {
          I0[0] = K(0.0);
          return;
     }

     if (rnk < 0)
          return;

     n = dims[0].n;
     is = dims[0].is;

     if (rnk > 1) {
          for (k = 0; k < n; ++k)
               recur(dims + 1, rnk - 1, I0 + k * is, I1 + k * is);
          return;
     }

     /* rnk == 1: zero elements two at a time, one from each array */
     k = 0;
     while (k < n - 1) {
          *I1 = K(0.0);
          *I0 = K(0.0);
          I0 += is;
          I1 += is;
          k += 2;
     }

     /* odd n: one trailing element lives in I0 */
     if (k < n)
          *I0 = K(0.0);
}
|
||||
|
||||
/* Walk the vector dimensions `vdims'/`vrnk' and, at each vector
   position, zero the transform data with recur().  A vector rank of
   RNK_MINFTY (or any negative rank) touches nothing. */
static void vrecur(const iodim *vdims, int vrnk,
                   const iodim *dims, int rnk, R *I0, R *I1)
{
     INT k, n, is;

     if (vrnk == RNK_MINFTY)
          return;

     if (vrnk == 0) {
          recur(dims, rnk, I0, I1);
          return;
     }

     if (vrnk < 0)
          return;

     n = vdims[0].n;
     is = vdims[0].is;

     for (k = 0; k < n; ++k)
          vrecur(vdims + 1, vrnk - 1,
                 dims, rnk, I0 + k * is, I1 + k * is);
}
|
||||
|
||||
/* Length of the complex (half) dimension corresponding to a real
   dimension of length real_n:
     R2HC / HC2R       -> real_n / 2 + 1
     R2HCII / HC2RIII  -> (real_n + 1) / 2
   Any other kind is a programming error (asserts, returns 0). */
INT X(rdft2_complex_n)(INT real_n, rdft_kind kind)
{
     if (kind == R2HC || kind == HC2R)
          return (real_n / 2) + 1;

     if (kind == R2HCII || kind == HC2RIII)
          return (real_n + 1) / 2;

     /* can't happen */
     A(0);
     return 0;
}
|
||||
|
||||
static void zero(const problem *ego_)
|
||||
{
|
||||
const problem_rdft2 *ego = (const problem_rdft2 *) ego_;
|
||||
if (R2HC_KINDP(ego->kind)) {
|
||||
/* FIXME: can we avoid the double recursion somehow? */
|
||||
vrecur(ego->vecsz->dims, ego->vecsz->rnk,
|
||||
ego->sz->dims, ego->sz->rnk,
|
||||
UNTAINT(ego->r0), UNTAINT(ego->r1));
|
||||
} else {
|
||||
tensor *sz;
|
||||
tensor *sz2 = X(tensor_copy)(ego->sz);
|
||||
int rnk = sz2->rnk;
|
||||
if (rnk > 0) /* ~half as many complex outputs */
|
||||
sz2->dims[rnk-1].n =
|
||||
X(rdft2_complex_n)(sz2->dims[rnk-1].n, ego->kind);
|
||||
sz = X(tensor_append)(ego->vecsz, sz2);
|
||||
X(tensor_destroy)(sz2);
|
||||
X(dft_zerotens)(sz, UNTAINT(ego->cr), UNTAINT(ego->ci));
|
||||
X(tensor_destroy)(sz);
|
||||
}
|
||||
}
|
||||
|
||||
static const problem_adt padt =
|
||||
{
|
||||
PROBLEM_RDFT2,
|
||||
hash,
|
||||
zero,
|
||||
print,
|
||||
destroy
|
||||
};
|
||||
|
||||
problem *X(mkproblem_rdft2)(const tensor *sz, const tensor *vecsz,
|
||||
R *r0, R *r1, R *cr, R *ci,
|
||||
rdft_kind kind)
|
||||
{
|
||||
problem_rdft2 *ego;
|
||||
|
||||
A(kind == R2HC || kind == R2HCII || kind == HC2R || kind == HC2RIII);
|
||||
A(X(tensor_kosherp)(sz));
|
||||
A(X(tensor_kosherp)(vecsz));
|
||||
A(FINITE_RNK(sz->rnk));
|
||||
|
||||
/* require in-place problems to use r0 == cr */
|
||||
if (UNTAINT(r0) == UNTAINT(ci))
|
||||
return X(mkproblem_unsolvable)();
|
||||
|
||||
/* FIXME: should check UNTAINT(r1) == UNTAINT(cr) but
|
||||
only if odd elements exist, which requires compressing the
|
||||
tensors first */
|
||||
|
||||
if (UNTAINT(r0) == UNTAINT(cr))
|
||||
r0 = cr = JOIN_TAINT(r0, cr);
|
||||
|
||||
ego = (problem_rdft2 *)X(mkproblem)(sizeof(problem_rdft2), &padt);
|
||||
|
||||
if (sz->rnk > 1) { /* have to compress rnk-1 dims separately, ugh */
|
||||
tensor *szc = X(tensor_copy_except)(sz, sz->rnk - 1);
|
||||
tensor *szr = X(tensor_copy_sub)(sz, sz->rnk - 1, 1);
|
||||
tensor *szcc = X(tensor_compress)(szc);
|
||||
if (szcc->rnk > 0)
|
||||
ego->sz = X(tensor_append)(szcc, szr);
|
||||
else
|
||||
ego->sz = X(tensor_compress)(szr);
|
||||
X(tensor_destroy2)(szc, szr); X(tensor_destroy)(szcc);
|
||||
} else {
|
||||
ego->sz = X(tensor_compress)(sz);
|
||||
}
|
||||
ego->vecsz = X(tensor_compress_contiguous)(vecsz);
|
||||
ego->r0 = r0;
|
||||
ego->r1 = r1;
|
||||
ego->cr = cr;
|
||||
ego->ci = ci;
|
||||
ego->kind = kind;
|
||||
|
||||
A(FINITE_RNK(ego->sz->rnk));
|
||||
return &(ego->super);
|
||||
|
||||
}
|
||||
|
||||
/* Same as X(mkproblem_rdft2), but also destroy input tensors. */
|
||||
problem *X(mkproblem_rdft2_d)(tensor *sz, tensor *vecsz,
|
||||
R *r0, R *r1, R *cr, R *ci, rdft_kind kind)
|
||||
{
|
||||
problem *p = X(mkproblem_rdft2)(sz, vecsz, r0, r1, cr, ci, kind);
|
||||
X(tensor_destroy2)(vecsz, sz);
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Same as X(mkproblem_rdft2_d), but with only one R pointer.
|
||||
Used by the API. */
|
||||
problem *X(mkproblem_rdft2_d_3pointers)(tensor *sz, tensor *vecsz,
|
||||
R *r0, R *cr, R *ci, rdft_kind kind)
|
||||
{
|
||||
problem *p;
|
||||
int rnk = sz->rnk;
|
||||
R *r1;
|
||||
|
||||
if (rnk == 0)
|
||||
r1 = r0;
|
||||
else if (R2HC_KINDP(kind)) {
|
||||
r1 = r0 + sz->dims[rnk-1].is;
|
||||
sz->dims[rnk-1].is *= 2;
|
||||
} else {
|
||||
r1 = r0 + sz->dims[rnk-1].os;
|
||||
sz->dims[rnk-1].os *= 2;
|
||||
}
|
||||
|
||||
p = X(mkproblem_rdft2)(sz, vecsz, r0, r1, cr, ci, kind);
|
||||
X(tensor_destroy2)(vecsz, sz);
|
||||
return p;
|
||||
}
|
||||
238
fftw-3.3.10/rdft/rank-geq2-rdft2.c
Normal file
238
fftw-3.3.10/rdft/rank-geq2-rdft2.c
Normal file
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* plans for RDFT2 of rank >= 2 (multidimensional) */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int spltrnk;
|
||||
const int *buddies;
|
||||
size_t nbuddies;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_dft super;
|
||||
plan *cldr, *cldc;
|
||||
const S *solver;
|
||||
} P;
|
||||
|
||||
/* R2HC direction: run the rdft2 child first (real -> complex), then
   the in-place complex child on (cr, ci). */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *pln = (const P *) ego_;
     plan_rdft2 *real_child = (plan_rdft2 *) pln->cldr;
     plan_dft *cplx_child = (plan_dft *) pln->cldc;

     real_child->apply((plan *) real_child, r0, r1, cr, ci);
     cplx_child->apply((plan *) cplx_child, cr, ci, cr, ci);
}
|
||||
|
||||
/* HC2R direction: run the in-place complex child first, with the
   real/imaginary pointers swapped (ci, cr), then the rdft2 child
   (complex -> real). */
static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *pln = (const P *) ego_;
     plan_dft *cplx_child = (plan_dft *) pln->cldc;
     plan_rdft2 *real_child = (plan_rdft2 *) pln->cldr;

     cplx_child->apply((plan *) cplx_child, ci, cr, ci, cr);
     real_child->apply((plan *) real_child, r0, r1, cr, ci);
}
|
||||
|
||||
/* Propagate the wakefulness state to both child plans (cldr, then
   cldc). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cldr, wakefulness);
     X(plan_awake)(pln->cldc, wakefulness);
}
|
||||
|
||||
/* Destroy both child plans (cldr, then cldc). */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cldr);
     X(plan_destroy_internal)(pln->cldc);
}
|
||||
|
||||
/* Print the plan as "(rdft2-rank>=2/<spltrnk>(<cldr>)(<cldc>))". */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const int split = pln->solver->spltrnk;

     p->print(p, "(rdft2-rank>=2/%d%(%p%)%(%p%))",
              split, pln->cldr, pln->cldc);
}
|
||||
|
||||
static int picksplit(const S *ego, const tensor *sz, int *rp)
|
||||
{
|
||||
A(sz->rnk > 1); /* cannot split rnk <= 1 */
|
||||
if (!X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies, sz, 1, rp))
|
||||
return 0;
|
||||
*rp += 1; /* convert from dim. index to rank */
|
||||
if (*rp >= sz->rnk) /* split must reduce rank */
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int applicable0(const solver *ego_, const problem *p_, int *rp,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_rdft2 *p = (const problem_rdft2 *) p_;
|
||||
const S *ego = (const S *)ego_;
|
||||
return (1
|
||||
&& FINITE_RNK(p->sz->rnk) && FINITE_RNK(p->vecsz->rnk)
|
||||
|
||||
/* FIXME: multidimensional R2HCII ? */
|
||||
&& (p->kind == R2HC || p->kind == HC2R)
|
||||
|
||||
&& p->sz->rnk >= 2
|
||||
&& picksplit(ego, p->sz, rp)
|
||||
&& (0
|
||||
|
||||
/* can work out-of-place, but HC2R destroys input */
|
||||
|| (p->r0 != p->cr &&
|
||||
(p->kind == R2HC || !NO_DESTROY_INPUTP(plnr)))
|
||||
|
||||
/* FIXME: what are sufficient conditions for inplace? */
|
||||
|| (p->r0 == p->cr))
|
||||
);
|
||||
}
|
||||
|
||||
/* TODO: revise this. */
|
||||
static int applicable(const solver *ego_, const problem *p_,
|
||||
const planner *plnr, int *rp)
|
||||
{
|
||||
const S *ego = (const S *)ego_;
|
||||
|
||||
if (!applicable0(ego_, p_, rp, plnr)) return 0;
|
||||
|
||||
if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0]))
|
||||
return 0;
|
||||
|
||||
if (NO_UGLYP(plnr)) {
|
||||
const problem_rdft2 *p = (const problem_rdft2 *) p_;
|
||||
|
||||
/* Heuristic: if the vector stride is greater than the transform
|
||||
size, don't use (prefer to do the vector loop first with a
|
||||
vrank-geq1 plan). */
|
||||
if (p->vecsz->rnk > 0 &&
|
||||
X(tensor_min_stride)(p->vecsz)
|
||||
> X(rdft2_tensor_max_index)(p->sz, p->kind))
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Build a rank >= 2 RDFT2 plan: sz is split into sz1 and sz2; an
   rdft2 child handles sz2 with sz1 folded into the vector loop, and an
   in-place complex DFT child handles sz1 over the complex array.
   Returns a null plan when the solver is not applicable or either
   child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft2_solve), awake, print, destroy
     };

     const S *slv = (const S *) ego_;
     const problem_rdft2 *p;
     P *pln;
     plan *cldr = 0, *cldc = 0;
     tensor *sz1, *sz2, *vecszi, *sz2i;
     iodim *last_c;      /* last dimension of the complex-side copy */
     R *c0, *c1;         /* array order handed to the complex child */
     int spltrnk;
     int forward;
     inplace_kind k;
     problem *cldp;

     if (!applicable(ego_, p_, plnr, &spltrnk))
          return (plan *) 0;

     p = (const problem_rdft2 *) p_;
     forward = (p->kind == R2HC);
     X(tensor_split)(p->sz, &sz1, spltrnk, &sz2);

     /* the complex child works in place on the complex array, which is
        the output for R2HC and the input for HC2R */
     k = forward ? INPLACE_OS : INPLACE_IS;
     vecszi = X(tensor_copy_inplace)(p->vecsz, k);
     sz2i = X(tensor_copy_inplace)(sz2, k);

     /* complex data is ~half of real */
     last_c = sz2i->dims + (sz2i->rnk - 1);
     last_c->n = last_c->n / 2 + 1;

     cldp = X(mkproblem_rdft2_d)(X(tensor_copy)(sz2),
                                 X(tensor_append)(p->vecsz, sz1),
                                 p->r0, p->r1,
                                 p->cr, p->ci, p->kind);
     cldr = X(mkplan_d)(plnr, cldp);
     if (!cldr)
          goto nada;

     /* HC2R must swap re/im parts to get IDFT */
     c0 = forward ? p->cr : p->ci;
     c1 = forward ? p->ci : p->cr;
     cldp = X(mkproblem_dft_d)(X(tensor_copy_inplace)(sz1, k),
                               X(tensor_append)(vecszi, sz2i),
                               c0, c1, c0, c1);
     cldc = X(mkplan_d)(plnr, cldp);
     if (!cldc)
          goto nada;

     pln = MKPLAN_RDFT2(P, &padt, forward ? apply_r2hc : apply_hc2r);

     pln->cldr = cldr;
     pln->cldc = cldc;
     pln->solver = slv;

     /* cost of this plan is the sum of its children's costs */
     X(ops_add)(&cldr->ops, &cldc->ops, &pln->super.super.ops);

     X(tensor_destroy4)(sz2i, vecszi, sz2, sz1);

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cldr);
     X(plan_destroy_internal)(cldc);
     X(tensor_destroy4)(sz2i, vecszi, sz2, sz1);
     return (plan *) 0;
}
|
||||
|
||||
static solver *mksolver(int spltrnk, const int *buddies, size_t nbuddies)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->spltrnk = spltrnk;
|
||||
slv->buddies = buddies;
|
||||
slv->nbuddies = nbuddies;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register one rank >= 2 RDFT2 solver per split choice in the static
   buddies table. */
void X(rdft2_rank_geq2_register)(planner *p)
{
     static const int buddies[] = { 1, 0, -2 };
     size_t which = 0;

     while (which < NELEM(buddies)) {
          REGISTER_SOLVER(p, mksolver(buddies[which], buddies, NELEM(buddies)));
          ++which;
     }

     /* FIXME: Should we try more buddies?  See also dft/rank-geq2. */
}
|
||||
207
fftw-3.3.10/rdft/rank-geq2.c
Normal file
207
fftw-3.3.10/rdft/rank-geq2.c
Normal file
@@ -0,0 +1,207 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* plans for RDFT of rank >= 2 (multidimensional) */
|
||||
|
||||
/* FIXME: this solver cannot strictly be applied to multidimensional
|
||||
DHTs, since the latter are not separable...up to rnk-1 additional
|
||||
post-processing passes may be required. See also:
|
||||
|
||||
R. N. Bracewell, O. Buneman, H. Hao, and J. Villasenor, "Fast
|
||||
two-dimensional Hartley transform," Proc. IEEE 74, 1282-1283 (1986).
|
||||
|
||||
H. Hao and R. N. Bracewell, "A three-dimensional DFT algorithm
|
||||
using the fast Hartley transform," Proc. IEEE 75(2), 264-266 (1987).
|
||||
*/
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int spltrnk;
|
||||
const int *buddies;
|
||||
size_t nbuddies;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
|
||||
plan *cld1, *cld2;
|
||||
const S *solver;
|
||||
} P;
|
||||
|
||||
/* Compute multi-dimensional RDFT by applying the two cld plans
|
||||
(lower-rnk RDFTs). */
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld1, *cld2;
|
||||
|
||||
cld1 = (plan_rdft *) ego->cld1;
|
||||
cld1->apply(ego->cld1, I, O);
|
||||
|
||||
cld2 = (plan_rdft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, O, O);
|
||||
}
|
||||
|
||||
|
||||
/* Propagate the wakefulness state to both child plans (cld1, then
   cld2). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cld1, wakefulness);
     X(plan_awake)(pln->cld2, wakefulness);
}
|
||||
|
||||
/* Destroy both child plans, second child first. */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld2);
     X(plan_destroy_internal)(pln->cld1);
}
|
||||
|
||||
/* Print the plan as "(rdft-rank>=2/<spltrnk>(<cld1>)(<cld2>))". */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const int split = pln->solver->spltrnk;

     p->print(p, "(rdft-rank>=2/%d%(%p%)%(%p%))",
              split, pln->cld1, pln->cld2);
}
|
||||
|
||||
static int picksplit(const S *ego, const tensor *sz, int *rp)
|
||||
{
|
||||
A(sz->rnk > 1); /* cannot split rnk <= 1 */
|
||||
if (!X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies, sz, 1, rp))
|
||||
return 0;
|
||||
*rp += 1; /* convert from dim. index to rank */
|
||||
if (*rp >= sz->rnk) /* split must reduce rank */
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int applicable0(const solver *ego_, const problem *p_, int *rp)
|
||||
{
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
const S *ego = (const S *)ego_;
|
||||
return (1
|
||||
&& FINITE_RNK(p->sz->rnk) && FINITE_RNK(p->vecsz->rnk)
|
||||
&& p->sz->rnk >= 2
|
||||
&& picksplit(ego, p->sz, rp)
|
||||
);
|
||||
}
|
||||
|
||||
/* TODO: revise this. */
|
||||
static int applicable(const solver *ego_, const problem *p_,
|
||||
const planner *plnr, int *rp)
|
||||
{
|
||||
const S *ego = (const S *)ego_;
|
||||
|
||||
if (!applicable0(ego_, p_, rp)) return 0;
|
||||
|
||||
if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0]))
|
||||
return 0;
|
||||
|
||||
if (NO_UGLYP(plnr)) {
|
||||
/* Heuristic: if the vector stride is greater than the transform
|
||||
sz, don't use (prefer to do the vector loop first with a
|
||||
vrank-geq1 plan). */
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
|
||||
if (p->vecsz->rnk > 0 &&
|
||||
X(tensor_min_stride)(p->vecsz) > X(tensor_max_index)(p->sz))
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Build a rank >= 2 RDFT plan: sz is split into sz1 and sz2; the
   first child transforms sz2 from I to O with sz1 folded into the
   vector loop, and the second child transforms sz1 in place on O.
   Returns a null plan when the solver is not applicable or either
   child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };

     const S *slv = (const S *) ego_;
     const problem_rdft *p;
     P *pln;
     plan *cld1 = 0, *cld2 = 0;
     tensor *sz1, *sz2, *vecszi, *sz2i;
     problem *cldp;
     int spltrnk;

     if (!applicable(ego_, p_, plnr, &spltrnk))
          return (plan *) 0;

     p = (const problem_rdft *) p_;
     X(tensor_split)(p->sz, &sz1, spltrnk, &sz2);

     /* the second child works in place on the output array */
     vecszi = X(tensor_copy_inplace)(p->vecsz, INPLACE_OS);
     sz2i = X(tensor_copy_inplace)(sz2, INPLACE_OS);

     /* first child: the kind array is advanced by spltrnk to go with
        sz2 (presumably sz2 holds the dimensions from index spltrnk
        on -- confirm against X(tensor_split)) */
     cldp = X(mkproblem_rdft_d)(X(tensor_copy)(sz2),
                                X(tensor_append)(p->vecsz, sz1),
                                p->I, p->O, p->kind + spltrnk);
     cld1 = X(mkplan_d)(plnr, cldp);
     if (!cld1)
          goto nada;

     /* second child: sz1 in place on O, using the kinds from the start
        of the array */
     cldp = X(mkproblem_rdft_d)(X(tensor_copy_inplace)(sz1, INPLACE_OS),
                                X(tensor_append)(vecszi, sz2i),
                                p->O, p->O, p->kind);
     cld2 = X(mkplan_d)(plnr, cldp);
     if (!cld2)
          goto nada;

     pln = MKPLAN_RDFT(P, &padt, apply);

     pln->cld1 = cld1;
     pln->cld2 = cld2;
     pln->solver = slv;

     /* cost of this plan is the sum of its children's costs */
     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);

     X(tensor_destroy4)(sz2, sz1, vecszi, sz2i);

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cld2);
     X(plan_destroy_internal)(cld1);
     X(tensor_destroy4)(sz2, sz1, vecszi, sz2i);
     return (plan *) 0;
}
|
||||
|
||||
static solver *mksolver(int spltrnk, const int *buddies, size_t nbuddies)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->spltrnk = spltrnk;
|
||||
slv->buddies = buddies;
|
||||
slv->nbuddies = nbuddies;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register one rank >= 2 RDFT solver per split choice in the static
   buddies table. */
void X(rdft_rank_geq2_register)(planner *p)
{
     static const int buddies[] = { 1, 0, -2 };
     size_t which = 0;

     while (which < NELEM(buddies)) {
          REGISTER_SOLVER(p, mksolver(buddies[which], buddies, NELEM(buddies)));
          ++which;
     }

     /* FIXME: Should we try more buddies?  See also dft/rank-geq2. */
}
|
||||
199
fftw-3.3.10/rdft/rank0-rdft2.c
Normal file
199
fftw-3.3.10/rdft/rank0-rdft2.c
Normal file
@@ -0,0 +1,199 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* plans for rank-0 RDFT2 (copy operations, plus setting 0 imag. parts) */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h> /* for memcpy() */
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
INT vl;
|
||||
INT ivs, ovs;
|
||||
plan *cldcpy;
|
||||
} P;
|
||||
|
||||
static int applicable(const problem *p_)
|
||||
{
|
||||
const problem_rdft2 *p = (const problem_rdft2 *) p_;
|
||||
return (1
|
||||
&& p->sz->rnk == 0
|
||||
&& (p->kind == HC2R
|
||||
||
|
||||
(1
|
||||
&& p->kind == R2HC
|
||||
|
||||
&& p->vecsz->rnk <= 1
|
||||
|
||||
&& ((p->r0 != p->cr)
|
||||
||
|
||||
X(rdft2_inplace_strides)(p, RNK_MINFTY)) ))
|
||||
);
|
||||
}
|
||||
|
||||
/* Out-of-place rank-0 R2HC: copy vl real values from r0 (stride ivs)
   to cr (stride ovs) and set the matching ci entries to zero.  The
   main loop is unrolled by four, loading all four inputs before any
   store; the tail loop handles the remaining vl % 4 elements. */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *pln = (const P *) ego_;
     const INT vl = pln->vl;
     const INT ivs = pln->ivs, ovs = pln->ovs;
     INT done;           /* elements finished once the current step ends */

     UNUSED(r1); /* rank-0 has no real odd-index elements */

     for (done = 4; done <= vl; done += 4) {
          R a, b, c, d;

          a = *r0; r0 += ivs;
          b = *r0; r0 += ivs;
          c = *r0; r0 += ivs;
          d = *r0; r0 += ivs;

          *cr = a; cr += ovs;
          *ci = K(0.0); ci += ovs;
          *cr = b; cr += ovs;
          *ci = K(0.0); ci += ovs;
          *cr = c; cr += ovs;
          *ci = K(0.0); ci += ovs;
          *cr = d; cr += ovs;
          *ci = K(0.0); ci += ovs;
     }

     /* `done' is 4 ahead of the number of elements already copied */
     while (done < vl + 4) {
          R a = *r0;

          r0 += ivs;
          *cr = a; cr += ovs;
          *ci = K(0.0); ci += ovs;
          ++done;
     }
}
|
||||
|
||||
/* in-place r2hc rank-0: set imaginary parts of output to 0 */
|
||||
static void apply_r2hc_inplace(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
INT i, vl = ego->vl;
|
||||
INT ovs = ego->ovs;
|
||||
|
||||
UNUSED(r0); UNUSED(r1); UNUSED(cr);
|
||||
|
||||
for (i = 4; i <= vl; i += 4) {
|
||||
*ci = K(0.0); ci += ovs;
|
||||
*ci = K(0.0); ci += ovs;
|
||||
*ci = K(0.0); ci += ovs;
|
||||
*ci = K(0.0); ci += ovs;
|
||||
}
|
||||
for (; i < vl + 4; ++i) {
|
||||
*ci = K(0.0); ci += ovs;
|
||||
}
|
||||
}
|
||||
|
||||
/* a rank-0 HC2R rdft2 problem is just a copy from cr to r0,
|
||||
so we can use a rank-0 rdft plan */
|
||||
static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cldcpy = (plan_rdft *) ego->cldcpy;
|
||||
UNUSED(ci);
|
||||
UNUSED(r1);
|
||||
cldcpy->apply((plan *) cldcpy, cr, r0);
|
||||
}
|
||||
|
||||
/* Forward the wakefulness change to the child copy plan, if there is one. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     plan *child = ((P *) ego_)->cldcpy;

     if (!child)
          return;

     X(plan_awake)(child, wakefulness);
}
|
||||
|
||||
/* Destroy the child copy plan, if there is one. */
static void destroy(plan *ego_)
{
     plan *child = ((P *) ego_)->cldcpy;

     if (!child)
          return;

     X(plan_destroy_internal)(child);
}
|
||||
|
||||
/* Print the plan: the HC2R form shows the child copy plan, the R2HC form
 * (no child) shows the vector length.
 */
static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;

     if (!ego->cldcpy) {
          p->print(p, "(rdft2-r2hc-rank0%v)", ego->vl);
          return;
     }

     p->print(p, "(rdft2-hc2r-rank0%(%p%))", ego->cldcpy);
}
|
||||
|
||||
/* Build a plan for a rank-0 rdft2 problem, or return null if the problem
 * is not applicable or (for HC2R) no child copy plan could be made.
 *
 * HC2R:  plan a rank-0 rdft copy cr -> r0 and inherit its op count.
 * R2HC:  pick the in-place or out-of-place routine and reduce vecsz to a
 *        single (vl, ivs, ovs) triple.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft2_solve), awake, print, destroy
     };
     const problem_rdft2 *p = (const problem_rdft2 *) p_;
     plan *cldcpy = (plan *) 0;
     rdft2apply apply;
     P *pln;

     UNUSED(ego_);

     if (!applicable(p_))
          return (plan *) 0;

     /* applicable() guarantees the kind is either HC2R or R2HC */
     if (p->kind == HC2R) {
          problem *cpyp =
               X(mkproblem_rdft_0_d)(X(tensor_copy)(p->vecsz), p->cr, p->r0);
          cldcpy = X(mkplan_d)(plnr, cpyp);
          if (!cldcpy)
               return (plan *) 0;
          apply = apply_hc2r;
     } else if (p->r0 == p->cr) {
          apply = apply_r2hc_inplace;
     } else {
          apply = apply_r2hc;
     }

     pln = MKPLAN_RDFT2(P, &padt, apply);
     pln->cldcpy = cldcpy;

     if (p->kind == R2HC) {
          X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
          /* vl loads, 2*vl stores */
          X(ops_other)(3 * pln->vl, &pln->super.super.ops);
     } else {
          pln->super.super.ops = cldcpy->ops;
     }

     return &(pln->super.super);
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register the rank-0 rdft2 solver with planner p. */
void X(rdft2_rank0_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
|
||||
381
fftw-3.3.10/rdft/rank0.c
Normal file
381
fftw-3.3.10/rdft/rank0.c
Normal file
@@ -0,0 +1,381 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* plans for rank-0 RDFTs (copy operations) */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h> /* for memcpy() */
|
||||
#endif
|
||||
|
||||
#define MAXRNK 32 /* FIXME: should malloc() */
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
INT vl;
|
||||
int rnk;
|
||||
iodim d[MAXRNK];
|
||||
const char *nam;
|
||||
} P;
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
rdftapply apply;
|
||||
int (*applicable)(const P *pln, const problem_rdft *p);
|
||||
const char *nam;
|
||||
} S;
|
||||
|
||||
/* copy up to MAXRNK dimensions from problem into plan. If a
|
||||
contiguous dimension exists, save its length in pln->vl */
|
||||
static int fill_iodim(P *pln, const problem_rdft *p)
|
||||
{
|
||||
int i;
|
||||
const tensor *vecsz = p->vecsz;
|
||||
|
||||
pln->vl = 1;
|
||||
pln->rnk = 0;
|
||||
for (i = 0; i < vecsz->rnk; ++i) {
|
||||
/* extract contiguous dimensions */
|
||||
if (pln->vl == 1 &&
|
||||
vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1)
|
||||
pln->vl = vecsz->dims[i].n;
|
||||
else if (pln->rnk == MAXRNK)
|
||||
return 0;
|
||||
else
|
||||
pln->d[pln->rnk++] = vecsz->dims[i];
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* generic higher-rank copy routine, calls cpy2d() to do the real work */
|
||||
static void copy(const iodim *d, int rnk, INT vl,
|
||||
R *I, R *O,
|
||||
cpy2d_func cpy2d)
|
||||
{
|
||||
A(rnk >= 2);
|
||||
if (rnk == 2)
|
||||
cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl);
|
||||
else {
|
||||
INT i;
|
||||
for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os)
|
||||
copy(d + 1, rnk - 1, vl, I, O, cpy2d);
|
||||
}
|
||||
}
|
||||
|
||||
/* FIXME: should be more general */
|
||||
static int transposep(const P *pln)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < pln->rnk - 2; ++i)
|
||||
if (pln->d[i].is != pln->d[i].os)
|
||||
return 0;
|
||||
|
||||
return (pln->d[i].n == pln->d[i+1].n &&
|
||||
pln->d[i].is == pln->d[i+1].os &&
|
||||
pln->d[i].os == pln->d[i+1].is);
|
||||
}
|
||||
|
||||
/* generic higher-rank transpose routine, calls transpose2d() to do
|
||||
* the real work */
|
||||
static void transpose(const iodim *d, int rnk, INT vl,
|
||||
R *I,
|
||||
transpose_func transpose2d)
|
||||
{
|
||||
A(rnk >= 2);
|
||||
if (rnk == 2)
|
||||
transpose2d(I, d[0].n, d[0].is, d[0].os, vl);
|
||||
else {
|
||||
INT i;
|
||||
for (i = 0; i < d[0].n; ++i, I += d[0].is)
|
||||
transpose(d + 1, rnk - 1, vl, I, transpose2d);
|
||||
}
|
||||
}
|
||||
|
||||
/**************************************************************/
|
||||
/* rank 0,1,2, out of place, iterative */
|
||||
static void apply_iter(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
|
||||
switch (ego->rnk) {
|
||||
case 0:
|
||||
X(cpy1d)(I, O, ego->vl, 1, 1, 1);
|
||||
break;
|
||||
case 1:
|
||||
X(cpy1d)(I, O,
|
||||
ego->d[0].n, ego->d[0].is, ego->d[0].os,
|
||||
ego->vl);
|
||||
break;
|
||||
default:
|
||||
copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int applicable_iter(const P *pln, const problem_rdft *p)
|
||||
{
|
||||
UNUSED(pln);
|
||||
return (p->I != p->O);
|
||||
}
|
||||
|
||||
/**************************************************************/
|
||||
/* out of place, write contiguous output */
|
||||
static void apply_cpy2dco(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co));
|
||||
}
|
||||
|
||||
static int applicable_cpy2dco(const P *pln, const problem_rdft *p)
|
||||
{
|
||||
int rnk = pln->rnk;
|
||||
return (1
|
||||
&& p->I != p->O
|
||||
&& rnk >= 2
|
||||
|
||||
/* must not duplicate apply_iter */
|
||||
&& (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is)
|
||||
||
|
||||
X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os))
|
||||
);
|
||||
}
|
||||
|
||||
/**************************************************************/
|
||||
/* out of place, tiled, no buffering */
|
||||
static void apply_tiled(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled));
|
||||
}
|
||||
|
||||
static int applicable_tiled(const P *pln, const problem_rdft *p)
|
||||
{
|
||||
return (1
|
||||
&& p->I != p->O
|
||||
&& pln->rnk >= 2
|
||||
|
||||
/* somewhat arbitrary */
|
||||
&& X(compute_tilesz)(pln->vl, 1) > 4
|
||||
);
|
||||
}
|
||||
|
||||
/**************************************************************/
|
||||
/* out of place, tiled, with buffer */
|
||||
static void apply_tiledbuf(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf));
|
||||
}
|
||||
|
||||
#define applicable_tiledbuf applicable_tiled
|
||||
|
||||
/**************************************************************/
|
||||
/* rank 0, out of place, using memcpy */
|
||||
static void apply_memcpy(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
|
||||
A(ego->rnk == 0);
|
||||
memcpy(O, I, ego->vl * sizeof(R));
|
||||
}
|
||||
|
||||
static int applicable_memcpy(const P *pln, const problem_rdft *p)
|
||||
{
|
||||
return (1
|
||||
&& p->I != p->O
|
||||
&& pln->rnk == 0
|
||||
&& pln->vl > 2 /* do not bother memcpy-ing complex numbers */
|
||||
);
|
||||
}
|
||||
|
||||
/**************************************************************/
|
||||
/* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place
|
||||
transposes of vl-tuples ... for large vl it should be more
|
||||
efficient to use memcpy than the tiled stuff). */
|
||||
|
||||
/* Walk the rnk dimensions in d recursively; at the innermost dimension,
 * memcpy cpysz bytes from I to O for each of its n positions.
 */
static void memcpy_loop(size_t cpysz, int rnk, const iodim *d, R *I, R *O)
{
     const INT n = d->n;
     const INT is = d->is;
     const INT os = d->os;
     INT k;

     if (rnk != 1) {
          /* descend into the remaining dimensions */
          for (k = 0; k < n; ++k) {
               memcpy_loop(cpysz, rnk - 1, d + 1, I, O);
               I += is;
               O += os;
          }
          return;
     }

     for (k = 0; k < n; ++k) {
          memcpy(O, I, cpysz);
          I += is;
          O += os;
     }
}
|
||||
|
||||
/* Out-of-place copy of vl-tuples: loop over the stored dimensions and
 * memcpy vl reals at each position.
 */
static void apply_memcpy_loop(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const size_t tuple_bytes = self->vl * sizeof(R);

     memcpy_loop(tuple_bytes, self->rnk, self->d, I, O);
}
|
||||
|
||||
static int applicable_memcpy_loop(const P *pln, const problem_rdft *p)
|
||||
{
|
||||
return (p->I != p->O
|
||||
&& pln->rnk > 0
|
||||
&& pln->vl > 2 /* do not bother memcpy-ing complex numbers */);
|
||||
}
|
||||
|
||||
/**************************************************************/
|
||||
/* rank 2, in place, square transpose, iterative */
|
||||
static void apply_ip_sq(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
UNUSED(O);
|
||||
transpose(ego->d, ego->rnk, ego->vl, I, X(transpose));
|
||||
}
|
||||
|
||||
|
||||
static int applicable_ip_sq(const P *pln, const problem_rdft *p)
|
||||
{
|
||||
return (1
|
||||
&& p->I == p->O
|
||||
&& pln->rnk >= 2
|
||||
&& transposep(pln));
|
||||
}
|
||||
|
||||
/**************************************************************/
|
||||
/* rank 2, in place, square transpose, tiled */
|
||||
static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
UNUSED(O);
|
||||
transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled));
|
||||
}
|
||||
|
||||
static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p)
|
||||
{
|
||||
return (1
|
||||
&& applicable_ip_sq(pln, p)
|
||||
|
||||
/* somewhat arbitrary */
|
||||
&& X(compute_tilesz)(pln->vl, 2) > 4
|
||||
);
|
||||
}
|
||||
|
||||
/**************************************************************/
|
||||
/* rank 2, in place, square transpose, tiled, buffered */
|
||||
static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
UNUSED(O);
|
||||
transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf));
|
||||
}
|
||||
|
||||
#define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled
|
||||
|
||||
/**************************************************************/
|
||||
static int applicable(const S *ego, const problem *p_)
|
||||
{
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
P pln;
|
||||
return (1
|
||||
&& p->sz->rnk == 0
|
||||
&& FINITE_RNK(p->vecsz->rnk)
|
||||
&& fill_iodim(&pln, p)
|
||||
&& ego->applicable(&pln, p)
|
||||
);
|
||||
}
|
||||
|
||||
/* Print the solver name and vl, followed by the length of each stored
 * dimension.
 */
static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     int k = 0;

     p->print(p, "(%s/%D", ego->nam, ego->vl);
     while (k < ego->rnk) {
          p->print(p, "%v", ego->d[k].n);
          ++k;
     }
     p->print(p, ")");
}
|
||||
|
||||
/* Build a rank-0 rdft plan for the variant described by solver ego_, or
 * return null if the variant does not apply.  The op count is one load
 * and one store per element of vecsz.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), X(null_awake), print, X(plan_null_destroy)
     };
     const S *ego = (const S *) ego_;
     const problem_rdft *p = (const problem_rdft *) p_;
     P *pln;
     int filled;

     UNUSED(plnr);

     if (!applicable(ego, p_))
          return (plan *) 0;

     pln = MKPLAN_RDFT(P, &padt, ego->apply);

     /* applicable() already ran this successfully on a scratch plan */
     filled = fill_iodim(pln, p);
     (void) filled; /* UNUSED unless DEBUG */
     A(filled);
     A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */

     pln->nam = ego->nam;

     /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */
     X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops);

     return &(pln->super.super);
}
|
||||
|
||||
|
||||
/* Register one solver per rank-0 copy/transpose variant, in table order. */
void X(rdft_rank0_register)(planner *p)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     static const struct {
          rdftapply apply;
          int (*applicable)(const P *, const problem_rdft *);
          const char *nam;
     } variants[] = {
          { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" },
          { apply_memcpy_loop, applicable_memcpy_loop, "rdft-rank0-memcpy-loop" },
          { apply_iter, applicable_iter, "rdft-rank0-iter-ci" },
          { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" },
          { apply_tiled, applicable_tiled, "rdft-rank0-tiled" },
          { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" },
          { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" },
          { apply_ip_sq_tiled, applicable_ip_sq_tiled, "rdft-rank0-ip-sq-tiled" },
          { apply_ip_sq_tiledbuf, applicable_ip_sq_tiledbuf, "rdft-rank0-ip-sq-tiledbuf" },
     };
     const unsigned nvariants =
          (unsigned) (sizeof(variants) / sizeof(variants[0]));
     unsigned k;

     for (k = 0; k < nvariants; ++k) {
          S *slv = MKSOLVER(S, &sadt);

          slv->apply = variants[k].apply;
          slv->applicable = variants[k].applicable;
          slv->nam = variants[k].nam;
          REGISTER_SOLVER(p, &(slv->super));
     }
}
|
||||
220
fftw-3.3.10/rdft/rdft-dht.c
Normal file
220
fftw-3.3.10/rdft/rdft-dht.c
Normal file
@@ -0,0 +1,220 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* Solve an R2HC/HC2R problem via post/pre processing of a DHT. This
|
||||
is mainly useful because we can use Rader to compute DHTs of prime
|
||||
sizes. It also allows us to express hc2r problems in terms of r2hc
|
||||
(via dht-r2hc), and to do hc2r problems without destroying the input. */
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft super;
|
||||
plan *cld;
|
||||
INT is, os;
|
||||
INT n;
|
||||
} P;
|
||||
|
||||
/* R2HC via DHT: run the child DHT from I to O, then combine each pair
 * O[i], O[n-i] (0 < i < n-i) in place: the lower slot gets the half-sum and
 * the upper slot the half-difference, whose sign depends on FFT_SIGN.
 */
static void apply_r2hc(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     plan_rdft *cld = (plan_rdft *) ego->cld;
     INT n, os;
     INT lo, hi;

     cld->apply((plan *) cld, I, O);

     n = ego->n;
     os = ego->os;

     for (lo = 1, hi = n - 1; lo < hi; ++lo, --hi) {
          R *plo = O + os * lo;
          R *phi = O + os * hi;
          E a = K(0.5) * *plo;
          E b = K(0.5) * *phi;

          *plo = a + b;
#if FFT_SIGN == -1
          *phi = b - a;
#else
          *phi = a - b;
#endif
     }
}
|
||||
|
||||
/* hc2r, destroying input as usual */
|
||||
static void apply_hc2r(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
INT is = ego->is;
|
||||
INT i, n = ego->n;
|
||||
|
||||
for (i = 1; i < n - i; ++i) {
|
||||
E a, b;
|
||||
a = I[is * i];
|
||||
b = I[is * (n - i)];
|
||||
#if FFT_SIGN == -1
|
||||
I[is * i] = a - b;
|
||||
I[is * (n - i)] = a + b;
|
||||
#else
|
||||
I[is * i] = a + b;
|
||||
I[is * (n - i)] = a - b;
|
||||
#endif
|
||||
}
|
||||
|
||||
{
|
||||
plan_rdft *cld = (plan_rdft *) ego->cld;
|
||||
cld->apply((plan *) cld, I, O);
|
||||
}
|
||||
}
|
||||
|
||||
/* hc2r, without destroying input */
|
||||
static void apply_hc2r_save(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
INT is = ego->is, os = ego->os;
|
||||
INT i, n = ego->n;
|
||||
|
||||
O[0] = I[0];
|
||||
for (i = 1; i < n - i; ++i) {
|
||||
E a, b;
|
||||
a = I[is * i];
|
||||
b = I[is * (n - i)];
|
||||
#if FFT_SIGN == -1
|
||||
O[os * i] = a - b;
|
||||
O[os * (n - i)] = a + b;
|
||||
#else
|
||||
O[os * i] = a + b;
|
||||
O[os * (n - i)] = a - b;
|
||||
#endif
|
||||
}
|
||||
if (i == n - i)
|
||||
O[os * i] = I[is * i];
|
||||
|
||||
{
|
||||
plan_rdft *cld = (plan_rdft *) ego->cld;
|
||||
cld->apply((plan *) cld, O, O);
|
||||
}
|
||||
}
|
||||
|
||||
/* Forward the wakefulness change to the child DHT plan. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     plan *child = ((P *) ego_)->cld;

     X(plan_awake)(child, wakefulness);
}
|
||||
|
||||
/* Destroy the child DHT plan. */
static void destroy(plan *ego_)
{
     plan *child = ((P *) ego_)->cld;

     X(plan_destroy_internal)(child);
}
|
||||
|
||||
/* Print the direction (r2hc if the plan uses apply_r2hc, hc2r otherwise),
 * the size, and the child plan.
 */
static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     const char *direction =
          (ego->super.apply == apply_r2hc) ? "r2hc" : "hc2r";

     p->print(p, "(%s-dht-%D%(%p%))", direction, ego->n, ego->cld);
}
|
||||
|
||||
static int applicable0(const solver *ego_, const problem *p_)
|
||||
{
|
||||
const problem_rdft *p = (const problem_rdft *) p_;
|
||||
UNUSED(ego_);
|
||||
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
&& p->vecsz->rnk == 0
|
||||
&& (p->kind[0] == R2HC || p->kind[0] == HC2R)
|
||||
|
||||
/* hack: size-2 DHT etc. are defined as being equivalent
|
||||
to size-2 R2HC in problem.c, so we need this to prevent
|
||||
infinite loops for size 2 in EXHAUSTIVE mode: */
|
||||
&& p->sz->dims[0].n > 2
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable(const solver *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
return (!NO_SLOWP(plnr) && applicable0(ego, p_));
|
||||
}
|
||||
|
||||
/* Build an rdft-via-DHT plan, or return null if the problem is not
 * applicable or no child DHT plan can be made.
 *
 * For R2HC, or HC2R when the input may be destroyed, the child DHT maps
 * I to O.  For HC2R with NO_DESTROY_INPUT the child runs in place on O
 * using the output strides, and apply_hc2r_save does the preprocessing
 * out of place.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };
     const problem_rdft *p;
     problem *cldp;
     plan *cld;
     rdftapply apply;
     INT npairs;
     P *pln;

     if (!applicable(ego_, p_, plnr))
          return (plan *) 0;

     p = (const problem_rdft *) p_;

     if (p->kind[0] == R2HC || !NO_DESTROY_INPUTP(plnr)) {
          cldp = X(mkproblem_rdft_1)(p->sz, p->vecsz, p->I, p->O, DHT);
     } else {
          tensor *sz = X(tensor_copy_inplace)(p->sz, INPLACE_OS);
          cldp = X(mkproblem_rdft_1)(sz, p->vecsz, p->O, p->O, DHT);
          X(tensor_destroy)(sz);
     }

     cld = X(mkplan_d)(plnr, cldp);
     if (!cld)
          return (plan *) 0;

     if (p->kind[0] == R2HC)
          apply = apply_r2hc;
     else if (NO_DESTROY_INPUTP(plnr))
          apply = apply_hc2r_save;
     else
          apply = apply_hc2r;

     pln = MKPLAN_RDFT(P, &padt, apply);
     pln->n = p->sz->dims[0].n;
     pln->is = p->sz->dims[0].is;
     pln->os = p->sz->dims[0].os;
     pln->cld = cld;

     /* cost of the pre/post-processing loop, per (i, n-i) pair */
     npairs = (pln->n - 1) / 2;
     pln->super.super.ops = cld->ops;
     pln->super.super.ops.other += 4 * npairs;
     pln->super.super.ops.add += 2 * npairs;
     if (p->kind[0] == R2HC)
          pln->super.super.ops.mul += 2 * npairs;
     if (pln->super.apply == apply_hc2r_save)
          pln->super.super.ops.other += 2 + (pln->n % 2 ? 0 : 2);

     return &(pln->super.super);
}
|
||||
|
||||
/* constructor */
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register the rdft-via-DHT solver with planner p. */
void X(rdft_dht_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
|
||||
176
fftw-3.3.10/rdft/rdft.h
Normal file
176
fftw-3.3.10/rdft/rdft.h
Normal file
@@ -0,0 +1,176 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __RDFT_H__
|
||||
#define __RDFT_H__
|
||||
|
||||
#include "kernel/ifftw.h"
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* problem.c: */
|
||||
typedef struct {
|
||||
problem super;
|
||||
tensor *sz, *vecsz;
|
||||
R *I, *O;
|
||||
#if defined(STRUCT_HACK_KR)
|
||||
rdft_kind kind[1];
|
||||
#elif defined(STRUCT_HACK_C99)
|
||||
rdft_kind kind[];
|
||||
#else
|
||||
rdft_kind *kind;
|
||||
#endif
|
||||
} problem_rdft;
|
||||
|
||||
void X(rdft_zerotens)(tensor *sz, R *I);
|
||||
problem *X(mkproblem_rdft)(const tensor *sz, const tensor *vecsz,
|
||||
R *I, R *O, const rdft_kind *kind);
|
||||
problem *X(mkproblem_rdft_d)(tensor *sz, tensor *vecsz,
|
||||
R *I, R *O, const rdft_kind *kind);
|
||||
problem *X(mkproblem_rdft_0_d)(tensor *vecsz, R *I, R *O);
|
||||
problem *X(mkproblem_rdft_1)(const tensor *sz, const tensor *vecsz,
|
||||
R *I, R *O, rdft_kind kind);
|
||||
problem *X(mkproblem_rdft_1_d)(tensor *sz, tensor *vecsz,
|
||||
R *I, R *O, rdft_kind kind);
|
||||
|
||||
const char *X(rdft_kind_str)(rdft_kind kind);
|
||||
|
||||
/* solve.c: */
|
||||
void X(rdft_solve)(const plan *ego_, const problem *p_);
|
||||
|
||||
/* plan.c: */
|
||||
typedef void (*rdftapply) (const plan *ego, R *I, R *O);
|
||||
|
||||
typedef struct {
|
||||
plan super;
|
||||
rdftapply apply;
|
||||
} plan_rdft;
|
||||
|
||||
plan *X(mkplan_rdft)(size_t size, const plan_adt *adt, rdftapply apply);
|
||||
|
||||
/* Allocate an rdft plan of concrete type `type` (whose first member must be
   a plan_rdft) and return it as a `type *`.  The expansion is fully
   parenthesized so that postfix operators applied to the macro result act
   on the cast pointer rather than on the raw X(mkplan_rdft) return value. */
#define MKPLAN_RDFT(type, adt, apply) \
  ((type *)X(mkplan_rdft)(sizeof(type), (adt), (apply)))
|
||||
|
||||
/* various solvers */
|
||||
|
||||
solver *X(mksolver_rdft_r2c_direct)(kr2c k, const kr2c_desc *desc);
|
||||
solver *X(mksolver_rdft_r2c_directbuf)(kr2c k, const kr2c_desc *desc);
|
||||
solver *X(mksolver_rdft_r2r_direct)(kr2r k, const kr2r_desc *desc);
|
||||
|
||||
void X(rdft_rank0_register)(planner *p);
|
||||
void X(rdft_vrank3_transpose_register)(planner *p);
|
||||
void X(rdft_rank_geq2_register)(planner *p);
|
||||
void X(rdft_indirect_register)(planner *p);
|
||||
void X(rdft_vrank_geq1_register)(planner *p);
|
||||
void X(rdft_buffered_register)(planner *p);
|
||||
void X(rdft_generic_register)(planner *p);
|
||||
void X(rdft_rader_hc2hc_register)(planner *p);
|
||||
void X(rdft_dht_register)(planner *p);
|
||||
void X(dht_r2hc_register)(planner *p);
|
||||
void X(dht_rader_register)(planner *p);
|
||||
void X(dft_r2hc_register)(planner *p);
|
||||
void X(rdft_nop_register)(planner *p);
|
||||
void X(hc2hc_generic_register)(planner *p);
|
||||
|
||||
/****************************************************************************/
|
||||
/* problem2.c: */
|
||||
/*
|
||||
An RDFT2 problem transforms a 1d real array r[n] with stride is/os
|
||||
to/from an "unpacked" complex array {rio,iio}[n/2 + 1] with stride
|
||||
os/is. R0 points to the first even element of the real array.
|
||||
R1 points to the first odd element of the real array.
|
||||
|
||||
Strides on the real side of the transform express distances
|
||||
between consecutive elements of the same array (even or odd).
|
||||
E.g., for a contiguous input
|
||||
|
||||
R0 R1 R2 R3 ...
|
||||
|
||||
the input stride would be 2, not 1. This convention is necessary
|
||||
for hc2c codelets to work, since they transpose even/odd with
|
||||
real/imag.
|
||||
|
||||
Multidimensional transforms use complex DFTs for the
|
||||
noncontiguous dimensions. vecsz has the usual interpretation.
|
||||
*/
|
||||
typedef struct {
|
||||
problem super;
|
||||
tensor *sz;
|
||||
tensor *vecsz;
|
||||
R *r0, *r1;
|
||||
R *cr, *ci;
|
||||
rdft_kind kind; /* assert(kind < DHT) */
|
||||
} problem_rdft2;
|
||||
|
||||
problem *X(mkproblem_rdft2)(const tensor *sz, const tensor *vecsz,
|
||||
R *r0, R *r1, R *cr, R *ci, rdft_kind kind);
|
||||
problem *X(mkproblem_rdft2_d)(tensor *sz, tensor *vecsz,
|
||||
R *r0, R *r1, R *cr, R *ci, rdft_kind kind);
|
||||
problem *X(mkproblem_rdft2_d_3pointers)(tensor *sz, tensor *vecsz,
|
||||
R *r, R *cr, R *ci, rdft_kind kind);
|
||||
int X(rdft2_inplace_strides)(const problem_rdft2 *p, int vdim);
|
||||
INT X(rdft2_tensor_max_index)(const tensor *sz, rdft_kind k);
|
||||
void X(rdft2_strides)(rdft_kind kind, const iodim *d, INT *rs, INT *cs);
|
||||
INT X(rdft2_complex_n)(INT real_n, rdft_kind kind);
|
||||
|
||||
/* verify.c: */
|
||||
void X(rdft2_verify)(plan *pln, const problem_rdft2 *p, int rounds);
|
||||
|
||||
/* solve.c: */
|
||||
void X(rdft2_solve)(const plan *ego_, const problem *p_);
|
||||
|
||||
/* plan.c: */
|
||||
typedef void (*rdft2apply) (const plan *ego, R *r0, R *r1, R *cr, R *ci);
|
||||
|
||||
typedef struct {
|
||||
plan super;
|
||||
rdft2apply apply;
|
||||
} plan_rdft2;
|
||||
|
||||
plan *X(mkplan_rdft2)(size_t size, const plan_adt *adt, rdft2apply apply);
|
||||
|
||||
/* Allocate an rdft2 plan of concrete type `type` (whose first member must
   be a plan_rdft2) and return it as a `type *`.  The expansion is fully
   parenthesized so that postfix operators applied to the macro result act
   on the cast pointer rather than on the raw X(mkplan_rdft2) return value. */
#define MKPLAN_RDFT2(type, adt, apply) \
  ((type *)X(mkplan_rdft2)(sizeof(type), (adt), (apply)))
|
||||
|
||||
/* various solvers */
|
||||
|
||||
solver *X(mksolver_rdft2_direct)(kr2c k, const kr2c_desc *desc);
|
||||
|
||||
void X(rdft2_vrank_geq1_register)(planner *p);
|
||||
void X(rdft2_buffered_register)(planner *p);
|
||||
void X(rdft2_rdft_register)(planner *p);
|
||||
void X(rdft2_nop_register)(planner *p);
|
||||
void X(rdft2_rank0_register)(planner *p);
|
||||
void X(rdft2_rank_geq2_register)(planner *p);
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* configurations */
|
||||
void X(rdft_conf_standard)(planner *p);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* __RDFT_H__ */
|
||||
64
fftw-3.3.10/rdft/rdft2-inplace-strides.c
Normal file
64
fftw-3.3.10/rdft/rdft2-inplace-strides.c
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* Check if the vecsz/sz strides are consistent with the problem
|
||||
being in-place for vecsz.dim[vdim], or for all dimensions
|
||||
if vdim == RNK_MINFTY. We can't just use tensor_inplace_strides
|
||||
because rdft transforms have the unfortunate property of
|
||||
differing input and output sizes. This routine is not
|
||||
exhaustive; we only return 1 for the most common case. */
|
||||
int X(rdft2_inplace_strides)(const problem_rdft2 *p, int vdim)
|
||||
{
|
||||
INT N, Nc;
|
||||
INT rs, cs;
|
||||
int i;
|
||||
|
||||
for (i = 0; i + 1 < p->sz->rnk; ++i)
|
||||
if (p->sz->dims[i].is != p->sz->dims[i].os)
|
||||
return 0;
|
||||
|
||||
if (!FINITE_RNK(p->vecsz->rnk) || p->vecsz->rnk == 0)
|
||||
return 1;
|
||||
if (!FINITE_RNK(vdim)) { /* check all vector dimensions */
|
||||
for (vdim = 0; vdim < p->vecsz->rnk; ++vdim)
|
||||
if (!X(rdft2_inplace_strides)(p, vdim))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
A(vdim < p->vecsz->rnk);
|
||||
if (p->sz->rnk == 0)
|
||||
return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os);
|
||||
|
||||
N = X(tensor_sz)(p->sz);
|
||||
Nc = (N / p->sz->dims[p->sz->rnk-1].n) *
|
||||
(p->sz->dims[p->sz->rnk-1].n/2 + 1);
|
||||
X(rdft2_strides)(p->kind, p->sz->dims + p->sz->rnk - 1, &rs, &cs);
|
||||
|
||||
/* the factor of 2 comes from the fact that RS is the stride
|
||||
of p->r0 and p->r1, which is twice as large as the strides
|
||||
in the r2r case */
|
||||
return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os
|
||||
&& (X(iabs)(2 * p->vecsz->dims[vdim].os)
|
||||
>= X(imax)(2 * Nc * X(iabs)(cs), N * X(iabs)(rs))));
|
||||
}
|
||||
328
fftw-3.3.10/rdft/rdft2-rdft.c
Normal file
328
fftw-3.3.10/rdft/rdft2-rdft.c
Normal file
@@ -0,0 +1,328 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_rdft2 super;
|
||||
|
||||
plan *cld, *cldrest;
|
||||
INT n, vl, nbuf, bufdist;
|
||||
INT cs, ivs, ovs;
|
||||
} P;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
/* FIXME: have alternate copy functions that push a vector loop inside
|
||||
the n loops? */
|
||||
|
||||
/* copy halfcomplex array r (contiguous) to complex (strided) array rio/iio. */
|
||||
static void hc2c(INT n, R *r, R *rio, R *iio, INT os)
|
||||
{
|
||||
INT i;
|
||||
|
||||
rio[0] = r[0];
|
||||
iio[0] = 0;
|
||||
|
||||
for (i = 1; i + i < n; ++i) {
|
||||
rio[i * os] = r[i];
|
||||
iio[i * os] = r[n - i];
|
||||
}
|
||||
|
||||
if (i + i == n) { /* store the Nyquist frequency */
|
||||
rio[i * os] = r[i];
|
||||
iio[i * os] = K(0.0);
|
||||
}
|
||||
}
|
||||
|
||||
/* reverse of hc2c */
|
||||
static void c2hc(INT n, R *rio, R *iio, INT is, R *r)
|
||||
{
|
||||
INT i;
|
||||
|
||||
r[0] = rio[0];
|
||||
|
||||
for (i = 1; i + i < n; ++i) {
|
||||
r[i] = rio[i * is];
|
||||
r[n - i] = iio[i * is];
|
||||
}
|
||||
|
||||
if (i + i == n) /* store the Nyquist frequency */
|
||||
r[i] = rio[i * is];
|
||||
}
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
/* R2HC executor: for every full batch of nbuf transforms, run the rdft
   child plan from r0 into a temporary buffer, then scatter each buffered
   halfcomplex result into cr/ci with hc2c.  Whatever is left over is
   handed to the cldrest plan. */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft *batch = (plan_rdft *) ego->cld;
     plan_rdft2 *rest = (plan_rdft2 *) ego->cldrest;
     const INT vl = ego->vl, nbuf = ego->nbuf, bufdist = ego->bufdist;
     const INT n = ego->n, cs = ego->cs;
     const INT ovs = ego->ovs;
     const INT in_step = ego->ivs * nbuf;   /* real-pointer advance per batch */
     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
     INT done, b;

     for (done = nbuf; done <= vl; done += nbuf) {
          /* transform one batch into bufs */
          batch->apply((plan *) batch, r0, bufs);
          r0 += in_step;
          r1 += in_step;

          /* unpack every buffer into the complex output */
          for (b = 0; b < nbuf; ++b) {
               hc2c(n, bufs + b * bufdist, cr, ci, cs);
               cr += ovs;
               ci += ovs;
          }
     }

     X(ifree)(bufs);

     /* remaining transforms, if any */
     rest->apply((plan *) rest, r0, r1, cr, ci);
}
|
||||
|
||||
/* HC2R executor: for every full batch of nbuf transforms, gather the
   complex input cr/ci into halfcomplex buffers with c2hc, then run the
   rdft child plan from the buffers into r0.  Whatever is left over is
   handed to the cldrest plan. */
static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft *batch = (plan_rdft *) ego->cld;
     plan_rdft2 *rest = (plan_rdft2 *) ego->cldrest;
     const INT vl = ego->vl, nbuf = ego->nbuf, bufdist = ego->bufdist;
     const INT n = ego->n, cs = ego->cs;
     const INT ivs = ego->ivs;
     const INT out_step = ego->ovs * nbuf;  /* real-pointer advance per batch */
     R *bufs = (R *)MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
     INT done, b;

     for (done = nbuf; done <= vl; done += nbuf) {
          /* pack one batch of complex inputs into bufs */
          for (b = 0; b < nbuf; ++b) {
               c2hc(n, cr, ci, cs, bufs + b * bufdist);
               cr += ivs;
               ci += ivs;
          }

          /* transform the batch back into the real array */
          batch->apply((plan *) batch, bufs, r0);
          r0 += out_step;
          r1 += out_step;
     }

     X(ifree)(bufs);

     /* remaining transforms, if any */
     rest->apply((plan *) rest, r0, r1, cr, ci);
}
|
||||
|
||||
/* Propagate the wakefulness state to both child plans (cld, then cldrest). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[2];
     int k;

     kids[0] = self->cld;
     kids[1] = self->cldrest;

     for (k = 0; k < 2; ++k)
          X(plan_awake)(kids[k], wakefulness);
}
|
||||
|
||||
/* Release both child plans; cldrest is destroyed before cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *kids[2];
     int k;

     kids[0] = self->cldrest;
     kids[1] = self->cld;

     for (k = 0; k < 2; ++k)
          X(plan_destroy_internal)(kids[k]);
}
|
||||
|
||||
/* Print a description of this plan: direction, n, nbuf, vl, the value
   bufdist % n, and both child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     const char *direction =
          (ego->super.apply == apply_r2hc) ? "r2hc" : "hc2r";
     const INT pad = ego->bufdist % ego->n;

     p->print(p, "(rdft2-rdft-%s-%D%v/%D-%D%(%p%)%(%p%))",
              direction,
              ego->n, ego->nbuf,
              ego->vl, pad,
              ego->cld, ego->cldrest);
}
|
||||
|
||||
/* Lower bound on the number of buffers for problem p (transform size n,
   vector length vl).  Returns 1 when the problem is out of place or its
   strides are reported in-place-compatible by rdft2_inplace_strides;
   otherwise either a bound derived from the vector strides, or vl (buffer
   the whole vector). */
static INT min_nbuf(const problem_rdft2 *p, INT n, INT vl)
{
     INT is, os, ivs, ovs;

     if (p->r0 != p->cr)
          return 1;
     if (X(rdft2_inplace_strides)(p, RNK_MINFTY))
          return 1;
     A(p->vecsz->rnk == 1); /* rank 0 and MINFTY are inplace */

     X(rdft2_strides)(p->kind, p->sz->dims, &is, &os);
     X(rdft2_strides)(p->kind, p->vecsz->dims, &ivs, &ovs);

     /* Special-case one potentially common layout: "contiguous" real and
        complex arrays, which overlap because of the differing sizes.
        Anything that does not match punts and buffers the whole vector. */
     if (n * X(iabs)(is) > X(iabs)(ivs))
          return vl;
     if ((n/2 + 1) * X(iabs)(os) > X(iabs)(ovs))
          return vl;
     /* NOTE(review): assuming X(iabs) is never negative, one of the two
        differences below is always <= it, so this test cannot reject
        anything.  Confirm whether |cr - ci| <= |os| was intended; the
        behavior is deliberately left unchanged here. */
     if ((p->cr - p->ci) > X(iabs)(os) && (p->ci - p->cr) > X(iabs)(os))
          return vl;
     if (ivs <= 0 || ovs <= 0)
          return vl;

     {
          INT vsmin = X(imin)(ivs, ovs);
          INT vsmax = X(imax)(ivs, ovs);

          /* ceiling of (vsmax - vsmin) * vl / vsmin */
          return(((vsmax - vsmin) * vl + vsmin - 1) / vsmin);
     }
}
|
||||
|
||||
static int applicable0(const problem *p_, const S *ego, const planner *plnr)
|
||||
{
|
||||
const problem_rdft2 *p = (const problem_rdft2 *) p_;
|
||||
UNUSED(ego);
|
||||
return(1
|
||||
&& p->vecsz->rnk <= 1
|
||||
&& p->sz->rnk == 1
|
||||
|
||||
/* FIXME: does it make sense to do R2HCII ? */
|
||||
&& (p->kind == R2HC || p->kind == HC2R)
|
||||
|
||||
/* real strides must allow for reduction to rdft */
|
||||
&& (2 * (p->r1 - p->r0) ==
|
||||
(((p->kind == R2HC) ? p->sz->dims[0].is : p->sz->dims[0].os)))
|
||||
|
||||
&& !(X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
|
||||
);
|
||||
}
|
||||
|
||||
static int applicable(const problem *p_, const S *ego, const planner *plnr)
|
||||
{
|
||||
const problem_rdft2 *p;
|
||||
|
||||
if (NO_BUFFERINGP(plnr)) return 0;
|
||||
|
||||
if (!applicable0(p_, ego, plnr)) return 0;
|
||||
|
||||
p = (const problem_rdft2 *) p_;
|
||||
if (NO_UGLYP(plnr)) {
|
||||
if (p->r0 != p->cr) return 0;
|
||||
if (X(toobig)(p->sz->dims[0].n)) return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Build a plan that solves an rdft2 problem by batches: nbuf transforms
   at a time go through an rdft child plan (cld) and a halfcomplex buffer,
   and the vl % nbuf leftover transforms go to a second rdft2 child plan
   (cldrest).  Returns 0 when the solver does not apply or a child plan
   cannot be created. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft2_solve), awake, print, destroy
     };

     const S *ego = (const S *) ego_;
     const problem_rdft2 *p = (const problem_rdft2 *) p_;
     P *pln;
     plan *cld = (plan *) 0;
     plan *cldrest = (plan *) 0;
     R *bufs = (R *) 0;
     INT nbuf = 0, bufdist, n, vl;
     INT ivs, ovs, rs, id, od;
     INT roff, coff;   /* offsets of the leftover part in real/complex arrays */

     if (!applicable(p_, ego, plnr))
          goto nada;

     n = p->sz->dims[0].n;
     X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);

     nbuf = X(imax)(X(nbuf)(n, vl, 0), min_nbuf(p, n, vl));
     bufdist = X(bufdist)(n, vl);
     A(nbuf > 0);

     /* initial allocation for the purpose of planning */
     bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);

     /* input/output distance covered by all the full batches */
     id = ivs * (nbuf * (vl / nbuf));
     od = ovs * (nbuf * (vl / nbuf));

     if (p->kind == R2HC) {
          /* real data is the input here, complex data the output */
          roff = id;
          coff = od;

          cld = X(mkplan_f_d)(
               plnr,
               X(mkproblem_rdft_d)(
                    X(mktensor_1d)(n, p->sz->dims[0].is/2, 1),
                    X(mktensor_1d)(nbuf, ivs, bufdist),
                    TAINT(p->r0, ivs * nbuf), bufs, &p->kind),
               0, 0, (p->r0 == p->cr) ? NO_DESTROY_INPUT : 0);
     } else {
          A(p->kind == HC2R);

          /* complex data is the input here, real data the output */
          roff = od;
          coff = id;

          cld = X(mkplan_f_d)(
               plnr,
               X(mkproblem_rdft_d)(
                    X(mktensor_1d)(n, 1, p->sz->dims[0].os/2),
                    X(mktensor_1d)(nbuf, bufdist, ovs),
                    bufs, TAINT(p->r0, ovs * nbuf), &p->kind),
               0, 0, NO_DESTROY_INPUT); /* always ok to destroy bufs */
     }
     if (!cld)
          goto nada;

     /* the planning buffer is no longer needed; the apply functions
        allocate their own */
     X(ifree)(bufs);
     bufs = 0;

     cldrest = X(mkplan_d)(plnr,
                           X(mkproblem_rdft2_d)(
                                X(tensor_copy)(p->sz),
                                X(mktensor_1d)(vl % nbuf, ivs, ovs),
                                p->r0 + roff, p->r1 + roff,
                                p->cr + coff, p->ci + coff,
                                p->kind));
     if (!cldrest)
          goto nada;

     if (p->kind == R2HC)
          pln = MKPLAN_RDFT2(P, &padt, apply_r2hc);
     else
          pln = MKPLAN_RDFT2(P, &padt, apply_hc2r);

     pln->cld = cld;
     pln->cldrest = cldrest;
     pln->n = n;
     pln->vl = vl;
     pln->ivs = ivs;
     pln->ovs = ovs;
     X(rdft2_strides)(p->kind, &p->sz->dims[0], &rs, &pln->cs);
     pln->nbuf = nbuf;
     pln->bufdist = bufdist;

     /* cost: vl / nbuf runs of cld plus cldrest, plus the copy traffic */
     X(ops_madd)(vl / nbuf, &cld->ops, &cldrest->ops,
                 &pln->super.super.ops);
     pln->super.super.ops.other += (p->kind == R2HC ? (n + 2) : n) * vl;

     return &(pln->super.super);

 nada:
     X(ifree0)(bufs);
     X(plan_destroy_internal)(cldrest);
     X(plan_destroy_internal)(cld);
     return (plan *) 0;
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
/* Register this file's solver with planner p. */
void X(rdft2_rdft_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}
|
||||
38
fftw-3.3.10/rdft/rdft2-strides.c
Normal file
38
fftw-3.3.10/rdft/rdft2-strides.c
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* Deal with annoyance because the tensor (is,os) applies to
|
||||
(r,rio/iio) for R2HC and vice-versa for HC2R. We originally had
|
||||
(is,os) always apply to (r,rio/iio), but this causes other
|
||||
headaches with the tensor functions. */
|
||||
void X(rdft2_strides)(rdft_kind kind, const iodim *d, INT *rs, INT *cs)
|
||||
{
|
||||
if (kind == R2HC) {
|
||||
*rs = d->is;
|
||||
*cs = d->os;
|
||||
}
|
||||
else {
|
||||
A(kind == HC2R);
|
||||
*rs = d->os;
|
||||
*cs = d->is;
|
||||
}
|
||||
}
|
||||
43
fftw-3.3.10/rdft/rdft2-tensor-max-index.c
Normal file
43
fftw-3.3.10/rdft/rdft2-tensor-max-index.c
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
/* like X(tensor_max_index), but takes into account the special n/2+1
|
||||
final dimension for the complex output/input of an R2HC/HC2R transform. */
|
||||
INT X(rdft2_tensor_max_index)(const tensor *sz, rdft_kind k)
|
||||
{
|
||||
int i;
|
||||
INT n = 0;
|
||||
|
||||
A(FINITE_RNK(sz->rnk));
|
||||
for (i = 0; i + 1 < sz->rnk; ++i) {
|
||||
const iodim *p = sz->dims + i;
|
||||
n += (p->n - 1) * X(imax)(X(iabs)(p->is), X(iabs)(p->os));
|
||||
}
|
||||
if (i < sz->rnk) {
|
||||
const iodim *p = sz->dims + i;
|
||||
INT is, os;
|
||||
X(rdft2_strides)(k, p, &is, &os);
|
||||
n += X(imax)((p->n - 1) * X(iabs)(is), (p->n/2) * X(iabs)(os));
|
||||
}
|
||||
return n;
|
||||
}
|
||||
7
fftw-3.3.10/rdft/scalar/Makefile.am
Normal file
7
fftw-3.3.10/rdft/scalar/Makefile.am
Normal file
@@ -0,0 +1,7 @@
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
SUBDIRS = r2cf r2cb r2r
|
||||
noinst_LTLIBRARIES = librdft_scalar.la
|
||||
|
||||
librdft_scalar_la_SOURCES = hb.h r2cb.h r2cbIII.h hf.h hfb.c r2c.c \
|
||||
r2cf.h r2cfII.h r2r.c r2r.h hc2c.c hc2cf.h hc2cb.h
|
||||
|
||||
766
fftw-3.3.10/rdft/scalar/Makefile.in
Normal file
766
fftw-3.3.10/rdft/scalar/Makefile.in
Normal file
@@ -0,0 +1,766 @@
|
||||
# Makefile.in generated by automake 1.16.3 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
|
||||
VPATH = @srcdir@
|
||||
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
subdir = rdft/scalar
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
|
||||
$(top_srcdir)/m4/acx_pthread.m4 \
|
||||
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
|
||||
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
|
||||
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_version.m4 \
|
||||
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
|
||||
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
|
||||
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
LTLIBRARIES = $(noinst_LTLIBRARIES)
|
||||
librdft_scalar_la_LIBADD =
|
||||
am_librdft_scalar_la_OBJECTS = hfb.lo r2c.lo r2r.lo hc2c.lo
|
||||
librdft_scalar_la_OBJECTS = $(am_librdft_scalar_la_OBJECTS)
|
||||
AM_V_lt = $(am__v_lt_@AM_V@)
|
||||
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
|
||||
am__v_lt_0 = --silent
|
||||
am__v_lt_1 =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
|
||||
depcomp = $(SHELL) $(top_srcdir)/depcomp
|
||||
am__maybe_remake_depfiles = depfiles
|
||||
am__depfiles_remade = ./$(DEPDIR)/hc2c.Plo ./$(DEPDIR)/hfb.Plo \
|
||||
./$(DEPDIR)/r2c.Plo ./$(DEPDIR)/r2r.Plo
|
||||
am__mv = mv -f
|
||||
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
|
||||
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
|
||||
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
|
||||
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
|
||||
$(AM_CFLAGS) $(CFLAGS)
|
||||
AM_V_CC = $(am__v_CC_@AM_V@)
|
||||
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
|
||||
am__v_CC_0 = @echo " CC " $@;
|
||||
am__v_CC_1 =
|
||||
CCLD = $(CC)
|
||||
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
$(AM_LDFLAGS) $(LDFLAGS) -o $@
|
||||
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
|
||||
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
|
||||
am__v_CCLD_0 = @echo " CCLD " $@;
|
||||
am__v_CCLD_1 =
|
||||
SOURCES = $(librdft_scalar_la_SOURCES)
|
||||
DIST_SOURCES = $(librdft_scalar_la_SOURCES)
|
||||
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
|
||||
ctags-recursive dvi-recursive html-recursive info-recursive \
|
||||
install-data-recursive install-dvi-recursive \
|
||||
install-exec-recursive install-html-recursive \
|
||||
install-info-recursive install-pdf-recursive \
|
||||
install-ps-recursive install-recursive installcheck-recursive \
|
||||
installdirs-recursive pdf-recursive ps-recursive \
|
||||
tags-recursive uninstall-recursive
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
|
||||
distclean-recursive maintainer-clean-recursive
|
||||
am__recursive_targets = \
|
||||
$(RECURSIVE_TARGETS) \
|
||||
$(RECURSIVE_CLEAN_TARGETS) \
|
||||
$(am__extra_recursive_targets)
|
||||
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
|
||||
distdir distdir-am
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
# Read a list of newline-separated strings from the standard input,
|
||||
# and print each of them once, without duplicates. Input order is
|
||||
# *not* preserved.
|
||||
am__uniquify_input = $(AWK) '\
|
||||
BEGIN { nonempty = 0; } \
|
||||
{ items[$$0] = 1; nonempty = 1; } \
|
||||
END { if (nonempty) { for (i in items) print i; }; } \
|
||||
'
|
||||
# Make sure the list of sources is unique. This is necessary because,
|
||||
# e.g., the same source file might be shared among _SOURCES variables
|
||||
# for different programs/libraries.
|
||||
am__define_uniq_tagged_files = \
|
||||
list='$(am__tagged_files)'; \
|
||||
unique=`for i in $$list; do \
|
||||
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
|
||||
done | $(am__uniquify_input)`
|
||||
ETAGS = etags
|
||||
CTAGS = ctags
|
||||
DIST_SUBDIRS = $(SUBDIRS)
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
am__relativize = \
|
||||
dir0=`pwd`; \
|
||||
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
|
||||
sed_rest='s,^[^/]*/*,,'; \
|
||||
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
|
||||
sed_butlast='s,/*[^/]*$$,,'; \
|
||||
while test -n "$$dir1"; do \
|
||||
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
|
||||
if test "$$first" != "."; then \
|
||||
if test "$$first" = ".."; then \
|
||||
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
|
||||
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
|
||||
else \
|
||||
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
|
||||
if test "$$first2" = "$$first"; then \
|
||||
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
|
||||
else \
|
||||
dir2="../$$dir2"; \
|
||||
fi; \
|
||||
dir0="$$dir0"/"$$first"; \
|
||||
fi; \
|
||||
fi; \
|
||||
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
|
||||
done; \
|
||||
reldir="$$dir2"
|
||||
ACLOCAL = @ACLOCAL@
|
||||
ALLOCA = @ALLOCA@
|
||||
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AS = @AS@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AVX2_CFLAGS = @AVX2_CFLAGS@
|
||||
AVX512_CFLAGS = @AVX512_CFLAGS@
|
||||
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
|
||||
AVX_CFLAGS = @AVX_CFLAGS@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CHECK_PL_OPTS = @CHECK_PL_OPTS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
|
||||
C_MPI_FINT = @C_MPI_FINT@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
F77 = @F77@
|
||||
FFLAGS = @FFLAGS@
|
||||
FGREP = @FGREP@
|
||||
FLIBS = @FLIBS@
|
||||
GREP = @GREP@
|
||||
INDENT = @INDENT@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
KCVI_CFLAGS = @KCVI_CFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBQUADMATH = @LIBQUADMATH@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
|
||||
MAINT = @MAINT@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
MPICC = @MPICC@
|
||||
MPILIBS = @MPILIBS@
|
||||
MPIRUN = @MPIRUN@
|
||||
NEON_CFLAGS = @NEON_CFLAGS@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OCAMLBUILD = @OCAMLBUILD@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
POW_LIB = @POW_LIB@
|
||||
PRECISION = @PRECISION@
|
||||
PREC_SUFFIX = @PREC_SUFFIX@
|
||||
PTHREAD_CC = @PTHREAD_CC@
|
||||
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
|
||||
PTHREAD_LIBS = @PTHREAD_LIBS@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
|
||||
SHELL = @SHELL@
|
||||
SSE2_CFLAGS = @SSE2_CFLAGS@
|
||||
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
|
||||
STRIP = @STRIP@
|
||||
THREADLIBS = @THREADLIBS@
|
||||
VERSION = @VERSION@
|
||||
VSX_CFLAGS = @VSX_CFLAGS@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
ac_ct_F77 = @ac_ct_F77@
|
||||
acx_pthread_config = @acx_pthread_config@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
runstatedir = @runstatedir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
SUBDIRS = r2cf r2cb r2r
|
||||
noinst_LTLIBRARIES = librdft_scalar.la
|
||||
librdft_scalar_la_SOURCES = hb.h r2cb.h r2cbIII.h hf.h hfb.c r2c.c \
|
||||
r2cf.h r2cfII.h r2r.c r2r.h hc2c.c hc2cf.h hc2cb.h
|
||||
|
||||
all: all-recursive
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .lo .o .obj
|
||||
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/scalar/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu rdft/scalar/Makefile
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
|
||||
esac;
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
clean-noinstLTLIBRARIES:
|
||||
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
|
||||
@list='$(noinst_LTLIBRARIES)'; \
|
||||
locs=`for p in $$list; do echo $$p; done | \
|
||||
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
|
||||
sort -u`; \
|
||||
test -z "$$locs" || { \
|
||||
echo rm -f $${locs}; \
|
||||
rm -f $${locs}; \
|
||||
}
|
||||
|
||||
librdft_scalar.la: $(librdft_scalar_la_OBJECTS) $(librdft_scalar_la_DEPENDENCIES) $(EXTRA_librdft_scalar_la_DEPENDENCIES)
|
||||
$(AM_V_CCLD)$(LINK) $(librdft_scalar_la_OBJECTS) $(librdft_scalar_la_LIBADD) $(LIBS)
|
||||
|
||||
mostlyclean-compile:
|
||||
-rm -f *.$(OBJEXT)
|
||||
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2c.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hfb.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2c.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2r.Plo@am__quote@ # am--include-marker
|
||||
|
||||
$(am__depfiles_remade):
|
||||
@$(MKDIR_P) $(@D)
|
||||
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
|
||||
|
||||
am--depfiles: $(am__depfiles_remade)
|
||||
|
||||
.c.o:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
|
||||
|
||||
.c.obj:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
|
||||
.c.lo:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
|
||||
|
||||
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
|
||||
# This directory's subdirectories are mostly independent; you can cd
|
||||
# into them and run 'make' without going through this Makefile.
|
||||
# To change the values of 'make' variables: instead of editing Makefiles,
|
||||
# (1) if the variable is set in 'config.status', edit 'config.status'
|
||||
# (which will cause the Makefiles to be regenerated when you run 'make');
|
||||
# (2) otherwise, pass the desired values on the 'make' command line.
|
||||
# Recursive driver shared by every "<target>-recursive" goal.
#  - "target" is the goal name with "-recursive" stripped.
#  - The directory list is $(DIST_SUBDIRS) for distclean-* and
#    maintainer-clean-* goals, and $(SUBDIRS) for everything else.
#  - For each entry a sub-make is run inside that directory; the entry "."
#    runs "<target>-am" instead of "<target>".
#  - If "." never appeared in the list, "<target>-am" is run here after the
#    loop, and a failure of that step exits immediately.
#  - When $(am__make_keepgoing) succeeds (presumably "make -k"), a failing
#    sub-make only sets fail=yes and the loop continues; the final
#    'test -z "$$fail"' then reports the failure.  Otherwise the first
#    failing sub-make exits with status 1.
$(am__recursive_targets):
|
||||
@fail=; \
|
||||
if $(am__make_keepgoing); then \
|
||||
failcom='fail=yes'; \
|
||||
else \
|
||||
failcom='exit 1'; \
|
||||
fi; \
|
||||
dot_seen=no; \
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
case "$@" in \
|
||||
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
|
||||
*) list='$(SUBDIRS)' ;; \
|
||||
esac; \
|
||||
for subdir in $$list; do \
|
||||
echo "Making $$target in $$subdir"; \
|
||||
if test "$$subdir" = "."; then \
|
||||
dot_seen=yes; \
|
||||
local_target="$$target-am"; \
|
||||
else \
|
||||
local_target="$$target"; \
|
||||
fi; \
|
||||
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|
||||
|| eval $$failcom; \
|
||||
done; \
|
||||
if test "$$dot_seen" = "no"; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
|
||||
fi; test -z "$$fail"
|
||||
|
||||
ID: $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); mkid -fID $$unique
|
||||
tags: tags-recursive
|
||||
TAGS: tags
|
||||
|
||||
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
set x; \
|
||||
here=`pwd`; \
|
||||
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
|
||||
include_option=--etags-include; \
|
||||
empty_fix=.; \
|
||||
else \
|
||||
include_option=--include; \
|
||||
empty_fix=; \
|
||||
fi; \
|
||||
list='$(SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
test ! -f $$subdir/TAGS || \
|
||||
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
|
||||
fi; \
|
||||
done; \
|
||||
$(am__define_uniq_tagged_files); \
|
||||
shift; \
|
||||
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
|
||||
test -n "$$unique" || unique=$$empty_fix; \
|
||||
if test $$# -gt 0; then \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
"$$@" $$unique; \
|
||||
else \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
$$unique; \
|
||||
fi; \
|
||||
fi
|
||||
ctags: ctags-recursive
|
||||
|
||||
CTAGS: ctags
|
||||
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); \
|
||||
test -z "$(CTAGS_ARGS)$$unique" \
|
||||
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
|
||||
$$unique
|
||||
|
||||
GTAGS:
|
||||
here=`$(am__cd) $(top_builddir) && pwd` \
|
||||
&& $(am__cd) $(top_srcdir) \
|
||||
&& gtags -i $(GTAGS_ARGS) "$$here"
|
||||
cscopelist: cscopelist-recursive
|
||||
|
||||
# Appends one line per entry of $(am__tagged_files) to
# $(top_builddir)/cscope.files (the file is opened with ">>", so earlier
# contents are kept).  An entry that exists as a regular file relative to
# the current directory is written as "$(subdir)/<file>"; any other entry is
# written as "<sdir>/<file>", where sdir is $(srcdir) itself when it starts
# with a slash, backslash or drive-letter prefix, and $(subdir)/$(srcdir)
# otherwise.
cscopelist-am: $(am__tagged_files)
|
||||
list='$(am__tagged_files)'; \
|
||||
case "$(srcdir)" in \
|
||||
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
|
||||
*) sdir=$(subdir)/$(srcdir) ;; \
|
||||
esac; \
|
||||
for i in $$list; do \
|
||||
if test -f "$$i"; then \
|
||||
echo "$(subdir)/$$i"; \
|
||||
else \
|
||||
echo "$$sdir/$$i"; \
|
||||
fi; \
|
||||
done >> $(top_builddir)/cscope.files
|
||||
|
||||
distclean-tags:
|
||||
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
|
||||
|
||||
distdir: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) distdir-am
|
||||
|
||||
distdir-am: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
|
||||
if test "$$subdir" = .; then :; else \
|
||||
$(am__make_dryrun) \
|
||||
|| test -d "$(distdir)/$$subdir" \
|
||||
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|
||||
|| exit 1; \
|
||||
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
|
||||
$(am__relativize); \
|
||||
new_distdir=$$reldir; \
|
||||
dir1=$$subdir; dir2="$(top_distdir)"; \
|
||||
$(am__relativize); \
|
||||
new_top_distdir=$$reldir; \
|
||||
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
|
||||
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
|
||||
($(am__cd) $$subdir && \
|
||||
$(MAKE) $(AM_MAKEFLAGS) \
|
||||
top_distdir="$$new_top_distdir" \
|
||||
distdir="$$new_distdir" \
|
||||
am__remove_distdir=: \
|
||||
am__skip_length_check=: \
|
||||
am__skip_mode_fix=: \
|
||||
distdir) \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: check-recursive
|
||||
all-am: Makefile $(LTLIBRARIES)
|
||||
installdirs: installdirs-recursive
|
||||
installdirs-am:
|
||||
install: install-recursive
|
||||
install-exec: install-exec-recursive
|
||||
install-data: install-data-recursive
|
||||
uninstall: uninstall-recursive
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-recursive
|
||||
# Re-invokes make for the "install" goal with INSTALL_PROGRAM and
# install_sh_PROGRAM both set to $(INSTALL_STRIP_PROGRAM) and with
# INSTALL_STRIP_FLAG=-s.  When $(STRIP) is non-empty, its value is
# additionally handed down as STRIPPROG through INSTALL_PROGRAM_ENV;
# when it is empty that extra assignment is omitted.
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
clean: clean-recursive
|
||||
|
||||
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
|
||||
mostlyclean-am
|
||||
|
||||
distclean: distclean-recursive
|
||||
-rm -f ./$(DEPDIR)/hc2c.Plo
|
||||
-rm -f ./$(DEPDIR)/hfb.Plo
|
||||
-rm -f ./$(DEPDIR)/r2c.Plo
|
||||
-rm -f ./$(DEPDIR)/r2r.Plo
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-compile distclean-generic \
|
||||
distclean-tags
|
||||
|
||||
dvi: dvi-recursive
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-recursive
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-recursive
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am:
|
||||
|
||||
install-dvi: install-dvi-recursive
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am:
|
||||
|
||||
install-html: install-html-recursive
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-recursive
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-recursive
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-recursive
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-recursive
|
||||
-rm -f ./$(DEPDIR)/hc2c.Plo
|
||||
-rm -f ./$(DEPDIR)/hfb.Plo
|
||||
-rm -f ./$(DEPDIR)/r2c.Plo
|
||||
-rm -f ./$(DEPDIR)/r2r.Plo
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic
|
||||
|
||||
mostlyclean: mostlyclean-recursive
|
||||
|
||||
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool
|
||||
|
||||
pdf: pdf-recursive
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-recursive
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am:
|
||||
|
||||
.MAKE: $(am__recursive_targets) install-am install-strip
|
||||
|
||||
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
|
||||
am--depfiles check check-am clean clean-generic clean-libtool \
|
||||
clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
|
||||
distclean-compile distclean-generic distclean-libtool \
|
||||
distclean-tags distdir dvi dvi-am html html-am info info-am \
|
||||
install install-am install-data install-data-am install-dvi \
|
||||
install-dvi-am install-exec install-exec-am install-html \
|
||||
install-html-am install-info install-info-am install-man \
|
||||
install-pdf install-pdf-am install-ps install-ps-am \
|
||||
install-strip installcheck installcheck-am installdirs \
|
||||
installdirs-am maintainer-clean maintainer-clean-generic \
|
||||
mostlyclean mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
|
||||
uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
23
fftw-3.3.10/rdft/scalar/hb.h
Normal file
23
fftw-3.3.10/rdft/scalar/hb.h
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/*
 * In this header GENUS expands to X(rdft_hb_genus); the extern below
 * declares that object with type hc2hc_genus.  The header includes
 * nothing and has no include guard, so X() and hc2hc_genus must already
 * be visible to the including file.
 */
#define GENUS X(rdft_hb_genus)
|
||||
extern const hc2hc_genus GENUS;
|
||||
39
fftw-3.3.10/rdft/scalar/hc2c.c
Normal file
39
fftw-3.3.10/rdft/scalar/hc2c.c
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
static int okp(const R *Rp, const R *Ip, const R *Rm, const R *Im,
|
||||
INT rs, INT mb, INT me, INT ms,
|
||||
const planner *plnr)
|
||||
{
|
||||
UNUSED(Rp); UNUSED(Ip); UNUSED(Rm); UNUSED(Im);
|
||||
UNUSED(rs); UNUSED(mb); UNUSED(me); UNUSED(ms); UNUSED(plnr);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * First genus object: at this point GENUS comes from rdft/scalar/hc2cf.h,
 * where it is X(rdft_hc2cf_genus).  It is initialised with okp and the
 * kind R2HC.
 * NOTE(review): the meaning of the trailing 1 is not visible in this
 * file; check the hc2c_genus declaration.
 */
const hc2c_genus GENUS = { okp, R2HC, 1 };
|
||||
|
||||
/* Drop the first definition of GENUS so that hc2cb.h can define it again. */
#undef GENUS
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * Second genus object: GENUS is now X(rdft_hc2cb_genus) (from hc2cb.h).
 * Same okp predicate, kind HC2R.
 */
const hc2c_genus GENUS = { okp, HC2R, 1 };
|
||||
23
fftw-3.3.10/rdft/scalar/hc2cb.h
Normal file
23
fftw-3.3.10/rdft/scalar/hc2cb.h
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/*
 * In this header GENUS expands to X(rdft_hc2cb_genus); the extern below
 * declares that object with type hc2c_genus.  The header includes
 * nothing and has no include guard, so X() and hc2c_genus must already
 * be visible to the including file.
 */
#define GENUS X(rdft_hc2cb_genus)
|
||||
extern const hc2c_genus GENUS;
|
||||
23
fftw-3.3.10/rdft/scalar/hc2cf.h
Normal file
23
fftw-3.3.10/rdft/scalar/hc2cf.h
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/*
 * In this header GENUS expands to X(rdft_hc2cf_genus); the extern below
 * declares that object with type hc2c_genus.  The header includes
 * nothing and has no include guard, so X() and hc2c_genus must already
 * be visible to the including file.
 */
#define GENUS X(rdft_hc2cf_genus)
|
||||
extern const hc2c_genus GENUS;
|
||||
23
fftw-3.3.10/rdft/scalar/hf.h
Normal file
23
fftw-3.3.10/rdft/scalar/hf.h
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/*
 * In this header GENUS expands to X(rdft_hf_genus); the extern below
 * declares that object with type hc2hc_genus.  The header includes
 * nothing and has no include guard, so X() and hc2hc_genus must already
 * be visible to the including file.
 */
#define GENUS X(rdft_hf_genus)
|
||||
extern const hc2hc_genus GENUS;
|
||||
29
fftw-3.3.10/rdft/scalar/hfb.c
Normal file
29
fftw-3.3.10/rdft/scalar/hfb.c
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * First genus object: at this point GENUS comes from rdft/scalar/hf.h,
 * where it is X(rdft_hf_genus).  It is initialised with the kind R2HC.
 * NOTE(review): the meaning of the trailing 1 is not visible in this
 * file; check the hc2hc_genus declaration.
 */
const hc2hc_genus GENUS = { R2HC, 1 };
|
||||
|
||||
/* Drop the first definition of GENUS so that hb.h can define it again. */
#undef GENUS
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/* Second genus object: GENUS is now X(rdft_hb_genus) (from hb.h), kind HC2R. */
const hc2hc_genus GENUS = { HC2R, 1 };
|
||||
37
fftw-3.3.10/rdft/scalar/r2c.c
Normal file
37
fftw-3.3.10/rdft/scalar/r2c.c
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
/*
 * Four kr2c_genus objects are defined in turn, one per included header.
 * Each round follows the same pattern: include a header that supplies
 * GENUS (presumably each one #defines it the way r2cb.h does, as
 * X(rdft_r2cb_genus)), define the object with its transform kind, then
 * #undef GENUS so the next header can define it afresh.
 * NOTE(review): the meaning of the trailing 1 in each initialiser is not
 * visible in this file; check the kr2c_genus declaration.
 */
#include "rdft/scalar/r2cf.h"
|
||||
/* kind R2HC */
const kr2c_genus GENUS = { R2HC, 1 };
|
||||
#undef GENUS
|
||||
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
/* kind R2HCII */
const kr2c_genus GENUS = { R2HCII, 1 };
|
||||
#undef GENUS
|
||||
|
||||
#include "rdft/scalar/r2cb.h"
|
||||
/* kind HC2R */
const kr2c_genus GENUS = { HC2R, 1 };
|
||||
#undef GENUS
|
||||
|
||||
#include "rdft/scalar/r2cbIII.h"
|
||||
/* kind HC2RIII */
const kr2c_genus GENUS = { HC2RIII, 1 };
|
||||
#undef GENUS
|
||||
23
fftw-3.3.10/rdft/scalar/r2cb.h
Normal file
23
fftw-3.3.10/rdft/scalar/r2cb.h
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/*
 * In this header GENUS expands to X(rdft_r2cb_genus); the extern below
 * declares that object with type kr2c_genus.  The header includes
 * nothing and has no include guard, so X() and kr2c_genus must already
 * be visible to the including file.
 */
#define GENUS X(rdft_r2cb_genus)
|
||||
extern const kr2c_genus GENUS;
|
||||
109
fftw-3.3.10/rdft/scalar/r2cb/Makefile.am
Normal file
109
fftw-3.3.10/rdft/scalar/r2cb/Makefile.am
Normal file
@@ -0,0 +1,109 @@
|
||||
# This Makefile.am specifies a set of codelets, efficient transforms
|
||||
# of small sizes, that are used as building blocks (kernels) by FFTW
|
||||
# to build up large transforms, as well as the options for generating
|
||||
# and compiling them.
|
||||
|
||||
# You can customize FFTW for special needs, e.g. to handle certain
|
||||
# sizes more efficiently, by adding new codelets to the lists of those
|
||||
# included by default. If you change the list of codelets, any new
|
||||
# ones you added will be automatically generated when you run the
|
||||
# bootstrap script (see "Generating your own code" in the FFTW
|
||||
# manual).
|
||||
|
||||
###########################################################################
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
noinst_LTLIBRARIES = librdft_scalar_r2cb.la
|
||||
|
||||
###########################################################################
|
||||
# r2cb_<n> is a hard-coded complex-to-real FFT of size <n> (base cases
|
||||
# of real-output FFT recursion)
|
||||
R2CB = r2cb_2.c r2cb_3.c r2cb_4.c r2cb_5.c r2cb_6.c r2cb_7.c r2cb_8.c \
|
||||
r2cb_9.c r2cb_10.c r2cb_11.c r2cb_12.c r2cb_13.c r2cb_14.c r2cb_15.c \
|
||||
r2cb_16.c r2cb_32.c r2cb_64.c r2cb_128.c r2cb_20.c r2cb_25.c
|
||||
# r2cb_30.c r2cb_40.c r2cb_50.c
|
||||
|
||||
###########################################################################
|
||||
# hb_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF
|
||||
# step for a real-output FFT. Every hb codelet must have a
|
||||
# corresponding r2cbIII codelet (see below)!
|
||||
HB = hb_2.c hb_3.c hb_4.c hb_5.c hb_6.c hb_7.c hb_8.c hb_9.c \
|
||||
hb_10.c hb_12.c hb_15.c hb_16.c hb_32.c hb_64.c \
|
||||
hb_20.c hb_25.c # hb_30.c hb_40.c hb_50.c
|
||||
|
||||
# like hb, but generates part of its trig table on the fly (good for large n)
|
||||
HB2 = hb2_4.c hb2_8.c hb2_16.c hb2_32.c \
|
||||
hb2_5.c hb2_20.c hb2_25.c
|
||||
|
||||
# an r2cb transform where the output is shifted by half a sample (input
|
||||
# is multiplied by a phase). This is needed as part of the DIF recursion;
|
||||
# every hb_<r> or hb2_<r> codelet should have a corresponding r2cbIII_<r>
|
||||
R2CBIII = r2cbIII_2.c r2cbIII_3.c r2cbIII_4.c r2cbIII_5.c r2cbIII_6.c \
|
||||
r2cbIII_7.c r2cbIII_8.c r2cbIII_9.c r2cbIII_10.c r2cbIII_12.c \
|
||||
r2cbIII_15.c r2cbIII_16.c r2cbIII_32.c r2cbIII_64.c \
|
||||
r2cbIII_20.c r2cbIII_25.c # r2cbIII_30.c r2cbIII_40.c r2cbIII_50.c
|
||||
|
||||
###########################################################################
|
||||
# hc2cb_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF
|
||||
# step for a real-output FFT with rdft2-style input. <r> must be even.
|
||||
HC2CB = hc2cb_2.c hc2cb_4.c hc2cb_6.c hc2cb_8.c hc2cb_10.c hc2cb_12.c \
|
||||
hc2cb_16.c hc2cb_32.c \
|
||||
hc2cb_20.c # hc2cb_30.c
|
||||
|
||||
HC2CBDFT = hc2cbdft_2.c hc2cbdft_4.c hc2cbdft_6.c hc2cbdft_8.c \
|
||||
hc2cbdft_10.c hc2cbdft_12.c hc2cbdft_16.c hc2cbdft_32.c \
|
||||
hc2cbdft_20.c # hc2cbdft_30.c
|
||||
|
||||
# like hc2cb, but generates part of its trig table on the fly (good
|
||||
# for large n)
|
||||
HC2CB2 = hc2cb2_4.c hc2cb2_8.c hc2cb2_16.c hc2cb2_32.c \
|
||||
hc2cb2_20.c # hc2cb2_30.c
|
||||
HC2CBDFT2 = hc2cbdft2_4.c hc2cbdft2_8.c hc2cbdft2_16.c hc2cbdft2_32.c \
|
||||
hc2cbdft2_20.c # hc2cbdft2_30.c
|
||||
|
||||
###########################################################################
|
||||
ALL_CODELETS = $(R2CB) $(HB) $(HB2) $(R2CBIII) $(HC2CB) $(HC2CB2) \
|
||||
$(HC2CBDFT) $(HC2CBDFT2)
|
||||
|
||||
BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
|
||||
|
||||
librdft_scalar_r2cb_la_SOURCES = $(BUILT_SOURCES)
|
||||
|
||||
SOLVTAB_NAME = X(solvtab_rdft_r2cb)
|
||||
XRENAME=X
|
||||
|
||||
# special rules for regenerating codelets.
|
||||
include $(top_srcdir)/support/Makefile.codelets
|
||||
|
||||
if MAINTAINER_MODE
|
||||
FLAGS_R2CB=$(RDFT_FLAGS_COMMON) -sign 1
|
||||
FLAGS_HB=$(RDFT_FLAGS_COMMON) -sign 1
|
||||
FLAGS_HB2=$(RDFT_FLAGS_COMMON) -sign 1 -twiddle-log3 -precompute-twiddles
|
||||
FLAGS_HC2CB=$(RDFT_FLAGS_COMMON) -sign 1
|
||||
FLAGS_HC2CB2=$(RDFT_FLAGS_COMMON) -sign 1 -twiddle-log3 -precompute-twiddles
|
||||
FLAGS_R2CBIII=$(RDFT_FLAGS_COMMON) -sign 1
|
||||
|
||||
r2cb_%.c: $(CODELET_DEPS) $(GEN_R2CB)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CB) $(FLAGS_R2CB) -n $* -name r2cb_$* -include "rdft/scalar/r2cb.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
hb_%.c: $(CODELET_DEPS) $(GEN_HC2HC)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HB) -n $* -dif -name hb_$* -include "rdft/scalar/hb.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
hb2_%.c: $(CODELET_DEPS) $(GEN_HC2HC)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HB2) -n $* -dif -name hb2_$* -include "rdft/scalar/hb.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
r2cbIII_%.c: $(CODELET_DEPS) $(GEN_R2CB)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CB) $(FLAGS_R2CB) -n $* -name r2cbIII_$* -dft-III -include "rdft/scalar/r2cbIII.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
hc2cb_%.c: $(CODELET_DEPS) $(GEN_HC2C)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CB) -n $* -dif -name hc2cb_$* -include "rdft/scalar/hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
hc2cb2_%.c: $(CODELET_DEPS) $(GEN_HC2C)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CB2) -n $* -dif -name hc2cb2_$* -include "rdft/scalar/hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
hc2cbdft_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CB) -n $* -dif -name hc2cbdft_$* -include "rdft/scalar/hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# Regenerates an hc2cbdft2_<n>.c codelet with $(GEN_HC2CDFT), piping the
# generator output through $(ADD_DATE) and $(INDENT) into the target.
# NOTE(review): this rule passes $(FLAGS_HC2CB), whereas the hc2cb2_%.c rule
# in this file passes $(FLAGS_HC2CB2) (which adds -twiddle-log3
# -precompute-twiddles).  Confirm whether the hc2cbdft2 codelets are meant
# to be generated without those flags before changing anything here.
hc2cbdft2_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CB) -n $* -dif -name hc2cbdft2_$* -include "rdft/scalar/hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
endif # MAINTAINER_MODE
|
||||
1153
fftw-3.3.10/rdft/scalar/r2cb/Makefile.in
Normal file
1153
fftw-3.3.10/rdft/scalar/r2cb/Makefile.in
Normal file
File diff suppressed because it is too large
Load Diff
183
fftw-3.3.10/rdft/scalar/r2cb/codlist.c
Normal file
183
fftw-3.3.10/rdft/scalar/r2cb/codlist.c
Normal file
@@ -0,0 +1,183 @@
|
||||
#include "kernel/ifftw.h"
|
||||
|
||||
|
||||
extern void X(codelet_r2cb_2)(planner *);
|
||||
extern void X(codelet_r2cb_3)(planner *);
|
||||
extern void X(codelet_r2cb_4)(planner *);
|
||||
extern void X(codelet_r2cb_5)(planner *);
|
||||
extern void X(codelet_r2cb_6)(planner *);
|
||||
extern void X(codelet_r2cb_7)(planner *);
|
||||
extern void X(codelet_r2cb_8)(planner *);
|
||||
extern void X(codelet_r2cb_9)(planner *);
|
||||
extern void X(codelet_r2cb_10)(planner *);
|
||||
extern void X(codelet_r2cb_11)(planner *);
|
||||
extern void X(codelet_r2cb_12)(planner *);
|
||||
extern void X(codelet_r2cb_13)(planner *);
|
||||
extern void X(codelet_r2cb_14)(planner *);
|
||||
extern void X(codelet_r2cb_15)(planner *);
|
||||
extern void X(codelet_r2cb_16)(planner *);
|
||||
extern void X(codelet_r2cb_32)(planner *);
|
||||
extern void X(codelet_r2cb_64)(planner *);
|
||||
extern void X(codelet_r2cb_128)(planner *);
|
||||
extern void X(codelet_r2cb_20)(planner *);
|
||||
extern void X(codelet_r2cb_25)(planner *);
|
||||
extern void X(codelet_hb_2)(planner *);
|
||||
extern void X(codelet_hb_3)(planner *);
|
||||
extern void X(codelet_hb_4)(planner *);
|
||||
extern void X(codelet_hb_5)(planner *);
|
||||
extern void X(codelet_hb_6)(planner *);
|
||||
extern void X(codelet_hb_7)(planner *);
|
||||
extern void X(codelet_hb_8)(planner *);
|
||||
extern void X(codelet_hb_9)(planner *);
|
||||
extern void X(codelet_hb_10)(planner *);
|
||||
extern void X(codelet_hb_12)(planner *);
|
||||
extern void X(codelet_hb_15)(planner *);
|
||||
extern void X(codelet_hb_16)(planner *);
|
||||
extern void X(codelet_hb_32)(planner *);
|
||||
extern void X(codelet_hb_64)(planner *);
|
||||
extern void X(codelet_hb_20)(planner *);
|
||||
extern void X(codelet_hb_25)(planner *);
|
||||
extern void X(codelet_hb2_4)(planner *);
|
||||
extern void X(codelet_hb2_8)(planner *);
|
||||
extern void X(codelet_hb2_16)(planner *);
|
||||
extern void X(codelet_hb2_32)(planner *);
|
||||
extern void X(codelet_hb2_5)(planner *);
|
||||
extern void X(codelet_hb2_20)(planner *);
|
||||
extern void X(codelet_hb2_25)(planner *);
|
||||
extern void X(codelet_r2cbIII_2)(planner *);
|
||||
extern void X(codelet_r2cbIII_3)(planner *);
|
||||
extern void X(codelet_r2cbIII_4)(planner *);
|
||||
extern void X(codelet_r2cbIII_5)(planner *);
|
||||
extern void X(codelet_r2cbIII_6)(planner *);
|
||||
extern void X(codelet_r2cbIII_7)(planner *);
|
||||
extern void X(codelet_r2cbIII_8)(planner *);
|
||||
extern void X(codelet_r2cbIII_9)(planner *);
|
||||
extern void X(codelet_r2cbIII_10)(planner *);
|
||||
extern void X(codelet_r2cbIII_12)(planner *);
|
||||
extern void X(codelet_r2cbIII_15)(planner *);
|
||||
extern void X(codelet_r2cbIII_16)(planner *);
|
||||
extern void X(codelet_r2cbIII_32)(planner *);
|
||||
extern void X(codelet_r2cbIII_64)(planner *);
|
||||
extern void X(codelet_r2cbIII_20)(planner *);
|
||||
extern void X(codelet_r2cbIII_25)(planner *);
|
||||
extern void X(codelet_hc2cb_2)(planner *);
|
||||
extern void X(codelet_hc2cb_4)(planner *);
|
||||
extern void X(codelet_hc2cb_6)(planner *);
|
||||
extern void X(codelet_hc2cb_8)(planner *);
|
||||
extern void X(codelet_hc2cb_10)(planner *);
|
||||
extern void X(codelet_hc2cb_12)(planner *);
|
||||
extern void X(codelet_hc2cb_16)(planner *);
|
||||
extern void X(codelet_hc2cb_32)(planner *);
|
||||
extern void X(codelet_hc2cb_20)(planner *);
|
||||
extern void X(codelet_hc2cb2_4)(planner *);
|
||||
extern void X(codelet_hc2cb2_8)(planner *);
|
||||
extern void X(codelet_hc2cb2_16)(planner *);
|
||||
extern void X(codelet_hc2cb2_32)(planner *);
|
||||
extern void X(codelet_hc2cb2_20)(planner *);
|
||||
extern void X(codelet_hc2cbdft_2)(planner *);
|
||||
extern void X(codelet_hc2cbdft_4)(planner *);
|
||||
extern void X(codelet_hc2cbdft_6)(planner *);
|
||||
extern void X(codelet_hc2cbdft_8)(planner *);
|
||||
extern void X(codelet_hc2cbdft_10)(planner *);
|
||||
extern void X(codelet_hc2cbdft_12)(planner *);
|
||||
extern void X(codelet_hc2cbdft_16)(planner *);
|
||||
extern void X(codelet_hc2cbdft_32)(planner *);
|
||||
extern void X(codelet_hc2cbdft_20)(planner *);
|
||||
extern void X(codelet_hc2cbdft2_4)(planner *);
|
||||
extern void X(codelet_hc2cbdft2_8)(planner *);
|
||||
extern void X(codelet_hc2cbdft2_16)(planner *);
|
||||
extern void X(codelet_hc2cbdft2_32)(planner *);
|
||||
extern void X(codelet_hc2cbdft2_20)(planner *);
|
||||
|
||||
|
||||
extern const solvtab X(solvtab_rdft_r2cb);
|
||||
const solvtab X(solvtab_rdft_r2cb) = {
|
||||
SOLVTAB(X(codelet_r2cb_2)),
|
||||
SOLVTAB(X(codelet_r2cb_3)),
|
||||
SOLVTAB(X(codelet_r2cb_4)),
|
||||
SOLVTAB(X(codelet_r2cb_5)),
|
||||
SOLVTAB(X(codelet_r2cb_6)),
|
||||
SOLVTAB(X(codelet_r2cb_7)),
|
||||
SOLVTAB(X(codelet_r2cb_8)),
|
||||
SOLVTAB(X(codelet_r2cb_9)),
|
||||
SOLVTAB(X(codelet_r2cb_10)),
|
||||
SOLVTAB(X(codelet_r2cb_11)),
|
||||
SOLVTAB(X(codelet_r2cb_12)),
|
||||
SOLVTAB(X(codelet_r2cb_13)),
|
||||
SOLVTAB(X(codelet_r2cb_14)),
|
||||
SOLVTAB(X(codelet_r2cb_15)),
|
||||
SOLVTAB(X(codelet_r2cb_16)),
|
||||
SOLVTAB(X(codelet_r2cb_32)),
|
||||
SOLVTAB(X(codelet_r2cb_64)),
|
||||
SOLVTAB(X(codelet_r2cb_128)),
|
||||
SOLVTAB(X(codelet_r2cb_20)),
|
||||
SOLVTAB(X(codelet_r2cb_25)),
|
||||
SOLVTAB(X(codelet_hb_2)),
|
||||
SOLVTAB(X(codelet_hb_3)),
|
||||
SOLVTAB(X(codelet_hb_4)),
|
||||
SOLVTAB(X(codelet_hb_5)),
|
||||
SOLVTAB(X(codelet_hb_6)),
|
||||
SOLVTAB(X(codelet_hb_7)),
|
||||
SOLVTAB(X(codelet_hb_8)),
|
||||
SOLVTAB(X(codelet_hb_9)),
|
||||
SOLVTAB(X(codelet_hb_10)),
|
||||
SOLVTAB(X(codelet_hb_12)),
|
||||
SOLVTAB(X(codelet_hb_15)),
|
||||
SOLVTAB(X(codelet_hb_16)),
|
||||
SOLVTAB(X(codelet_hb_32)),
|
||||
SOLVTAB(X(codelet_hb_64)),
|
||||
SOLVTAB(X(codelet_hb_20)),
|
||||
SOLVTAB(X(codelet_hb_25)),
|
||||
SOLVTAB(X(codelet_hb2_4)),
|
||||
SOLVTAB(X(codelet_hb2_8)),
|
||||
SOLVTAB(X(codelet_hb2_16)),
|
||||
SOLVTAB(X(codelet_hb2_32)),
|
||||
SOLVTAB(X(codelet_hb2_5)),
|
||||
SOLVTAB(X(codelet_hb2_20)),
|
||||
SOLVTAB(X(codelet_hb2_25)),
|
||||
SOLVTAB(X(codelet_r2cbIII_2)),
|
||||
SOLVTAB(X(codelet_r2cbIII_3)),
|
||||
SOLVTAB(X(codelet_r2cbIII_4)),
|
||||
SOLVTAB(X(codelet_r2cbIII_5)),
|
||||
SOLVTAB(X(codelet_r2cbIII_6)),
|
||||
SOLVTAB(X(codelet_r2cbIII_7)),
|
||||
SOLVTAB(X(codelet_r2cbIII_8)),
|
||||
SOLVTAB(X(codelet_r2cbIII_9)),
|
||||
SOLVTAB(X(codelet_r2cbIII_10)),
|
||||
SOLVTAB(X(codelet_r2cbIII_12)),
|
||||
SOLVTAB(X(codelet_r2cbIII_15)),
|
||||
SOLVTAB(X(codelet_r2cbIII_16)),
|
||||
SOLVTAB(X(codelet_r2cbIII_32)),
|
||||
SOLVTAB(X(codelet_r2cbIII_64)),
|
||||
SOLVTAB(X(codelet_r2cbIII_20)),
|
||||
SOLVTAB(X(codelet_r2cbIII_25)),
|
||||
SOLVTAB(X(codelet_hc2cb_2)),
|
||||
SOLVTAB(X(codelet_hc2cb_4)),
|
||||
SOLVTAB(X(codelet_hc2cb_6)),
|
||||
SOLVTAB(X(codelet_hc2cb_8)),
|
||||
SOLVTAB(X(codelet_hc2cb_10)),
|
||||
SOLVTAB(X(codelet_hc2cb_12)),
|
||||
SOLVTAB(X(codelet_hc2cb_16)),
|
||||
SOLVTAB(X(codelet_hc2cb_32)),
|
||||
SOLVTAB(X(codelet_hc2cb_20)),
|
||||
SOLVTAB(X(codelet_hc2cb2_4)),
|
||||
SOLVTAB(X(codelet_hc2cb2_8)),
|
||||
SOLVTAB(X(codelet_hc2cb2_16)),
|
||||
SOLVTAB(X(codelet_hc2cb2_32)),
|
||||
SOLVTAB(X(codelet_hc2cb2_20)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_2)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_4)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_6)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_8)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_10)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_12)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_16)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_32)),
|
||||
SOLVTAB(X(codelet_hc2cbdft_20)),
|
||||
SOLVTAB(X(codelet_hc2cbdft2_4)),
|
||||
SOLVTAB(X(codelet_hc2cbdft2_8)),
|
||||
SOLVTAB(X(codelet_hc2cbdft2_16)),
|
||||
SOLVTAB(X(codelet_hc2cbdft2_32)),
|
||||
SOLVTAB(X(codelet_hc2cbdft2_20)),
|
||||
SOLVTAB_END
|
||||
};
|
||||
858
fftw-3.3.10/rdft/scalar/r2cb/hb2_16.c
Normal file
858
fftw-3.3.10/rdft/scalar/r2cb/hb2_16.c
Normal file
@@ -0,0 +1,858 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:55 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 134 FP multiplications,
|
||||
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
|
||||
* 93 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb2_16, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).  Machine-generated size-16 kernel;
 * see the "Generated by" line above for the generator options.
 *
 * For every m in [mb, me) the body loads cr[WS(rs, k)] and ci[WS(rs, k)] for
 * k = 0..15, combines them, and writes 16 cr and 16 ci results back to the
 * same slots.  Within one iteration every load precedes the first store.
 * Between iterations cr advances by +ms, ci by -ms and W by 8 values.
 *
 * cr, ci  data arrays, read and then overwritten in place
 * W       twiddle table; offset once by (mb - 1) * 8, then W[0..7] are read
 *         on each iteration
 * rs      stride argument handed to WS() to address the 16 elements
 * mb, me  half-open iteration range
 * ms      per-iteration step for cr (added) and ci (subtracted)
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * come from the included headers and are not visible in this file.
 */
static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: 0.92387953... = cos(pi/8), 0.70710678... = sqrt(1/2), 0.41421356... = sqrt(2) - 1. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the twiddle pointer starts (mb - 1) * 8 values into W. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E Tv, Tw, T2z, T2C, TB, TF, Ty, Tz, T1V, TA, T2G, T3Q, T3C, T3g, T3L;
|
||||
E T30, T3m, T3z, T3w, T3s, T1X, T1Y, T2u, T2c, T2p, TE, TG, T1G, T1o, T1D;
|
||||
/* Load W[0..7] and precompute the products that serve as output multipliers further down. */
{
|
||||
E T3f, T3l, T2F, T3r, T2Z, T3v, TD, Tx;
|
||||
Tv = W[0];
|
||||
Tw = W[2];
|
||||
Tx = Tv * Tw;
|
||||
T2z = W[6];
|
||||
T3f = Tv * T2z;
|
||||
T2C = W[7];
|
||||
T3l = Tv * T2C;
|
||||
TB = W[4];
|
||||
T2F = Tv * TB;
|
||||
T3r = Tw * TB;
|
||||
TF = W[5];
|
||||
T2Z = Tv * TF;
|
||||
T3v = Tw * TF;
|
||||
Ty = W[1];
|
||||
Tz = W[3];
|
||||
TD = Tv * Tz;
|
||||
T1V = FMA(Ty, Tz, Tx);
|
||||
TA = FNMS(Ty, Tz, Tx);
|
||||
T2G = FNMS(Ty, TF, T2F);
|
||||
T3Q = FMA(Tz, TB, T3v);
|
||||
T3C = FNMS(Ty, TB, T2Z);
|
||||
T3g = FMA(Ty, T2C, T3f);
|
||||
T3L = FNMS(Tz, TF, T3r);
|
||||
T30 = FMA(Ty, TB, T2Z);
|
||||
T3m = FNMS(Ty, T2z, T3l);
|
||||
T3z = FMA(Ty, TF, T2F);
|
||||
T3w = FNMS(Tz, TB, T3v);
|
||||
T3s = FMA(Tz, TF, T3r);
|
||||
{
|
||||
E T1W, T2b, TC, T1n;
|
||||
T1W = T1V * TB;
|
||||
T2b = T1V * TF;
|
||||
T1X = FNMS(Ty, Tw, TD);
|
||||
T1Y = FNMS(T1X, TF, T1W);
|
||||
T2u = FNMS(T1X, TB, T2b);
|
||||
T2c = FMA(T1X, TB, T2b);
|
||||
T2p = FMA(T1X, TF, T1W);
|
||||
TC = TA * TB;
|
||||
T1n = TA * TF;
|
||||
TE = FMA(Ty, Tw, TD);
|
||||
TG = FNMS(TE, TF, TC);
|
||||
T1G = FNMS(TE, TB, T1n);
|
||||
T1o = FMA(TE, TB, T1n);
|
||||
T1D = FMA(TE, TF, TC);
|
||||
}
|
||||
}
|
||||
/* Load all 32 inputs and form the sums/differences shared by the outputs. */
{
|
||||
E TL, T1Z, T2d, T1t, T31, T34, T3n, T3D, T3E, T3R, T1w, T20, Tf, T3M, T2L;
|
||||
E T3h, TW, T2e, T3G, T3H, T3N, T2Q, T36, T2V, T37, Tu, T3S, T18, T1z, T24;
|
||||
E T2g, T27, T2h, T1j, T1y;
|
||||
{
|
||||
E T3, TH, TU, T2I, T1s, T32, T6, T1p, Ta, TM, TK, T33, TP, T2J, Td;
|
||||
E TR;
|
||||
{
|
||||
E T1, T2, TS, TT;
|
||||
T1 = cr[0];
|
||||
T2 = ci[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
TH = T1 - T2;
|
||||
TS = ci[WS(rs, 9)];
|
||||
TT = cr[WS(rs, 14)];
|
||||
TU = TS + TT;
|
||||
T2I = TS - TT;
|
||||
}
|
||||
{
|
||||
E T1q, T1r, T4, T5;
|
||||
T1q = ci[WS(rs, 15)];
|
||||
T1r = cr[WS(rs, 8)];
|
||||
T1s = T1q + T1r;
|
||||
T32 = T1q - T1r;
|
||||
T4 = cr[WS(rs, 4)];
|
||||
T5 = ci[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
T1p = T4 - T5;
|
||||
}
|
||||
{
|
||||
E T8, T9, TI, TJ;
|
||||
T8 = cr[WS(rs, 2)];
|
||||
T9 = ci[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
TM = T8 - T9;
|
||||
TI = ci[WS(rs, 11)];
|
||||
TJ = cr[WS(rs, 12)];
|
||||
TK = TI + TJ;
|
||||
T33 = TI - TJ;
|
||||
}
|
||||
{
|
||||
E TN, TO, Tb, Tc;
|
||||
TN = ci[WS(rs, 13)];
|
||||
TO = cr[WS(rs, 10)];
|
||||
TP = TN + TO;
|
||||
T2J = TN - TO;
|
||||
Tb = ci[WS(rs, 1)];
|
||||
Tc = cr[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
TR = Tb - Tc;
|
||||
}
|
||||
TL = TH - TK;
|
||||
T1Z = TH + TK;
|
||||
T2d = T1s - T1p;
|
||||
T1t = T1p + T1s;
|
||||
T31 = Ta - Td;
|
||||
T34 = T32 - T33;
|
||||
T3n = T34 - T31;
|
||||
{
|
||||
E T1u, T1v, T7, Te;
|
||||
T3D = T32 + T33;
|
||||
T3E = T2J + T2I;
|
||||
T3R = T3D - T3E;
|
||||
T1u = TM + TP;
|
||||
T1v = TR + TU;
|
||||
T1w = T1u - T1v;
|
||||
T20 = T1u + T1v;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 + Te;
|
||||
T3M = T7 - Te;
|
||||
{
|
||||
E T2H, T2K, TQ, TV;
|
||||
T2H = T3 - T6;
|
||||
T2K = T2I - T2J;
|
||||
T2L = T2H + T2K;
|
||||
T3h = T2H - T2K;
|
||||
TQ = TM - TP;
|
||||
TV = TR - TU;
|
||||
TW = TQ + TV;
|
||||
T2e = TQ - TV;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
E Ti, T1e, T1c, T2N, T1h, T2O, Tl, T19, Tp, T13, T11, T2S, T16, T2T, Ts;
|
||||
E TY, T2M, T2P;
|
||||
{
|
||||
E Tg, Th, T1a, T1b;
|
||||
Tg = cr[WS(rs, 1)];
|
||||
Th = ci[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
T1e = Tg - Th;
|
||||
T1a = ci[WS(rs, 14)];
|
||||
T1b = cr[WS(rs, 9)];
|
||||
T1c = T1a + T1b;
|
||||
T2N = T1a - T1b;
|
||||
}
|
||||
{
|
||||
E T1f, T1g, Tj, Tk;
|
||||
T1f = ci[WS(rs, 10)];
|
||||
T1g = cr[WS(rs, 13)];
|
||||
T1h = T1f + T1g;
|
||||
T2O = T1f - T1g;
|
||||
Tj = cr[WS(rs, 5)];
|
||||
Tk = ci[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
T19 = Tj - Tk;
|
||||
}
|
||||
{
|
||||
E Tn, To, TZ, T10;
|
||||
Tn = ci[0];
|
||||
To = cr[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T13 = Tn - To;
|
||||
TZ = ci[WS(rs, 8)];
|
||||
T10 = cr[WS(rs, 15)];
|
||||
T11 = TZ + T10;
|
||||
T2S = TZ - T10;
|
||||
}
|
||||
{
|
||||
E T14, T15, Tq, Tr;
|
||||
T14 = ci[WS(rs, 12)];
|
||||
T15 = cr[WS(rs, 11)];
|
||||
T16 = T14 + T15;
|
||||
T2T = T14 - T15;
|
||||
Tq = cr[WS(rs, 3)];
|
||||
Tr = ci[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
TY = Tq - Tr;
|
||||
}
|
||||
T3G = T2N + T2O;
|
||||
T3H = T2S + T2T;
|
||||
T3N = T3H - T3G;
|
||||
T2M = Ti - Tl;
|
||||
T2P = T2N - T2O;
|
||||
T2Q = T2M - T2P;
|
||||
T36 = T2M + T2P;
|
||||
{
|
||||
E T2R, T2U, Tm, Tt;
|
||||
T2R = Tp - Ts;
|
||||
T2U = T2S - T2T;
|
||||
T2V = T2R + T2U;
|
||||
T37 = T2U - T2R;
|
||||
Tm = Ti + Tl;
|
||||
Tt = Tp + Ts;
|
||||
Tu = Tm + Tt;
|
||||
T3S = Tm - Tt;
|
||||
}
|
||||
{
|
||||
E T12, T17, T22, T23;
|
||||
T12 = TY - T11;
|
||||
T17 = T13 - T16;
|
||||
T18 = FNMS(KP414213562, T17, T12);
|
||||
T1z = FMA(KP414213562, T12, T17);
|
||||
T22 = T1c - T19;
|
||||
T23 = T1e + T1h;
|
||||
T24 = FNMS(KP414213562, T23, T22);
|
||||
T2g = FMA(KP414213562, T22, T23);
|
||||
}
|
||||
{
|
||||
E T25, T26, T1d, T1i;
|
||||
T25 = TY + T11;
|
||||
T26 = T13 + T16;
|
||||
T27 = FNMS(KP414213562, T26, T25);
|
||||
T2h = FMA(KP414213562, T25, T26);
|
||||
T1d = T19 + T1c;
|
||||
T1i = T1e - T1h;
|
||||
T1j = FMA(KP414213562, T1i, T1d);
|
||||
T1y = FNMS(KP414213562, T1d, T1i);
|
||||
}
|
||||
}
|
||||
/* Stores start here; every input of this iteration has already been read. */
cr[0] = Tf + Tu;
|
||||
{
|
||||
E T3B, T3K, T3F, T3I, T3J, T3A;
|
||||
T3A = Tf - Tu;
|
||||
T3B = T3z * T3A;
|
||||
T3K = T3C * T3A;
|
||||
T3F = T3D + T3E;
|
||||
T3I = T3G + T3H;
|
||||
T3J = T3F - T3I;
|
||||
ci[0] = T3F + T3I;
|
||||
ci[WS(rs, 8)] = FMA(T3z, T3J, T3K);
|
||||
cr[WS(rs, 8)] = FNMS(T3C, T3J, T3B);
|
||||
}
|
||||
{
|
||||
E T3O, T3P, T3T, T3U;
|
||||
T3O = T3M - T3N;
|
||||
T3P = T3L * T3O;
|
||||
T3T = T3R - T3S;
|
||||
T3U = T3L * T3T;
|
||||
cr[WS(rs, 12)] = FNMS(T3Q, T3T, T3P);
|
||||
ci[WS(rs, 12)] = FMA(T3Q, T3O, T3U);
|
||||
}
|
||||
{
|
||||
E T3V, T3W, T3X, T3Y;
|
||||
T3V = T3M + T3N;
|
||||
T3W = TA * T3V;
|
||||
T3X = T3S + T3R;
|
||||
T3Y = TA * T3X;
|
||||
cr[WS(rs, 4)] = FNMS(TE, T3X, T3W);
|
||||
ci[WS(rs, 4)] = FMA(TE, T3V, T3Y);
|
||||
}
|
||||
{
|
||||
E T3j, T3t, T3p, T3x, T3i, T3o;
|
||||
T3i = T37 - T36;
|
||||
T3j = FNMS(KP707106781, T3i, T3h);
|
||||
T3t = FMA(KP707106781, T3i, T3h);
|
||||
T3o = T2Q - T2V;
|
||||
T3p = FNMS(KP707106781, T3o, T3n);
|
||||
T3x = FMA(KP707106781, T3o, T3n);
|
||||
{
|
||||
E T3k, T3q, T3u, T3y;
|
||||
T3k = T3g * T3j;
|
||||
cr[WS(rs, 14)] = FNMS(T3m, T3p, T3k);
|
||||
T3q = T3g * T3p;
|
||||
ci[WS(rs, 14)] = FMA(T3m, T3j, T3q);
|
||||
T3u = T3s * T3t;
|
||||
cr[WS(rs, 6)] = FNMS(T3w, T3x, T3u);
|
||||
T3y = T3s * T3x;
|
||||
ci[WS(rs, 6)] = FMA(T3w, T3t, T3y);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2X, T3b, T39, T3d, T2W, T35, T38;
|
||||
T2W = T2Q + T2V;
|
||||
T2X = FNMS(KP707106781, T2W, T2L);
|
||||
T3b = FMA(KP707106781, T2W, T2L);
|
||||
T35 = T31 + T34;
|
||||
T38 = T36 + T37;
|
||||
T39 = FNMS(KP707106781, T38, T35);
|
||||
T3d = FMA(KP707106781, T38, T35);
|
||||
{
|
||||
E T2Y, T3a, T3c, T3e;
|
||||
T2Y = T2G * T2X;
|
||||
cr[WS(rs, 10)] = FNMS(T30, T39, T2Y);
|
||||
T3a = T30 * T2X;
|
||||
ci[WS(rs, 10)] = FMA(T2G, T39, T3a);
|
||||
T3c = T1V * T3b;
|
||||
cr[WS(rs, 2)] = FNMS(T1X, T3d, T3c);
|
||||
T3e = T1X * T3b;
|
||||
ci[WS(rs, 2)] = FMA(T1V, T3d, T3e);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T29, T2l, T2j, T2n;
|
||||
{
|
||||
E T21, T28, T2f, T2i;
|
||||
T21 = FNMS(KP707106781, T20, T1Z);
|
||||
T28 = T24 + T27;
|
||||
T29 = FMA(KP923879532, T28, T21);
|
||||
T2l = FNMS(KP923879532, T28, T21);
|
||||
T2f = FMA(KP707106781, T2e, T2d);
|
||||
T2i = T2g - T2h;
|
||||
T2j = FNMS(KP923879532, T2i, T2f);
|
||||
T2n = FMA(KP923879532, T2i, T2f);
|
||||
}
|
||||
{
|
||||
E T2a, T2k, T2m, T2o;
|
||||
T2a = T1Y * T29;
|
||||
cr[WS(rs, 11)] = FNMS(T2c, T2j, T2a);
|
||||
T2k = T2c * T29;
|
||||
ci[WS(rs, 11)] = FMA(T1Y, T2j, T2k);
|
||||
/* Outputs 3 are scaled by the loaded pair (Tw, Tz) = (W[2], W[3]). */
T2m = Tw * T2l;
|
||||
cr[WS(rs, 3)] = FNMS(Tz, T2n, T2m);
|
||||
T2o = Tz * T2l;
|
||||
ci[WS(rs, 3)] = FMA(Tw, T2n, T2o);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1l, T1E, T1B, T1H;
|
||||
{
|
||||
E TX, T1k, T1x, T1A;
|
||||
TX = FNMS(KP707106781, TW, TL);
|
||||
T1k = T18 - T1j;
|
||||
T1l = FNMS(KP923879532, T1k, TX);
|
||||
T1E = FMA(KP923879532, T1k, TX);
|
||||
T1x = FNMS(KP707106781, T1w, T1t);
|
||||
T1A = T1y - T1z;
|
||||
T1B = FNMS(KP923879532, T1A, T1x);
|
||||
T1H = FMA(KP923879532, T1A, T1x);
|
||||
}
|
||||
{
|
||||
E T1m, T1C, T1F, T1I;
|
||||
T1m = TG * T1l;
|
||||
cr[WS(rs, 13)] = FNMS(T1o, T1B, T1m);
|
||||
T1C = T1o * T1l;
|
||||
ci[WS(rs, 13)] = FMA(TG, T1B, T1C);
|
||||
T1F = T1D * T1E;
|
||||
cr[WS(rs, 5)] = FNMS(T1G, T1H, T1F);
|
||||
T1I = T1G * T1E;
|
||||
ci[WS(rs, 5)] = FMA(T1D, T1H, T1I);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2s, T2A, T2x, T2D;
|
||||
{
|
||||
E T2q, T2r, T2v, T2w;
|
||||
T2q = FMA(KP707106781, T20, T1Z);
|
||||
T2r = T2g + T2h;
|
||||
T2s = FNMS(KP923879532, T2r, T2q);
|
||||
T2A = FMA(KP923879532, T2r, T2q);
|
||||
T2v = FNMS(KP707106781, T2e, T2d);
|
||||
T2w = T27 - T24;
|
||||
T2x = FMA(KP923879532, T2w, T2v);
|
||||
T2D = FNMS(KP923879532, T2w, T2v);
|
||||
}
|
||||
{
|
||||
E T2t, T2y, T2B, T2E;
|
||||
T2t = T2p * T2s;
|
||||
cr[WS(rs, 7)] = FNMS(T2u, T2x, T2t);
|
||||
T2y = T2p * T2x;
|
||||
ci[WS(rs, 7)] = FMA(T2u, T2s, T2y);
|
||||
/* Outputs 15 are scaled by the loaded pair (T2z, T2C) = (W[6], W[7]). */
T2B = T2z * T2A;
|
||||
cr[WS(rs, 15)] = FNMS(T2C, T2D, T2B);
|
||||
T2E = T2z * T2D;
|
||||
ci[WS(rs, 15)] = FMA(T2C, T2A, T2E);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1L, T1R, T1P, T1T;
|
||||
{
|
||||
E T1J, T1K, T1N, T1O;
|
||||
T1J = FMA(KP707106781, TW, TL);
|
||||
T1K = T1y + T1z;
|
||||
T1L = FNMS(KP923879532, T1K, T1J);
|
||||
T1R = FMA(KP923879532, T1K, T1J);
|
||||
T1N = FMA(KP707106781, T1w, T1t);
|
||||
T1O = T1j + T18;
|
||||
T1P = FNMS(KP923879532, T1O, T1N);
|
||||
T1T = FMA(KP923879532, T1O, T1N);
|
||||
}
|
||||
{
|
||||
E T1M, T1Q, T1S, T1U;
|
||||
/* Outputs 9 are scaled by the loaded pair (TB, TF) = (W[4], W[5]). */
T1M = TB * T1L;
|
||||
cr[WS(rs, 9)] = FNMS(TF, T1P, T1M);
|
||||
T1Q = TB * T1P;
|
||||
ci[WS(rs, 9)] = FMA(TF, T1L, T1Q);
|
||||
/* Outputs 1 are scaled by the loaded pair (Tv, Ty) = (W[0], W[1]). */
T1S = Tv * T1R;
|
||||
cr[WS(rs, 1)] = FNMS(Ty, T1T, T1S);
|
||||
T1U = Tv * T1T;
|
||||
ci[WS(rs, 1)] = FMA(Ty, T1R, T1U);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the registration call below.  16 matches the "-n 16"
 * generator argument; { 104, 42, 92, 0 } repeats the operation counts from the
 * generated header comment (additions, multiplications, fused multiply/adds).
 * NOTE(review): the meaning of the trailing 0 and of GENUS is not visible here.
 */
static const hc2hc_desc desc = {
     16,
     "hb2_16",
     twinstr,
     &GENUS,
     { 104, 42, 92, 0 }
};
|
||||
|
||||
/* Registers the FMA build of hb2_16, with its descriptor, on planner p. */
void X(codelet_hb2_16)(planner *p)
{
     X(khc2hc_register)(p, hb2_16, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 108 FP multiplications,
|
||||
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
|
||||
* 80 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb2_16, non-FMA variant (compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined).  Machine-generated size-16 kernel;
 * see the "Generated by" line above for the generator options.
 *
 * For every m in [mb, me) the body loads cr[WS(rs, k)] and ci[WS(rs, k)] for
 * k = 0..15, combines them, and writes 16 cr and 16 ci results back to the
 * same slots.  Within one iteration every load precedes the first store.
 * Between iterations cr advances by +ms, ci by -ms and W by 8 values.
 *
 * cr, ci  data arrays, read and then overwritten in place
 * W       twiddle table; offset once by (mb - 1) * 8, then W[0..7] are read
 *         on each iteration
 * rs      stride argument handed to WS() to address the 16 elements
 * mb, me  half-open iteration range
 * ms      per-iteration step for cr (added) and ci (subtracted)
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * come from the included headers and are not visible in this file.
 */
static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: 0.38268343... = sin(pi/8), 0.92387953... = cos(pi/8), 0.70710678... = sqrt(1/2). */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the twiddle pointer starts (mb - 1) * 8 values into W. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
|
||||
E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
|
||||
/* Load W[0..7] and precompute the products that serve as output multipliers further down. */
{
|
||||
E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
|
||||
{
|
||||
E T1m, T1s, T1o, T1r;
|
||||
Tv = W[0];
|
||||
Ty = W[1];
|
||||
T1l = W[2];
|
||||
T1n = W[3];
|
||||
T1m = Tv * T1l;
|
||||
T1s = Ty * T1l;
|
||||
T1o = Ty * T1n;
|
||||
T1r = Tv * T1n;
|
||||
T1p = T1m + T1o;
|
||||
T1t = T1r - T1s;
|
||||
T27 = T1r + T1s;
|
||||
T25 = T1m - T1o;
|
||||
Tz = W[5];
|
||||
TA = Ty * Tz;
|
||||
T1J = T1l * Tz;
|
||||
T15 = Tv * Tz;
|
||||
T1G = T1n * Tz;
|
||||
Tw = W[4];
|
||||
Tx = Tv * Tw;
|
||||
T1K = T1n * Tw;
|
||||
T16 = Ty * Tw;
|
||||
T1F = T1l * Tw;
|
||||
}
|
||||
TB = Tx - TA;
|
||||
T21 = T1J + T1K;
|
||||
T1P = T15 - T16;
|
||||
T1H = T1F + T1G;
|
||||
T1X = T1F - T1G;
|
||||
T17 = T15 + T16;
|
||||
T1L = T1J - T1K;
|
||||
T1N = Tx + TA;
|
||||
T1v = W[6];
|
||||
T1w = W[7];
|
||||
T1x = FMA(Tv, T1v, Ty * T1w);
|
||||
T1B = FNMS(Ty, T1v, Tv * T1w);
|
||||
{
|
||||
E T2D, T2E, T29, T2a;
|
||||
T2D = T25 * Tz;
|
||||
T2E = T27 * Tw;
|
||||
T2F = T2D + T2E;
|
||||
T2T = T2D - T2E;
|
||||
T29 = T25 * Tw;
|
||||
T2a = T27 * Tz;
|
||||
T2b = T29 - T2a;
|
||||
T2R = T29 + T2a;
|
||||
}
|
||||
{
|
||||
E T3h, T3i, T33, T34;
|
||||
T3h = T1p * Tz;
|
||||
T3i = T1t * Tw;
|
||||
T3j = T3h + T3i;
|
||||
T3x = T3h - T3i;
|
||||
T33 = T1p * Tw;
|
||||
T34 = T1t * Tz;
|
||||
T35 = T33 - T34;
|
||||
T3t = T33 + T34;
|
||||
}
|
||||
}
|
||||
/* Load all 32 inputs and form the sums/differences shared by the outputs. */
{
|
||||
E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
|
||||
E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
|
||||
E T3e, T3o;
|
||||
{
|
||||
E T3, T2c, T1e, T2d, T6, T2G, T1b, T2H;
|
||||
{
|
||||
E T1, T2, T1c, T1d;
|
||||
T1 = cr[0];
|
||||
T2 = ci[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
T2c = T1 - T2;
|
||||
T1c = ci[WS(rs, 11)];
|
||||
T1d = cr[WS(rs, 12)];
|
||||
T1e = T1c - T1d;
|
||||
T2d = T1c + T1d;
|
||||
}
|
||||
{
|
||||
E T4, T5, T19, T1a;
|
||||
T4 = cr[WS(rs, 4)];
|
||||
T5 = ci[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
T2G = T4 - T5;
|
||||
T19 = ci[WS(rs, 15)];
|
||||
T1a = cr[WS(rs, 8)];
|
||||
T1b = T19 - T1a;
|
||||
T2H = T19 + T1a;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T36 = T2c + T2d;
|
||||
T3k = T2H - T2G;
|
||||
TC = T3 - T6;
|
||||
T1f = T1b - T1e;
|
||||
T2e = T2c - T2d;
|
||||
T2I = T2G + T2H;
|
||||
T1Q = T1b + T1e;
|
||||
}
|
||||
{
|
||||
E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
|
||||
{
|
||||
E T8, T9, TG, TH;
|
||||
T8 = cr[WS(rs, 2)];
|
||||
T9 = ci[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
T2f = T8 - T9;
|
||||
TG = ci[WS(rs, 13)];
|
||||
TH = cr[WS(rs, 10)];
|
||||
TI = TG - TH;
|
||||
T2g = TG + TH;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TD, TE;
|
||||
Tb = ci[WS(rs, 1)];
|
||||
Tc = cr[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
T2i = Tb - Tc;
|
||||
TD = ci[WS(rs, 9)];
|
||||
TE = cr[WS(rs, 14)];
|
||||
TF = TD - TE;
|
||||
T2j = TD + TE;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
TJ = TF - TI;
|
||||
T1R = TI + TF;
|
||||
T18 = Ta - Td;
|
||||
{
|
||||
E T2J, T2K, T2h, T2k;
|
||||
T2J = T2f + T2g;
|
||||
T2K = T2i + T2j;
|
||||
T2L = KP707106781 * (T2J - T2K);
|
||||
T37 = KP707106781 * (T2J + T2K);
|
||||
T2h = T2f - T2g;
|
||||
T2k = T2i - T2j;
|
||||
T2l = KP707106781 * (T2h + T2k);
|
||||
T3l = KP707106781 * (T2h - T2k);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Ti, T2x, TR, T2y, Tl, T2u, TO, T2v, TL, TS;
|
||||
{
|
||||
E Tg, Th, TP, TQ;
|
||||
Tg = cr[WS(rs, 1)];
|
||||
Th = ci[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
T2x = Tg - Th;
|
||||
TP = ci[WS(rs, 10)];
|
||||
TQ = cr[WS(rs, 13)];
|
||||
TR = TP - TQ;
|
||||
T2y = TP + TQ;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, TM, TN;
|
||||
Tj = cr[WS(rs, 5)];
|
||||
Tk = ci[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
T2u = Tj - Tk;
|
||||
TM = ci[WS(rs, 14)];
|
||||
TN = cr[WS(rs, 9)];
|
||||
TO = TM - TN;
|
||||
T2v = TM + TN;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T1T = TO + TR;
|
||||
TL = Ti - Tl;
|
||||
TS = TO - TR;
|
||||
TT = TL - TS;
|
||||
T1h = TL + TS;
|
||||
{
|
||||
E T2w, T2z, T39, T3a;
|
||||
T2w = T2u + T2v;
|
||||
T2z = T2x - T2y;
|
||||
T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
|
||||
T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
|
||||
T39 = T2x + T2y;
|
||||
T3a = T2v - T2u;
|
||||
T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
|
||||
T3n = FMA(KP382683432, T3a, KP923879532 * T39);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tp, T2q, T10, T2r, Ts, T2n, TX, T2o, TU, T11;
|
||||
{
|
||||
E Tn, To, TY, TZ;
|
||||
Tn = ci[0];
|
||||
To = cr[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T2q = Tn - To;
|
||||
TY = ci[WS(rs, 12)];
|
||||
TZ = cr[WS(rs, 11)];
|
||||
T10 = TY - TZ;
|
||||
T2r = TY + TZ;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, TV, TW;
|
||||
Tq = cr[WS(rs, 3)];
|
||||
Tr = ci[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
T2n = Tq - Tr;
|
||||
TV = ci[WS(rs, 8)];
|
||||
TW = cr[WS(rs, 15)];
|
||||
TX = TV - TW;
|
||||
T2o = TV + TW;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T1U = TX + T10;
|
||||
TU = Tp - Ts;
|
||||
T11 = TX - T10;
|
||||
T12 = TU + T11;
|
||||
T1i = T11 - TU;
|
||||
{
|
||||
E T2p, T2s, T3c, T3d;
|
||||
T2p = T2n - T2o;
|
||||
T2s = T2q - T2r;
|
||||
T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
|
||||
T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
|
||||
T3c = T2q + T2r;
|
||||
T3d = T2n + T2o;
|
||||
T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
|
||||
T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
|
||||
}
|
||||
}
|
||||
/* Stores start in this block; every input of this iteration has already been read. */
{
|
||||
E Tf, Tu, T1O, T1S, T1V, T1W;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
T1O = Tf - Tu;
|
||||
T1S = T1Q + T1R;
|
||||
T1V = T1T + T1U;
|
||||
T1W = T1S - T1V;
|
||||
cr[0] = Tf + Tu;
|
||||
ci[0] = T1S + T1V;
|
||||
cr[WS(rs, 8)] = FNMS(T1P, T1W, T1N * T1O);
|
||||
ci[WS(rs, 8)] = FMA(T1P, T1O, T1N * T1W);
|
||||
}
|
||||
{
|
||||
E T3g, T3r, T3q, T3s;
|
||||
{
|
||||
E T38, T3f, T3m, T3p;
|
||||
T38 = T36 - T37;
|
||||
T3f = T3b + T3e;
|
||||
T3g = T38 - T3f;
|
||||
T3r = T38 + T3f;
|
||||
T3m = T3k + T3l;
|
||||
T3p = T3n - T3o;
|
||||
T3q = T3m - T3p;
|
||||
T3s = T3m + T3p;
|
||||
}
|
||||
cr[WS(rs, 11)] = FNMS(T3j, T3q, T35 * T3g);
|
||||
ci[WS(rs, 11)] = FMA(T3j, T3g, T35 * T3q);
|
||||
/* Outputs 3 are scaled by the loaded pair (T1l, T1n) = (W[2], W[3]). */
cr[WS(rs, 3)] = FNMS(T1n, T3s, T1l * T3r);
|
||||
ci[WS(rs, 3)] = FMA(T1n, T3r, T1l * T3s);
|
||||
}
|
||||
{
|
||||
E T3w, T3B, T3A, T3C;
|
||||
{
|
||||
E T3u, T3v, T3y, T3z;
|
||||
T3u = T36 + T37;
|
||||
T3v = T3n + T3o;
|
||||
T3w = T3u - T3v;
|
||||
T3B = T3u + T3v;
|
||||
T3y = T3k - T3l;
|
||||
T3z = T3b - T3e;
|
||||
T3A = T3y + T3z;
|
||||
T3C = T3y - T3z;
|
||||
}
|
||||
cr[WS(rs, 7)] = FNMS(T3x, T3A, T3t * T3w);
|
||||
ci[WS(rs, 7)] = FMA(T3t, T3A, T3x * T3w);
|
||||
/* Outputs 15 are scaled by the loaded pair (T1v, T1w) = (W[6], W[7]). */
cr[WS(rs, 15)] = FNMS(T1w, T3C, T1v * T3B);
|
||||
ci[WS(rs, 15)] = FMA(T1v, T3C, T1w * T3B);
|
||||
}
|
||||
{
|
||||
E T14, T1q, T1k, T1u;
|
||||
{
|
||||
E TK, T13, T1g, T1j;
|
||||
TK = TC + TJ;
|
||||
T13 = KP707106781 * (TT + T12);
|
||||
T14 = TK - T13;
|
||||
T1q = TK + T13;
|
||||
T1g = T18 + T1f;
|
||||
T1j = KP707106781 * (T1h + T1i);
|
||||
T1k = T1g - T1j;
|
||||
T1u = T1g + T1j;
|
||||
}
|
||||
cr[WS(rs, 10)] = FNMS(T17, T1k, TB * T14);
|
||||
ci[WS(rs, 10)] = FMA(T17, T14, TB * T1k);
|
||||
cr[WS(rs, 2)] = FNMS(T1t, T1u, T1p * T1q);
|
||||
ci[WS(rs, 2)] = FMA(T1t, T1q, T1p * T1u);
|
||||
}
|
||||
{
|
||||
E T1A, T1I, T1E, T1M;
|
||||
{
|
||||
E T1y, T1z, T1C, T1D;
|
||||
T1y = TC - TJ;
|
||||
T1z = KP707106781 * (T1i - T1h);
|
||||
T1A = T1y - T1z;
|
||||
T1I = T1y + T1z;
|
||||
T1C = T1f - T18;
|
||||
T1D = KP707106781 * (TT - T12);
|
||||
T1E = T1C - T1D;
|
||||
T1M = T1C + T1D;
|
||||
}
|
||||
cr[WS(rs, 14)] = FNMS(T1B, T1E, T1x * T1A);
|
||||
ci[WS(rs, 14)] = FMA(T1x, T1E, T1B * T1A);
|
||||
cr[WS(rs, 6)] = FNMS(T1L, T1M, T1H * T1I);
|
||||
ci[WS(rs, 6)] = FMA(T1H, T1M, T1L * T1I);
|
||||
}
|
||||
{
|
||||
E T2C, T2S, T2Q, T2U;
|
||||
{
|
||||
E T2m, T2B, T2M, T2P;
|
||||
T2m = T2e - T2l;
|
||||
T2B = T2t - T2A;
|
||||
T2C = T2m - T2B;
|
||||
T2S = T2m + T2B;
|
||||
T2M = T2I - T2L;
|
||||
T2P = T2N - T2O;
|
||||
T2Q = T2M - T2P;
|
||||
T2U = T2M + T2P;
|
||||
}
|
||||
cr[WS(rs, 13)] = FNMS(T2F, T2Q, T2b * T2C);
|
||||
ci[WS(rs, 13)] = FMA(T2F, T2C, T2b * T2Q);
|
||||
cr[WS(rs, 5)] = FNMS(T2T, T2U, T2R * T2S);
|
||||
ci[WS(rs, 5)] = FMA(T2T, T2S, T2R * T2U);
|
||||
}
|
||||
{
|
||||
E T2X, T31, T30, T32;
|
||||
{
|
||||
E T2V, T2W, T2Y, T2Z;
|
||||
T2V = T2e + T2l;
|
||||
T2W = T2N + T2O;
|
||||
T2X = T2V - T2W;
|
||||
T31 = T2V + T2W;
|
||||
T2Y = T2I + T2L;
|
||||
T2Z = T2A + T2t;
|
||||
T30 = T2Y - T2Z;
|
||||
T32 = T2Y + T2Z;
|
||||
}
|
||||
/* Outputs 9 are scaled by the loaded pair (Tw, Tz) = (W[4], W[5]). */
cr[WS(rs, 9)] = FNMS(Tz, T30, Tw * T2X);
|
||||
ci[WS(rs, 9)] = FMA(Tw, T30, Tz * T2X);
|
||||
/* Outputs 1 are scaled by the loaded pair (Tv, Ty) = (W[0], W[1]). */
cr[WS(rs, 1)] = FNMS(Ty, T32, Tv * T31);
|
||||
ci[WS(rs, 1)] = FMA(Tv, T32, Ty * T31);
|
||||
}
|
||||
{
|
||||
E T20, T26, T24, T28;
|
||||
{
|
||||
E T1Y, T1Z, T22, T23;
|
||||
T1Y = T7 - Te;
|
||||
T1Z = T1U - T1T;
|
||||
T20 = T1Y - T1Z;
|
||||
T26 = T1Y + T1Z;
|
||||
T22 = T1Q - T1R;
|
||||
T23 = Tm - Tt;
|
||||
T24 = T22 - T23;
|
||||
T28 = T23 + T22;
|
||||
}
|
||||
cr[WS(rs, 12)] = FNMS(T21, T24, T1X * T20);
|
||||
ci[WS(rs, 12)] = FMA(T1X, T24, T21 * T20);
|
||||
cr[WS(rs, 4)] = FNMS(T27, T28, T25 * T26);
|
||||
ci[WS(rs, 4)] = FMA(T25, T28, T27 * T26);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the registration call below.  16 matches the "-n 16"
 * generator argument; { 156, 68, 40, 0 } repeats the operation counts from the
 * generated header comment (additions, multiplications, fused multiply/adds).
 * NOTE(review): the meaning of the trailing 0 and of GENUS is not visible here.
 */
static const hc2hc_desc desc = {
     16,
     "hb2_16",
     twinstr,
     &GENUS,
     { 156, 68, 40, 0 }
};
|
||||
|
||||
/* Registers the non-FMA build of hb2_16, with its descriptor, on planner p. */
void X(codelet_hb2_16)(planner *p)
{
     X(khc2hc_register)(p, hb2_16, &desc);
}
|
||||
#endif
|
||||
1087
fftw-3.3.10/rdft/scalar/r2cb/hb2_20.c
Normal file
1087
fftw-3.3.10/rdft/scalar/r2cb/hb2_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1642
fftw-3.3.10/rdft/scalar/r2cb/hb2_25.c
Normal file
1642
fftw-3.3.10/rdft/scalar/r2cb/hb2_25.c
Normal file
File diff suppressed because it is too large
Load Diff
1882
fftw-3.3.10/rdft/scalar/r2cb/hb2_32.c
Normal file
1882
fftw-3.3.10/rdft/scalar/r2cb/hb2_32.c
Normal file
File diff suppressed because it is too large
Load Diff
194
fftw-3.3.10/rdft/scalar/r2cb/hb2_4.c
Normal file
194
fftw-3.3.10/rdft/scalar/r2cb/hb2_4.c
Normal file
@@ -0,0 +1,194 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:55 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hb2_4 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 33 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb2_4, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me): reads cr[WS(rs, k)] and ci[WS(rs, k)] for
 * k = 0..3, combines them, and writes the eight results back to the same
 * slots.  All reads of an iteration happen before its first store.  Between
 * iterations cr steps by +ms, ci by -ms and W by 4 values.
 *
 * cr, ci  data arrays, read and then overwritten in place
 * W       twiddle table; offset once by (mb - 1) * 4, then W[0..3] are read
 *         on each iteration
 * rs      stride argument handed to WS()
 * mb, me  half-open iteration range
 * ms      per-iteration step for cr (added) and ci (subtracted)
 */
static void hb2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += (mb - 1) * 4;
     for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* The four twiddle values for this iteration. */
          const E w0 = W[0];
          const E w1 = W[1];
          const E w2 = W[2];
          const E w3 = W[3];

          /* Multiplier pair for outputs 2, derived from the loaded values. */
          const E w0w2 = w0 * w2;
          const E w0w3 = w0 * w3;
          const E k2a = FMA(w1, w3, w0w2);
          const E k2b = FNMS(w1, w2, w0w3);

          /* Inputs. */
          const E r0 = cr[0];
          const E r1 = cr[WS(rs, 1)];
          const E r2 = cr[WS(rs, 2)];
          const E r3 = cr[WS(rs, 3)];
          const E i0 = ci[0];
          const E i1 = ci[WS(rs, 1)];
          const E i2 = ci[WS(rs, 2)];
          const E i3 = ci[WS(rs, 3)];

          /* Pairwise sums and differences. */
          const E a_sum = r0 + i1;
          const E a_dif = r0 - i1;
          const E b_sum = r1 + i0;
          const E b_dif = r1 - i0;
          const E c_sum = i3 + r2;
          const E c_dif = i3 - r2;
          const E d_sum = i2 + r3;
          const E d_dif = i2 - r3;

          /* Unscaled value pairs for outputs 1, 2 and 3. */
          const E x1 = a_dif - d_sum;
          const E y1 = b_dif + c_sum;
          const E x2 = a_sum - b_sum;
          const E y2 = c_dif - d_dif;
          const E x3 = a_dif + d_sum;
          const E y3 = c_sum - b_dif;

          /* First product of each output, kept in a named temporary. */
          const E p1x = w0 * x1;
          const E p1y = w0 * y1;
          const E p3y = w2 * y3;
          const E p3x = w2 * x3;
          const E p2a = k2a * x2;
          const E p2b = k2b * x2;

          /* Stores, in the generated order. */
          cr[0] = a_sum + b_sum;
          ci[0] = c_dif + d_dif;
          cr[WS(rs, 1)] = FNMS(w1, y1, p1x);
          ci[WS(rs, 1)] = FMA(w1, x1, p1y);
          ci[WS(rs, 3)] = FMA(w3, x3, p3y);
          cr[WS(rs, 3)] = FNMS(w3, y3, p3x);
          cr[WS(rs, 2)] = FNMS(k2b, y2, p2a);
          ci[WS(rs, 2)] = FMA(k2a, y2, p2b);
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the registration call below.  4 matches the "-n 4"
 * generator argument; { 16, 8, 8, 0 } repeats the operation counts from the
 * generated header comment (additions, multiplications, fused multiply/adds).
 * NOTE(review): the meaning of the trailing 0 and of GENUS is not visible here.
 */
static const hc2hc_desc desc = {
     4,
     "hb2_4",
     twinstr,
     &GENUS,
     { 16, 8, 8, 0 }
};
|
||||
|
||||
/* Registers the FMA build of hb2_4, with its descriptor, on planner p. */
void X(codelet_hb2_4)(planner *p)
{
     X(khc2hc_register)(p, hb2_4, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hb2_4 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb2_4 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the kernel reads cr[WS(rs, k)] and ci[WS(rs, k)]
 * for k = 0..3, combines them with sums and differences, and writes the
 * results back to the same eight locations.  Slot 0 receives plain sums;
 * slots 1..3 each receive a complex-style product with a twiddle pair that
 * is read from, or derived from, W.  After every iteration cr advances by
 * ms, ci retreats by ms and W advances by 4 values.
 */
static void hb2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Four twiddle values are consumed per iteration; start at the group for mb. */
     W += (mb - 1) * 4;

     for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          E u_re, u_im, v_re, v_im, p_re, p_im;
          E sum_a, dif_a, sum_b, dif_b, dif_c, sum_c, dif_d, sum_d;

          /* Stored twiddle pairs: u = (W[0], W[1]) and v = (W[2], W[3]). */
          u_re = W[0];
          u_im = W[1];
          v_re = W[2];
          v_im = W[3];

          /* Derived twiddle pair: p = conj(u) * v. */
          p_re = FMA(u_re, v_re, u_im * v_im);
          p_im = FNMS(u_im, v_re, u_re * v_im);

          /* All eight inputs are read before the first output is stored. */
          {
               E cr0 = cr[0];
               E ci1 = ci[WS(rs, 1)];
               E cr1 = cr[WS(rs, 1)];
               E ci0 = ci[0];

               sum_a = cr0 + ci1;
               dif_a = cr0 - ci1;
               sum_b = cr1 + ci0;
               dif_b = cr1 - ci0;
          }
          {
               E ci3 = ci[WS(rs, 3)];
               E cr2 = cr[WS(rs, 2)];
               E ci2 = ci[WS(rs, 2)];
               E cr3 = cr[WS(rs, 3)];

               dif_c = ci3 - cr2;
               sum_c = ci3 + cr2;
               dif_d = ci2 - cr3;
               sum_d = ci2 + cr3;
          }

          /* Slot 0 is stored without a twiddle multiplication. */
          cr[0] = sum_a + sum_b;
          ci[0] = dif_c + dif_d;

          /* Slot 2: (cr, ci) = p * (q_re, q_im). */
          {
               E q_re = sum_a - sum_b;
               E q_im = dif_c - dif_d;

               cr[WS(rs, 2)] = FNMS(p_im, q_im, p_re * q_re);
               ci[WS(rs, 2)] = FMA(p_im, q_re, p_re * q_im);
          }

          /* Slot 1: u * (a_re, a_im);  slot 3: v * (b_re, b_im). */
          {
               E a_re = dif_a - sum_d;
               E a_im = dif_b + sum_c;
               E b_re, b_im;

               cr[WS(rs, 1)] = FNMS(u_im, a_im, u_re * a_re);
               ci[WS(rs, 1)] = FMA(u_re, a_im, u_im * a_re);

               b_re = dif_a + sum_d;
               b_im = sum_c - dif_b;
               cr[WS(rs, 3)] = FNMS(v_im, b_im, v_re * b_re);
               ci[WS(rs, 3)] = FMA(v_re, b_im, v_im * b_re);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 4, "hb2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hb2_4 kernel and its descriptor
 * to X(khc2hc_register) for planner p.
 */
void X(codelet_hb2_4) (planner *p)
{
     X(khc2hc_register) (p, hb2_4, &desc);
}
|
||||
#endif
|
||||
279
fftw-3.3.10/rdft/scalar/r2cb/hb2_5.c
Normal file
279
fftw-3.3.10/rdft/scalar/r2cb/hb2_5.c
Normal file
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:57 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 5 -dif -name hb2_5 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 44 FP additions, 40 FP multiplications,
|
||||
* (or, 14 additions, 10 multiplications, 30 fused multiply/add),
|
||||
* 37 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb2_5 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the kernel reads cr[WS(rs, k)] and ci[WS(rs, k)]
 * for k = 0..4 and overwrites the same ten locations.  Slot 0 receives plain
 * sums; slots 1..4 each receive a complex-style product
 * (a * x - b * y, a * y + b * x) with a twiddle pair (a, b) that is read
 * from, or derived from, W.  After every iteration cr advances by ms, ci
 * retreats by ms and W advances by 4 values.
 */
static void hb2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     {
          INT m;

          /* Four twiddle values are consumed per iteration; start at the group for mb. */
          W += (mb - 1) * 4;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(10, rs)) {
               /* Stored twiddle pairs: u = (T9, Tm) = (W[0], W[1]), v = (Tz, TB) = (W[2], W[3]). */
               E T9 = W[0];
               E TB = W[3];
               E Tz = W[2];
               E TA = T9 * Tz;
               E TF = T9 * TB;
               E Tm = W[1];
               /* Derived pairs: (TC, TG) = u * v and (TJ, TO) = conj(u) * v. */
               E TC = FNMS(Tm, TB, TA);
               E TO = FNMS(Tm, Tz, TF);
               E TG = FMA(Tm, Tz, TF);
               E TJ = FMA(Tm, TB, TA);
               E T1, Tb, TQ, Tw, T8, Ta, Tn, Tj, TL, Ts, Tq, Tr;

               /* Inputs cr[0..2] and ci[0..1]. */
               {
                    E T4, Tu, T7, Tv;

                    T1 = cr[0];
                    {
                         E T2 = cr[WS(rs, 1)], T3 = ci[0];
                         E T5 = cr[WS(rs, 2)], T6 = ci[WS(rs, 1)];

                         T4 = T2 + T3;
                         Tu = T2 - T3;
                         T7 = T5 + T6;
                         Tv = T5 - T6;
                    }
                    Tb = T4 - T7;
                    TQ = FNMS(KP618033988, Tu, Tv);
                    Tw = FMA(KP618033988, Tv, Tu);
                    T8 = T4 + T7;
                    Ta = FNMS(KP250000000, T8, T1);
               }

               /* Inputs cr[3..4] and ci[2..4]. */
               {
                    E Tf, To, Ti, Tp;

                    Tn = ci[WS(rs, 4)];
                    {
                         E Td = ci[WS(rs, 3)], Te = cr[WS(rs, 4)];
                         E Tg = ci[WS(rs, 2)], Th = cr[WS(rs, 3)];

                         Tf = Td + Te;
                         To = Td - Te;
                         Ti = Tg + Th;
                         Tp = Tg - Th;
                    }
                    Tj = FMA(KP618033988, Ti, Tf);
                    TL = FNMS(KP618033988, Tf, Ti);
                    Ts = To - Tp;
                    Tq = To + Tp;
                    Tr = FNMS(KP250000000, Tq, Tn);
               }

               /* Every input has been read; slot 0 is stored untwiddled. */
               cr[0] = T1 + T8;
               ci[0] = Tn + Tq;

               /* Slot 1: u * (Tk, Tx);  slot 4: (TC, TG) * (TD, TH). */
               {
                    E Tc = FMA(KP559016994, Tb, Ta);
                    E Tk = FNMS(KP951056516, Tj, Tc);
                    E TD = FMA(KP951056516, Tj, Tc);
                    E Tt = FMA(KP559016994, Ts, Tr);
                    E Tx = FMA(KP951056516, Tw, Tt);
                    E TH = FNMS(KP951056516, Tw, Tt);
                    E Tl, Ty, TE, TI;

                    Tl = T9 * Tk;
                    cr[WS(rs, 1)] = FNMS(Tm, Tx, Tl);
                    Ty = Tm * Tk;
                    ci[WS(rs, 1)] = FMA(T9, Tx, Ty);
                    TE = TC * TD;
                    cr[WS(rs, 4)] = FNMS(TG, TH, TE);
                    TI = TG * TD;
                    ci[WS(rs, 4)] = FMA(TC, TH, TI);
               }

               /* Slot 2: (TJ, TO) * (TM, TR);  slot 3: v * (TT, TV). */
               {
                    E TK = FNMS(KP559016994, Tb, Ta);
                    E TM = FMA(KP951056516, TL, TK);
                    E TT = FNMS(KP951056516, TL, TK);
                    E TP = FNMS(KP559016994, Ts, Tr);
                    E TR = FNMS(KP951056516, TQ, TP);
                    E TV = FMA(KP951056516, TQ, TP);
                    E TN, TS, TU, TW;

                    TN = TJ * TM;
                    cr[WS(rs, 2)] = FNMS(TO, TR, TN);
                    TS = TO * TM;
                    ci[WS(rs, 2)] = FMA(TJ, TR, TS);
                    TU = Tz * TT;
                    cr[WS(rs, 3)] = FNMS(TB, TV, TU);
                    TW = TB * TT;
                    ci[WS(rs, 3)] = FMA(Tz, TV, TW);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 5, "hb2_5", twinstr, &GENUS, { 14, 10, 30, 0 } };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hb2_5 kernel and its descriptor
 * to X(khc2hc_register) for planner p.
 */
void X(codelet_hb2_5) (planner *p)
{
     X(khc2hc_register) (p, hb2_5, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 5 -dif -name hb2_5 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 44 FP additions, 32 FP multiplications,
|
||||
* (or, 30 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 33 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb2_5 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the kernel reads cr[WS(rs, k)] and ci[WS(rs, k)]
 * for k = 0..4 and overwrites the same ten locations.  Slot 0 receives plain
 * sums; slots 1..4 each receive a complex-style product
 * (a * x - b * y, a * y + b * x) with a twiddle pair (a, b) that is read
 * from, or derived from, W.  After every iteration cr advances by ms, ci
 * retreats by ms and W advances by 4 values.
 */
static void hb2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     {
          INT m;

          /* Four twiddle values are consumed per iteration; start at the group for mb. */
          W += (mb - 1) * 4;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(10, rs)) {
               /* Stored twiddle pairs: u = (Th, Tk) = (W[0], W[1]), v = (Ti, Tl) = (W[2], W[3]). */
               E Th = W[0];
               E Tk = W[1];
               E Ti = W[2];
               E Tl = W[3];
               E Tj = Th * Ti;
               E Tw = Tk * Ti;
               E Tm = Tk * Tl;
               E Tv = Th * Tl;
               /* Derived pairs: (Tn, Tx) = conj(u) * v and (TN, TP) = u * v. */
               E Tn = Tj + Tm;
               E TP = Tv + Tw;
               E Tx = Tv - Tw;
               E TN = Tj - Tm;
               E T1, Tp, TK, TA, T8, To, T9, Tt, TI, TC, Tg, TB;

               /* Inputs cr[0..2] and ci[0..1]. */
               {
                    E T4, Ty, T7, Tz;

                    T1 = cr[0];
                    {
                         E T2 = cr[WS(rs, 1)], T3 = ci[0];
                         E T5 = cr[WS(rs, 2)], T6 = ci[WS(rs, 1)];

                         T4 = T2 + T3;
                         Ty = T2 - T3;
                         T7 = T5 + T6;
                         Tz = T5 - T6;
                    }
                    Tp = KP559016994 * (T4 - T7);
                    TK = FMA(KP951056516, Ty, KP587785252 * Tz);
                    TA = FNMS(KP951056516, Tz, KP587785252 * Ty);
                    T8 = T4 + T7;
                    To = FNMS(KP250000000, T8, T1);
               }

               /* Inputs cr[3..4] and ci[2..4]. */
               {
                    E Tc, Tr, Tf, Ts;

                    T9 = ci[WS(rs, 4)];
                    {
                         E Ta = ci[WS(rs, 3)], Tb = cr[WS(rs, 4)];
                         E Td = ci[WS(rs, 2)], Te = cr[WS(rs, 3)];

                         Tc = Ta - Tb;
                         Tr = Ta + Tb;
                         Tf = Td - Te;
                         Ts = Td + Te;
                    }
                    Tt = FNMS(KP951056516, Ts, KP587785252 * Tr);
                    TI = FMA(KP951056516, Tr, KP587785252 * Ts);
                    TC = KP559016994 * (Tc - Tf);
                    Tg = Tc + Tf;
                    TB = FNMS(KP250000000, Tg, T9);
               }

               /* Every input has been read; slot 0 is stored untwiddled. */
               cr[0] = T1 + T8;
               ci[0] = T9 + Tg;

               /* Slot 2: (Tn, Tx) * (Tu, TE);  slot 3: v * (TF, TG). */
               {
                    E Tq = To - Tp;
                    E Tu = Tq - Tt;
                    E TF = Tq + Tt;
                    E TD = TB - TC;
                    E TE = TA + TD;
                    E TG = TD - TA;

                    cr[WS(rs, 2)] = FNMS(Tx, TE, Tn * Tu);
                    ci[WS(rs, 2)] = FMA(Tn, TE, Tx * Tu);
                    cr[WS(rs, 3)] = FNMS(Tl, TG, Ti * TF);
                    ci[WS(rs, 3)] = FMA(Ti, TG, Tl * TF);
               }

               /* Slot 1: u * (TJ, TM);  slot 4: (TN, TP) * (TO, TQ). */
               {
                    E TH = Tp + To;
                    E TJ = TH - TI;
                    E TO = TH + TI;
                    E TL = TC + TB;
                    E TM = TK + TL;
                    E TQ = TL - TK;

                    cr[WS(rs, 1)] = FNMS(Tk, TM, Th * TJ);
                    ci[WS(rs, 1)] = FMA(Th, TM, Tk * TJ);
                    cr[WS(rs, 4)] = FNMS(TP, TQ, TN * TO);
                    ci[WS(rs, 4)] = FMA(TN, TQ, TP * TO);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 5, "hb2_5", twinstr, &GENUS, { 30, 18, 14, 0 } };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hb2_5 kernel and its descriptor
 * to X(khc2hc_register) for planner p.
 */
void X(codelet_hb2_5) (planner *p)
{
     X(khc2hc_register) (p, hb2_5, &desc);
}
|
||||
#endif
|
||||
387
fftw-3.3.10/rdft/scalar/r2cb/hb2_8.c
Normal file
387
fftw-3.3.10/rdft/scalar/r2cb/hb2_8.c
Normal file
@@ -0,0 +1,387 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:55 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 50 FP multiplications,
|
||||
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
|
||||
* 47 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb2_8 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the kernel reads cr[WS(rs, k)] and ci[WS(rs, k)]
 * for k = 0..7 and overwrites the same sixteen locations.  Slot 0 receives
 * plain sums; slots 1..7 each receive a complex-style product
 * (a * x - b * y, a * y + b * x) with a twiddle pair (a, b) that is read
 * from, or derived from, W.  After every iteration cr advances by ms, ci
 * retreats by ms and W advances by 6 values.
 */
static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;

          /* Six twiddle values are consumed per iteration; start at the group for mb. */
          W += (mb - 1) * 6;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               /*
                * Stored twiddle pairs:
                *   u = (Tf, Ti) = (W[0], W[1])
                *   v = (Tg, Tj) = (W[2], W[3])
                *   w = (Tl, Tp) = (W[4], W[5])
                * Derived pairs:
                *   (Tk, To)   = conj(u) * v
                *   (T1b, T1e) = u * v
                *   (T1o, T1u) = conj(u) * w
                *   (Tq, TK)   = conj((Tk, To)) * w
                */
               E Tf = W[0];
               E Tg = W[2];
               E Th = Tf * Tg;
               E Tl = W[4];
               E T1n = Tf * Tl;
               E Tp = W[5];
               E T1t = Tf * Tp;
               E Ti = W[1];
               E Tj = W[3];
               E Tn = Tf * Tj;
               E Tk = FMA(Ti, Tj, Th);
               E T1b = FNMS(Ti, Tj, Th);
               E T1u = FNMS(Ti, Tl, T1t);
               E T1e = FMA(Ti, Tg, Tn);
               E T1o = FMA(Ti, Tp, T1n);
               E Tm = Tk * Tl;
               E TJ = Tk * Tp;
               E To = FNMS(Ti, Tg, Tn);
               E Tq = FMA(To, Tp, Tm);
               E TK = FNMS(To, Tl, TJ);
               E T7, T1p, T1v, Tv, TP, T13, T1h, TZ, Te, T1k, T1w, T1q, TQ, TR, T10;
               E TG, T14;

               /* Inputs cr[0], cr[2], cr[4], cr[6] and ci[1], ci[3], ci[5], ci[7]. */
               {
                    E T3, Tr, TO, T1f, T6, TL, Tu, T1g;

                    {
                         E T1 = cr[0], T2 = ci[WS(rs, 3)];
                         E TM = ci[WS(rs, 7)], TN = cr[WS(rs, 4)];

                         T3 = T1 + T2;
                         Tr = T1 - T2;
                         TO = TM + TN;
                         T1f = TM - TN;
                    }
                    {
                         E T4 = cr[WS(rs, 2)], T5 = ci[WS(rs, 1)];
                         E Ts = ci[WS(rs, 5)], Tt = cr[WS(rs, 6)];

                         T6 = T4 + T5;
                         TL = T4 - T5;
                         Tu = Ts + Tt;
                         T1g = Ts - Tt;
                    }
                    T7 = T3 + T6;
                    T1p = T3 - T6;
                    T1v = T1f - T1g;
                    Tv = Tr - Tu;
                    TP = TL + TO;
                    T13 = TO - TL;
                    T1h = T1f + T1g;
                    TZ = Tr + Tu;
               }

               /* Inputs cr[1], cr[3], cr[5], cr[7] and ci[0], ci[2], ci[4], ci[6]. */
               {
                    E Ta, Tw, TE, T1j, Td, TB, Tz, T1i, TA, TF;

                    {
                         E T8 = cr[WS(rs, 1)], T9 = ci[WS(rs, 2)];
                         E TC = ci[WS(rs, 4)], TD = cr[WS(rs, 7)];

                         Ta = T8 + T9;
                         Tw = T8 - T9;
                         TE = TC + TD;
                         T1j = TC - TD;
                    }
                    {
                         E Tb = ci[0], Tc = cr[WS(rs, 3)];
                         E Tx = ci[WS(rs, 6)], Ty = cr[WS(rs, 5)];

                         Td = Tb + Tc;
                         TB = Tb - Tc;
                         Tz = Tx + Ty;
                         T1i = Tx - Ty;
                    }
                    Te = Ta + Td;
                    T1k = T1i + T1j;
                    T1w = Ta - Td;
                    T1q = T1j - T1i;
                    TQ = Tw + Tz;
                    TR = TB + TE;
                    T10 = TQ + TR;
                    TA = Tw - Tz;
                    TF = TB - TE;
                    TG = TA + TF;
                    T14 = TA - TF;
               }

               /* Every input has been read; slot 0 is stored untwiddled. */
               cr[0] = T7 + Te;
               ci[0] = T1h + T1k;

               /* Slot 3: v * (T11, T15). */
               {
                    E T11 = FNMS(KP707106781, T10, TZ);
                    E T12 = Tg * T11;
                    E T15 = FMA(KP707106781, T14, T13);
                    E T16 = Tg * T15;

                    cr[WS(rs, 3)] = FNMS(Tj, T15, T12);
                    ci[WS(rs, 3)] = FMA(Tj, T11, T16);
               }

               /* Slot 2: (Tk, To) * (T1z, T1B). */
               {
                    E T1z = T1p + T1q;
                    E T1A = Tk * T1z;
                    E T1B = T1w + T1v;
                    E T1C = Tk * T1B;

                    cr[WS(rs, 2)] = FNMS(To, T1B, T1A);
                    ci[WS(rs, 2)] = FMA(To, T1z, T1C);
               }

               /* Slot 7: w * (T17, T19). */
               {
                    E T17 = FMA(KP707106781, T10, TZ);
                    E T18 = Tl * T17;
                    E T19 = FNMS(KP707106781, T14, T13);
                    E T1a = Tl * T19;

                    cr[WS(rs, 7)] = FNMS(Tp, T19, T18);
                    ci[WS(rs, 7)] = FMA(Tp, T17, T1a);
               }

               /* Slot 4: (T1b, T1e) * (T1c, T1l). */
               {
                    E T1l = T1h - T1k;
                    E T1c = T7 - Te;
                    E T1d = T1b * T1c;
                    E T1m = T1e * T1c;

                    cr[WS(rs, 4)] = FNMS(T1e, T1l, T1d);
                    ci[WS(rs, 4)] = FMA(T1b, T1l, T1m);
               }

               /* Slot 6: (T1o, T1u) * (T1r, T1x). */
               {
                    E T1r = T1p - T1q;
                    E T1s = T1o * T1r;
                    E T1x = T1v - T1w;
                    E T1y = T1o * T1x;

                    cr[WS(rs, 6)] = FNMS(T1u, T1x, T1s);
                    ci[WS(rs, 6)] = FMA(T1u, T1r, T1y);
               }

               /* Slot 5: (Tq, TK) * (TH, TT);  slot 1: u * (TV, TX). */
               {
                    E TS = TQ - TR;
                    E TT = FNMS(KP707106781, TS, TP);
                    E TX = FMA(KP707106781, TS, TP);
                    E TV = FMA(KP707106781, TG, Tv);
                    E TW = Tf * TV;
                    E TY = Ti * TV;
                    E TH = FNMS(KP707106781, TG, Tv);
                    E TI = Tq * TH;
                    E TU = TK * TH;

                    cr[WS(rs, 5)] = FNMS(TK, TT, TI);
                    ci[WS(rs, 5)] = FMA(Tq, TT, TU);
                    cr[WS(rs, 1)] = FNMS(Ti, TX, TW);
                    ci[WS(rs, 1)] = FMA(Tf, TX, TY);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, { 44, 20, 30, 0 } };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hb2_8 kernel and its descriptor
 * to X(khc2hc_register) for planner p.
 */
void X(codelet_hb2_8) (planner *p)
{
     X(khc2hc_register) (p, hb2_8, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 44 FP multiplications,
|
||||
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
|
||||
* 46 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb2_8 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the kernel reads cr[WS(rs, k)] and ci[WS(rs, k)]
 * for k = 0..7 and overwrites the same sixteen locations.  Slot 0 receives
 * plain sums; slots 1..7 each receive a complex-style product
 * (a * x - b * y, a * y + b * x) with a twiddle pair (a, b) that is read
 * from, or derived from, W.  After every iteration cr advances by ms, ci
 * retreats by ms and W advances by 6 values.
 */
static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;

          /* Six twiddle values are consumed per iteration; start at the group for mb. */
          W += (mb - 1) * 6;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               /*
                * Stored twiddle pairs:
                *   u = (Tf, Ti) = (W[0], W[1])
                *   v = (Tg, Tj) = (W[2], W[3])
                *   w = (TF, TG) = (W[4], W[5])
                * Derived pairs:
                *   (Tl, Tp)  = u * v
                *   (TP, TR)  = conj(u) * v
                *   (TH, TL)  = conj(u) * w
                *   (TT, T15) = conj((TP, TR)) * w
                */
               E Tf = W[0];
               E Ti = W[1];
               E Tg = W[2];
               E Tj = W[3];
               E Th = Tf * Tg;
               E To = Ti * Tg;
               E Tk = Ti * Tj;
               E Tn = Tf * Tj;
               E Tl = Th - Tk;
               E Tp = Tn + To;
               E TP = Th + Tk;
               E TR = Tn - To;
               E TF = W[4];
               E TG = W[5];
               E TH = FMA(Tf, TF, Ti * TG);
               E T15 = FNMS(TR, TF, TP * TG);
               E TL = FNMS(Ti, TF, Tf * TG);
               E TT = FMA(TP, TF, TR * TG);
               E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
               E TN, Tm, TE;

               /* Inputs cr[0], cr[2], cr[4], cr[6] and ci[1], ci[3], ci[5], ci[7]. */
               {
                    E T3, TU, Tv, TV, T6, T16, Ts, T17;

                    {
                         E T1 = cr[0], T2 = ci[WS(rs, 3)];
                         E Tt = ci[WS(rs, 5)], Tu = cr[WS(rs, 6)];

                         T3 = T1 + T2;
                         TU = T1 - T2;
                         Tv = Tt - Tu;
                         TV = Tt + Tu;
                    }
                    {
                         E T4 = cr[WS(rs, 2)], T5 = ci[WS(rs, 1)];
                         E Tq = ci[WS(rs, 7)], Tr = cr[WS(rs, 4)];

                         T6 = T4 + T5;
                         T16 = T4 - T5;
                         Ts = Tq - Tr;
                         T17 = Tq + Tr;
                    }
                    T7 = T3 + T6;
                    T1f = TU + TV;
                    T1i = T17 - T16;
                    Tw = Ts + Tv;
                    TI = T3 - T6;
                    TW = TU - TV;
                    T18 = T16 + T17;
                    TM = Ts - Tv;
               }

               /* Inputs cr[1], cr[3], cr[5], cr[7] and ci[0], ci[2], ci[4], ci[6]. */
               {
                    E Ta, TX, TC, T11, Td, T10, Tz, TY;

                    {
                         E T8 = cr[WS(rs, 1)], T9 = ci[WS(rs, 2)];
                         E TA = ci[WS(rs, 4)], TB = cr[WS(rs, 7)];

                         Ta = T8 + T9;
                         TX = T8 - T9;
                         TC = TA - TB;
                         T11 = TA + TB;
                    }
                    {
                         E Tb = ci[0], Tc = cr[WS(rs, 3)];
                         E Tx = ci[WS(rs, 6)], Ty = cr[WS(rs, 5)];

                         Td = Tb + Tc;
                         T10 = Tb - Tc;
                         Tz = Tx - Ty;
                         TY = Tx + Ty;
                    }
                    Te = Ta + Td;
                    T19 = TX + TY;
                    T1a = T10 + T11;
                    TD = Tz + TC;
                    TJ = TC - Tz;
                    TZ = TX - TY;
                    T12 = T10 - T11;
                    TN = Ta - Td;
               }

               /* Every input has been read; slot 0 is stored untwiddled. */
               cr[0] = T7 + Te;
               ci[0] = Tw + TD;

               /* Slot 4: (Tl, Tp) * (Tm, TE). */
               Tm = T7 - Te;
               TE = Tw - TD;
               cr[WS(rs, 4)] = FNMS(Tp, TE, Tl * Tm);
               ci[WS(rs, 4)] = FMA(Tp, Tm, Tl * TE);

               /* Slot 2: (TP, TR) * (TQ, TS);  slot 6: (TH, TL) * (TK, TO). */
               {
                    E TQ = TI + TJ;
                    E TS = TN + TM;
                    E TK, TO;

                    cr[WS(rs, 2)] = FNMS(TR, TS, TP * TQ);
                    ci[WS(rs, 2)] = FMA(TP, TS, TR * TQ);
                    TK = TI - TJ;
                    TO = TM - TN;
                    cr[WS(rs, 6)] = FNMS(TL, TO, TH * TK);
                    ci[WS(rs, 6)] = FMA(TH, TO, TL * TK);
               }

               /* Slot 3: v * (T1h, T1k);  slot 7: w * (T1l, T1m). */
               {
                    E T1g = KP707106781 * (T19 + T1a);
                    E T1h = T1f - T1g;
                    E T1l = T1f + T1g;
                    E T1j = KP707106781 * (TZ - T12);
                    E T1k = T1i + T1j;
                    E T1m = T1i - T1j;

                    cr[WS(rs, 3)] = FNMS(Tj, T1k, Tg * T1h);
                    ci[WS(rs, 3)] = FMA(Tg, T1k, Tj * T1h);
                    cr[WS(rs, 7)] = FNMS(TG, T1m, TF * T1l);
                    ci[WS(rs, 7)] = FMA(TF, T1m, TG * T1l);
               }

               /* Slot 5: (TT, T15) * (T14, T1c);  slot 1: u * (T1d, T1e). */
               {
                    E T13 = KP707106781 * (TZ + T12);
                    E T14 = TW - T13;
                    E T1d = TW + T13;
                    E T1b = KP707106781 * (T19 - T1a);
                    E T1c = T18 - T1b;
                    E T1e = T18 + T1b;

                    cr[WS(rs, 5)] = FNMS(T15, T1c, TT * T14);
                    ci[WS(rs, 5)] = FMA(T15, T14, TT * T1c);
                    cr[WS(rs, 1)] = FNMS(Ti, T1e, Tf * T1d);
                    ci[WS(rs, 1)] = FMA(Ti, T1d, Tf * T1e);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, { 56, 26, 18, 0 } };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hb2_8 kernel and its descriptor
 * to X(khc2hc_register) for planner p.
 */
void X(codelet_hb2_8) (planner *p)
{
     X(khc2hc_register) (p, hb2_8, &desc);
}
|
||||
#endif
|
||||
513
fftw-3.3.10/rdft/scalar/r2cb/hb_10.c
Normal file
513
fftw-3.3.10/rdft/scalar/r2cb/hb_10.c
Normal file
@@ -0,0 +1,513 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hb_10 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 72 FP multiplications,
|
||||
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
|
||||
* 47 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_10 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the kernel reads cr[WS(rs, k)] and ci[WS(rs, k)]
 * for k = 0..9 and overwrites the same twenty locations.  Slot 0 receives
 * plain sums.  Every slot k = 1..9 receives a complex-style product
 * (a * x - b * y, a * y + b * x) whose twiddle pair (a, b) is read directly
 * from (W[2k - 2], W[2k - 1]).  After every iteration cr advances by ms, ci
 * retreats by ms and W advances by 18 values.
 */
static void hb_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     {
          INT m;

          /* Eighteen twiddle values are consumed per iteration; start at the group for mb. */
          W += (mb - 1) * 18;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 18, MAKE_VOLATILE_STRIDE(20, rs)) {
               E TF = ci[WS(rs, 9)];
               E TG = cr[WS(rs, 5)];
               E TH = TF - TG;
               E T1B = TF + TG;
               E TB, T11, T1E, T1G, TK, TM, T1x, T1V, T3, T1g, Tl, T1I, T1J;
               E TO, TP, T1p, Ti, Tk, T1n, T1o;

               /* Inputs cr[6..9] and ci[5..8]. */
               {
                    E Tp, T1u, Tz, T1s, Ts, T1v, Tw, T1r;

                    {
                         E Tn = ci[WS(rs, 5)], To = cr[WS(rs, 9)];
                         E Tx = ci[WS(rs, 6)], Ty = cr[WS(rs, 8)];

                         Tp = Tn - To;
                         T1u = Tn + To;
                         Tz = Tx - Ty;
                         T1s = Tx + Ty;
                    }
                    {
                         E Tq = ci[WS(rs, 8)], Tr = cr[WS(rs, 6)];
                         E Tu = ci[WS(rs, 7)], Tv = cr[WS(rs, 7)];

                         Ts = Tq - Tr;
                         T1v = Tq + Tr;
                         Tw = Tu - Tv;
                         T1r = Tu + Tv;
                    }
                    {
                         E Tt = Tp - Ts;
                         E TA = Tw - Tz;
                         E T1C, T1D;

                         TB = FNMS(KP618033988, TA, Tt);
                         T11 = FMA(KP618033988, Tt, TA);
                         T1C = T1r - T1s;
                         T1D = T1u - T1v;
                         T1E = T1C + T1D;
                         T1G = T1C - T1D;
                    }
                    {
                         E TI = Tw + Tz;
                         E TJ = Tp + Ts;
                         E T1t, T1w;

                         TK = TI + TJ;
                         TM = TI - TJ;
                         T1t = T1r + T1s;
                         T1w = T1u + T1v;
                         T1x = FMA(KP618033988, T1w, T1t);
                         T1V = FNMS(KP618033988, T1t, T1w);
                    }
               }

               /* Inputs cr[0..4] and ci[0..4]. */
               {
                    E T1 = cr[0], T2 = ci[WS(rs, 4)];
                    E Td, T1k, Tg, T1l, Th, T1m, T6, T1h, T9, T1i, Ta, T1j;

                    T3 = T1 + T2;
                    T1g = T1 - T2;
                    {
                         E Tb = cr[WS(rs, 4)], Tc = ci[0];
                         E Te = ci[WS(rs, 3)], Tf = cr[WS(rs, 1)];

                         Td = Tb + Tc;
                         T1k = Tb - Tc;
                         Tg = Te + Tf;
                         T1l = Te - Tf;
                    }
                    Th = Td + Tg;
                    T1m = T1k + T1l;
                    {
                         E T4 = cr[WS(rs, 2)], T5 = ci[WS(rs, 2)];
                         E T7 = ci[WS(rs, 1)], T8 = cr[WS(rs, 3)];

                         T6 = T4 + T5;
                         T1h = T4 - T5;
                         T9 = T7 + T8;
                         T1i = T7 - T8;
                    }
                    Ta = T6 + T9;
                    T1j = T1h + T1i;
                    Tl = Ta - Th;
                    T1I = T1h - T1i;
                    T1J = T1k - T1l;
                    TO = Td - Tg;
                    TP = T6 - T9;
                    T1p = T1j - T1m;
                    Ti = Ta + Th;
                    Tk = FNMS(KP250000000, Ti, T3);
                    T1n = T1j + T1m;
                    T1o = FNMS(KP250000000, T1n, T1g);
               }

               /* Every input has been read; slot 0 is stored untwiddled. */
               cr[0] = T3 + Ti;
               ci[0] = TH + TK;

               /* Slot 5: (W[8], W[9]) * (T2a, T2d). */
               {
                    E T2d = T1B + T1E;
                    E T2a = T1g + T1n;
                    E T29 = W[8];
                    E T2b = T29 * T2a;
                    E T2c = W[9];
                    E T2e = T2c * T2a;

                    cr[WS(rs, 5)] = FNMS(T2c, T2d, T2b);
                    ci[WS(rs, 5)] = FMA(T29, T2d, T2e);
               }

               /* Slots 2, 6, 8 and 4. */
               {
                    E TQ = FNMS(KP618033988, TP, TO);
                    E T16 = FMA(KP618033988, TO, TP);
                    E Tm = FNMS(KP559016994, Tl, Tk);
                    E TC = FMA(KP951056516, TB, Tm);
                    E TU = FNMS(KP951056516, TB, Tm);
                    E TL = FNMS(KP250000000, TK, TH);
                    E TN = FNMS(KP559016994, TM, TL);
                    E T15 = FMA(KP559016994, TM, TL);
                    E T10 = FMA(KP559016994, Tl, Tk);
                    E T12 = FMA(KP951056516, T11, T10);
                    E T1a = FNMS(KP951056516, T11, T10);

                    /* Slot 2: (W[2], W[3]) * (TC, TR). */
                    {
                         E TR = FNMS(KP951056516, TQ, TN);
                         E TE = W[3];
                         E TS = TE * TC;
                         E Tj = W[2];
                         E TD = Tj * TC;

                         cr[WS(rs, 2)] = FNMS(TE, TR, TD);
                         ci[WS(rs, 2)] = FMA(Tj, TR, TS);
                    }
                    /* Slot 6: (W[10], W[11]) * (T1a, T1d). */
                    {
                         E T1d = FMA(KP951056516, T16, T15);
                         E T1c = W[11];
                         E T1e = T1c * T1a;
                         E T19 = W[10];
                         E T1b = T19 * T1a;

                         cr[WS(rs, 6)] = FNMS(T1c, T1d, T1b);
                         ci[WS(rs, 6)] = FMA(T19, T1d, T1e);
                    }
                    /* Slot 8: (W[14], W[15]) * (TU, TX). */
                    {
                         E TX = FMA(KP951056516, TQ, TN);
                         E TW = W[15];
                         E TY = TW * TU;
                         E TT = W[14];
                         E TV = TT * TU;

                         cr[WS(rs, 8)] = FNMS(TW, TX, TV);
                         ci[WS(rs, 8)] = FMA(TT, TX, TY);
                    }
                    /* Slot 4: (W[6], W[7]) * (T12, T17). */
                    {
                         E T17 = FNMS(KP951056516, T16, T15);
                         E T14 = W[7];
                         E T18 = T14 * T12;
                         E TZ = W[6];
                         E T13 = TZ * T12;

                         cr[WS(rs, 4)] = FNMS(T14, T17, T13);
                         ci[WS(rs, 4)] = FMA(TZ, T17, T18);
                    }
               }

               /* Slots 1, 7, 9 and 3. */
               {
                    E T1K = FMA(KP618033988, T1J, T1I);
                    E T20 = FNMS(KP618033988, T1I, T1J);
                    E T1q = FMA(KP559016994, T1p, T1o);
                    E T1y = FNMS(KP951056516, T1x, T1q);
                    E T1O = FMA(KP951056516, T1x, T1q);
                    E T1F = FNMS(KP250000000, T1E, T1B);
                    E T1H = FMA(KP559016994, T1G, T1F);
                    E T1Z = FNMS(KP559016994, T1G, T1F);
                    E T1U = FNMS(KP559016994, T1p, T1o);
                    E T1W = FNMS(KP951056516, T1V, T1U);
                    E T24 = FMA(KP951056516, T1V, T1U);

                    /* Slot 1: (W[0], W[1]) * (T1y, T1L). */
                    {
                         E T1L = FMA(KP951056516, T1K, T1H);
                         E T1A = W[1];
                         E T1M = T1A * T1y;
                         E T1f = W[0];
                         E T1z = T1f * T1y;

                         cr[WS(rs, 1)] = FNMS(T1A, T1L, T1z);
                         ci[WS(rs, 1)] = FMA(T1f, T1L, T1M);
                    }
                    /* Slot 7: (W[12], W[13]) * (T24, T27). */
                    {
                         E T27 = FNMS(KP951056516, T20, T1Z);
                         E T26 = W[13];
                         E T28 = T26 * T24;
                         E T23 = W[12];
                         E T25 = T23 * T24;

                         cr[WS(rs, 7)] = FNMS(T26, T27, T25);
                         ci[WS(rs, 7)] = FMA(T23, T27, T28);
                    }
                    /* Slot 9: (W[16], W[17]) * (T1O, T1R). */
                    {
                         E T1R = FNMS(KP951056516, T1K, T1H);
                         E T1Q = W[17];
                         E T1S = T1Q * T1O;
                         E T1N = W[16];
                         E T1P = T1N * T1O;

                         cr[WS(rs, 9)] = FNMS(T1Q, T1R, T1P);
                         ci[WS(rs, 9)] = FMA(T1N, T1R, T1S);
                    }
                    /* Slot 3: (W[4], W[5]) * (T1W, T21). */
                    {
                         E T21 = FMA(KP951056516, T20, T1Z);
                         E T1Y = W[5];
                         E T22 = T1Y * T1W;
                         E T1T = W[4];
                         E T1X = T1T * T1W;

                         cr[WS(rs, 3)] = FNMS(T1Y, T21, T1X);
                         ci[WS(rs, 3)] = FMA(T1T, T21, T22);
                    }
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 10, "hb_10", twinstr, &GENUS, { 48, 18, 54, 0 } };
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hb_10 kernel and its descriptor
 * to X(khc2hc_register) for planner p.
 */
void X(codelet_hb_10) (planner *p)
{
     X(khc2hc_register) (p, hb_10, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hb_10 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 60 FP multiplications,
|
||||
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 41 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_10: radix-10 twiddle codelet emitted by gen_hc2hc (-sign 1, -dif),
 * generated without -fma.
 *
 * For every m in [mb, me) the loop reads the ten values cr[WS(rs, k)] and the
 * ten values ci[WS(rs, k)], k = 0..9, and overwrites those same twenty slots.
 * Slot 0 is stored directly; each slot k >= 1 is stored after a complex
 * multiply by the twiddle pair (W[2k-2], W[2k-1]).
 *
 * cr, ci: data pointers; cr advances by +ms and ci by -ms per iteration.
 * W:      twiddle table; offset by (mb - 1) * 18 on entry, then 18 per iteration.
 * rs:     stride handed to WS() to address element k.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm.
 */
static void hb_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T3, T18, TE, TF, T1B, T1A, T1f, T1t, Ti, Tl, TJ, T1i, Tt, TA, T1w;
|
||||
E T1v, T1p, T1E, TM, TO;
|
||||
/* Load all twenty inputs and form pairwise sums and differences. */
{
|
||||
E T1, T2, TH, TI;
|
||||
T1 = cr[0];
|
||||
T2 = ci[WS(rs, 4)];
|
||||
T3 = T1 + T2;
|
||||
T18 = T1 - T2;
|
||||
{
|
||||
E T6, T19, Tg, T1d, T9, T1a, Td, T1c;
|
||||
{
|
||||
E T4, T5, Te, Tf;
|
||||
T4 = cr[WS(rs, 2)];
|
||||
T5 = ci[WS(rs, 2)];
|
||||
T6 = T4 + T5;
|
||||
T19 = T4 - T5;
|
||||
Te = ci[WS(rs, 3)];
|
||||
Tf = cr[WS(rs, 1)];
|
||||
Tg = Te + Tf;
|
||||
T1d = Te - Tf;
|
||||
}
|
||||
{
|
||||
E T7, T8, Tb, Tc;
|
||||
T7 = ci[WS(rs, 1)];
|
||||
T8 = cr[WS(rs, 3)];
|
||||
T9 = T7 + T8;
|
||||
T1a = T7 - T8;
|
||||
Tb = cr[WS(rs, 4)];
|
||||
Tc = ci[0];
|
||||
Td = Tb + Tc;
|
||||
T1c = Tb - Tc;
|
||||
}
|
||||
TE = T6 - T9;
|
||||
TF = Td - Tg;
|
||||
T1B = T1c - T1d;
|
||||
T1A = T19 - T1a;
|
||||
{
|
||||
E T1b, T1e, Ta, Th;
|
||||
T1b = T19 + T1a;
|
||||
T1e = T1c + T1d;
|
||||
T1f = T1b + T1e;
|
||||
T1t = KP559016994 * (T1b - T1e);
|
||||
Ta = T6 + T9;
|
||||
Th = Td + Tg;
|
||||
Ti = Ta + Th;
|
||||
Tl = KP559016994 * (Ta - Th);
|
||||
}
|
||||
}
|
||||
TH = ci[WS(rs, 9)];
|
||||
TI = cr[WS(rs, 5)];
|
||||
TJ = TH - TI;
|
||||
T1i = TH + TI;
|
||||
{
|
||||
E Tp, T1j, Tz, T1n, Ts, T1k, Tw, T1m;
|
||||
{
|
||||
E Tn, To, Tx, Ty;
|
||||
Tn = ci[WS(rs, 7)];
|
||||
To = cr[WS(rs, 7)];
|
||||
Tp = Tn - To;
|
||||
T1j = Tn + To;
|
||||
Tx = ci[WS(rs, 8)];
|
||||
Ty = cr[WS(rs, 6)];
|
||||
Tz = Tx - Ty;
|
||||
T1n = Tx + Ty;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, Tu, Tv;
|
||||
Tq = ci[WS(rs, 6)];
|
||||
Tr = cr[WS(rs, 8)];
|
||||
Ts = Tq - Tr;
|
||||
T1k = Tq + Tr;
|
||||
Tu = ci[WS(rs, 5)];
|
||||
Tv = cr[WS(rs, 9)];
|
||||
Tw = Tu - Tv;
|
||||
T1m = Tu + Tv;
|
||||
}
|
||||
Tt = Tp - Ts;
|
||||
TA = Tw - Tz;
|
||||
T1w = T1m + T1n;
|
||||
T1v = T1j + T1k;
|
||||
{
|
||||
E T1l, T1o, TK, TL;
|
||||
T1l = T1j - T1k;
|
||||
T1o = T1m - T1n;
|
||||
T1p = T1l + T1o;
|
||||
T1E = KP559016994 * (T1l - T1o);
|
||||
TK = Tp + Ts;
|
||||
TL = Tw + Tz;
|
||||
TM = TK + TL;
|
||||
TO = KP559016994 * (TK - TL);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Slot 0 is written without a twiddle multiply. */
cr[0] = T3 + Ti;
|
||||
ci[0] = TJ + TM;
|
||||
/* Slot 5: twiddle pair W[8], W[9]. */
{
|
||||
E T1g, T1q, T17, T1h;
|
||||
T1g = T18 + T1f;
|
||||
T1q = T1i + T1p;
|
||||
T17 = W[8];
|
||||
T1h = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(T1h, T1q, T17 * T1g);
|
||||
ci[WS(rs, 5)] = FMA(T1h, T1g, T17 * T1q);
|
||||
}
|
||||
/* Even slots 2, 6, 8 and 4. */
{
|
||||
E TB, TG, T11, TX, TP, T10, Tm, TW, TN, Tk;
|
||||
TB = FNMS(KP951056516, TA, KP587785252 * Tt);
|
||||
TG = FNMS(KP951056516, TF, KP587785252 * TE);
|
||||
T11 = FMA(KP951056516, TE, KP587785252 * TF);
|
||||
TX = FMA(KP951056516, Tt, KP587785252 * TA);
|
||||
TN = FNMS(KP250000000, TM, TJ);
|
||||
TP = TN - TO;
|
||||
T10 = TO + TN;
|
||||
Tk = FNMS(KP250000000, Ti, T3);
|
||||
Tm = Tk - Tl;
|
||||
TW = Tl + Tk;
|
||||
{
|
||||
E TC, TQ, Tj, TD;
|
||||
TC = Tm - TB;
|
||||
TQ = TG + TP;
|
||||
Tj = W[2];
|
||||
TD = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(TD, TQ, Tj * TC);
|
||||
ci[WS(rs, 2)] = FMA(TD, TC, Tj * TQ);
|
||||
}
|
||||
{
|
||||
E T14, T16, T13, T15;
|
||||
T14 = TW - TX;
|
||||
T16 = T11 + T10;
|
||||
T13 = W[10];
|
||||
T15 = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(T15, T16, T13 * T14);
|
||||
ci[WS(rs, 6)] = FMA(T15, T14, T13 * T16);
|
||||
}
|
||||
{
|
||||
E TS, TU, TR, TT;
|
||||
TS = Tm + TB;
|
||||
TU = TP - TG;
|
||||
TR = W[14];
|
||||
TT = W[15];
|
||||
cr[WS(rs, 8)] = FNMS(TT, TU, TR * TS);
|
||||
ci[WS(rs, 8)] = FMA(TT, TS, TR * TU);
|
||||
}
|
||||
{
|
||||
E TY, T12, TV, TZ;
|
||||
TY = TW + TX;
|
||||
T12 = T10 - T11;
|
||||
TV = W[6];
|
||||
TZ = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(TZ, T12, TV * TY);
|
||||
ci[WS(rs, 4)] = FMA(TZ, TY, TV * T12);
|
||||
}
|
||||
}
|
||||
/* Odd slots 7, 9, 3 and 1. */
{
|
||||
E T1x, T1C, T1Q, T1N, T1F, T1R, T1u, T1M, T1D, T1s;
|
||||
T1x = FNMS(KP951056516, T1w, KP587785252 * T1v);
|
||||
T1C = FNMS(KP951056516, T1B, KP587785252 * T1A);
|
||||
T1Q = FMA(KP951056516, T1A, KP587785252 * T1B);
|
||||
T1N = FMA(KP951056516, T1v, KP587785252 * T1w);
|
||||
T1D = FNMS(KP250000000, T1p, T1i);
|
||||
T1F = T1D - T1E;
|
||||
T1R = T1E + T1D;
|
||||
T1s = FNMS(KP250000000, T1f, T18);
|
||||
T1u = T1s - T1t;
|
||||
T1M = T1t + T1s;
|
||||
{
|
||||
E T1y, T1G, T1r, T1z;
|
||||
T1y = T1u - T1x;
|
||||
T1G = T1C + T1F;
|
||||
T1r = W[12];
|
||||
T1z = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T1z, T1G, T1r * T1y);
|
||||
ci[WS(rs, 7)] = FMA(T1r, T1G, T1z * T1y);
|
||||
}
|
||||
{
|
||||
E T1U, T1W, T1T, T1V;
|
||||
T1U = T1M + T1N;
|
||||
T1W = T1R - T1Q;
|
||||
T1T = W[16];
|
||||
T1V = W[17];
|
||||
cr[WS(rs, 9)] = FNMS(T1V, T1W, T1T * T1U);
|
||||
ci[WS(rs, 9)] = FMA(T1T, T1W, T1V * T1U);
|
||||
}
|
||||
{
|
||||
E T1I, T1K, T1H, T1J;
|
||||
T1I = T1u + T1x;
|
||||
T1K = T1F - T1C;
|
||||
T1H = W[4];
|
||||
T1J = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(T1J, T1K, T1H * T1I);
|
||||
ci[WS(rs, 3)] = FMA(T1H, T1K, T1J * T1I);
|
||||
}
|
||||
{
|
||||
E T1O, T1S, T1L, T1P;
|
||||
T1O = T1M - T1N;
|
||||
T1S = T1Q + T1R;
|
||||
T1L = W[0];
|
||||
T1P = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(T1P, T1S, T1L * T1O);
|
||||
ci[WS(rs, 1)] = FMA(T1L, T1S, T1P * T1O);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hb_10 descriptor.
 * NOTE(review): presumably TW_FULL with 10 requests the full twiddle set for
 * radix 10 and TW_NEXT terminates the list -- confirm against the tw_instr
 * definition, which is not visible here. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor: radix 10, name "hb_10", twiddle table, GENUS, and the
 * tuple 72 / 30 / 30 / 0, whose first three numbers match the additions,
 * multiplications and fused multiply/adds reported in the generator comment
 * for this variant. */
static const hc2hc_desc desc = { 10, "hb_10", twinstr, &GENUS, { 72, 30, 30, 0 } };
|
||||
|
||||
/* Registration entry point: passes the hb_10 kernel and its descriptor to
 * khc2hc_register for planner p. */
void X(codelet_hb_10) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_10, &desc);
|
||||
}
|
||||
#endif
|
||||
597
fftw-3.3.10/rdft/scalar/r2cb/hb_12.c
Normal file
597
fftw-3.3.10/rdft/scalar/r2cb/hb_12.c
Normal file
@@ -0,0 +1,597 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hb_12 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 68 FP multiplications,
|
||||
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_12: radix-12 twiddle codelet emitted by gen_hc2hc (-sign 1, -dif),
 * generated with -fma.
 *
 * For every m in [mb, me) the loop reads the twelve values cr[WS(rs, k)] and
 * the twelve values ci[WS(rs, k)], k = 0..11, and overwrites those same
 * twenty-four slots. Slot 0 is stored directly; each slot k >= 1 is stored
 * after a complex multiply by the twiddle pair (W[2k-2], W[2k-1]).
 *
 * cr, ci: data pointers; cr advances by +ms and ci by -ms per iteration.
 * W:      twiddle table; offset by (mb - 1) * 22 on entry, then 22 per iteration.
 * rs:     stride handed to WS() to address element k.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm.
 */
static void hb_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T18, T20, T1b, T21, T1s, T2a, T1p, T29, TI, TN, TO, Tb, To, T1f, T23;
|
||||
E T1i, T24, T1z, T2d, T1w, T2c, Tt, Ty, Tz, Tm, TD;
|
||||
/* First half of the loads: cr slots 0, 2, 4, 6, 8, 10 and ci slots 1, 3, 5, 7, 9, 11. */
{
|
||||
E T1, TE, TM, T6, T4, T1o, TH, T17, TL, T1a, T9, T1r;
|
||||
T1 = cr[0];
|
||||
TE = ci[WS(rs, 11)];
|
||||
TM = cr[WS(rs, 6)];
|
||||
T6 = ci[WS(rs, 5)];
|
||||
{
|
||||
E T2, T3, TF, TG;
|
||||
T2 = cr[WS(rs, 4)];
|
||||
T3 = ci[WS(rs, 3)];
|
||||
T4 = T2 + T3;
|
||||
T1o = T2 - T3;
|
||||
TF = ci[WS(rs, 7)];
|
||||
TG = cr[WS(rs, 8)];
|
||||
TH = TF - TG;
|
||||
T17 = TF + TG;
|
||||
}
|
||||
{
|
||||
E TJ, TK, T7, T8;
|
||||
TJ = ci[WS(rs, 9)];
|
||||
TK = cr[WS(rs, 10)];
|
||||
TL = TJ - TK;
|
||||
T1a = TJ + TK;
|
||||
T7 = ci[WS(rs, 1)];
|
||||
T8 = cr[WS(rs, 2)];
|
||||
T9 = T7 + T8;
|
||||
T1r = T7 - T8;
|
||||
}
|
||||
{
|
||||
E T16, T19, T1q, T1n, T5, Ta;
|
||||
T16 = FNMS(KP500000000, T4, T1);
|
||||
T18 = FNMS(KP866025403, T17, T16);
|
||||
T20 = FMA(KP866025403, T17, T16);
|
||||
T19 = FNMS(KP500000000, T9, T6);
|
||||
T1b = FMA(KP866025403, T1a, T19);
|
||||
T21 = FNMS(KP866025403, T1a, T19);
|
||||
T1q = FMA(KP500000000, TL, TM);
|
||||
T1s = FNMS(KP866025403, T1r, T1q);
|
||||
T2a = FMA(KP866025403, T1r, T1q);
|
||||
T1n = FNMS(KP500000000, TH, TE);
|
||||
T1p = FMA(KP866025403, T1o, T1n);
|
||||
T29 = FNMS(KP866025403, T1o, T1n);
|
||||
TI = TE + TH;
|
||||
TN = TL - TM;
|
||||
TO = TI - TN;
|
||||
T5 = T1 + T4;
|
||||
Ta = T6 + T9;
|
||||
Tb = T5 + Ta;
|
||||
To = T5 - Ta;
|
||||
}
|
||||
}
|
||||
/* Second half of the loads: cr slots 1, 3, 5, 7, 9, 11 and ci slots 0, 2, 4, 6, 8, 10. */
{
|
||||
E Tc, Tp, Tx, Th, Tf, T1v, Ts, T1e, Tw, T1h, Tk, T1y;
|
||||
Tc = cr[WS(rs, 3)];
|
||||
Tp = ci[WS(rs, 8)];
|
||||
Tx = cr[WS(rs, 9)];
|
||||
Th = ci[WS(rs, 2)];
|
||||
{
|
||||
E Td, Te, Tq, Tr;
|
||||
Td = ci[WS(rs, 4)];
|
||||
Te = ci[0];
|
||||
Tf = Td + Te;
|
||||
T1v = Td - Te;
|
||||
Tq = cr[WS(rs, 7)];
|
||||
Tr = cr[WS(rs, 11)];
|
||||
Ts = Tq + Tr;
|
||||
T1e = Tq - Tr;
|
||||
}
|
||||
{
|
||||
E Tu, Tv, Ti, Tj;
|
||||
Tu = ci[WS(rs, 10)];
|
||||
Tv = ci[WS(rs, 6)];
|
||||
Tw = Tu + Tv;
|
||||
T1h = Tv - Tu;
|
||||
Ti = cr[WS(rs, 1)];
|
||||
Tj = cr[WS(rs, 5)];
|
||||
Tk = Ti + Tj;
|
||||
T1y = Ti - Tj;
|
||||
}
|
||||
{
|
||||
E T1d, T1g, T1x, T1u, Tg, Tl;
|
||||
T1d = FNMS(KP500000000, Tf, Tc);
|
||||
T1f = FMA(KP866025403, T1e, T1d);
|
||||
T23 = FNMS(KP866025403, T1e, T1d);
|
||||
T1g = FNMS(KP500000000, Tk, Th);
|
||||
T1i = FMA(KP866025403, T1h, T1g);
|
||||
T24 = FNMS(KP866025403, T1h, T1g);
|
||||
T1x = FMA(KP500000000, Tw, Tx);
|
||||
T1z = FNMS(KP866025403, T1y, T1x);
|
||||
T2d = FMA(KP866025403, T1y, T1x);
|
||||
T1u = FMA(KP500000000, Ts, Tp);
|
||||
T1w = FMA(KP866025403, T1v, T1u);
|
||||
T2c = FNMS(KP866025403, T1v, T1u);
|
||||
Tt = Tp - Ts;
|
||||
Ty = Tw - Tx;
|
||||
Tz = Tt - Ty;
|
||||
Tg = Tc + Tf;
|
||||
Tl = Th + Tk;
|
||||
Tm = Tg + Tl;
|
||||
TD = Tg - Tl;
|
||||
}
|
||||
}
|
||||
/* cr[0] is written without a twiddle multiply (ci[0] follows in the slot-6 block). */
cr[0] = Tb + Tm;
|
||||
/* Slot 9: twiddle pair W[16], W[17]. */
{
|
||||
E TA, TP, TB, TQ, Tn, TC;
|
||||
TA = To - Tz;
|
||||
TP = TD + TO;
|
||||
Tn = W[16];
|
||||
TB = Tn * TA;
|
||||
TQ = Tn * TP;
|
||||
TC = W[17];
|
||||
cr[WS(rs, 9)] = FNMS(TC, TP, TB);
|
||||
ci[WS(rs, 9)] = FMA(TC, TA, TQ);
|
||||
}
|
||||
/* Slot 3: twiddle pair W[4], W[5]. */
{
|
||||
E TS, TV, TT, TW, TR, TU;
|
||||
TS = To + Tz;
|
||||
TV = TO - TD;
|
||||
TR = W[4];
|
||||
TT = TR * TS;
|
||||
TW = TR * TV;
|
||||
TU = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(TU, TV, TT);
|
||||
ci[WS(rs, 3)] = FMA(TU, TS, TW);
|
||||
}
|
||||
/* ci[0] (no twiddle) and slot 6 with twiddle pair W[10], W[11]. */
{
|
||||
E T11, T12, T13, TX, TZ, T10, T14, TY;
|
||||
T11 = TI + TN;
|
||||
T12 = Tt + Ty;
|
||||
T13 = T11 - T12;
|
||||
TY = Tb - Tm;
|
||||
TX = W[10];
|
||||
TZ = TX * TY;
|
||||
T10 = W[11];
|
||||
T14 = T10 * TY;
|
||||
ci[0] = T11 + T12;
|
||||
ci[WS(rs, 6)] = FMA(TX, T13, T14);
|
||||
cr[WS(rs, 6)] = FNMS(T10, T13, TZ);
|
||||
}
|
||||
/* Slots 10 and 4. */
{
|
||||
E T1k, T1E, T1B, T1H;
|
||||
{
|
||||
E T1c, T1j, T1t, T1A;
|
||||
T1c = T18 + T1b;
|
||||
T1j = T1f + T1i;
|
||||
T1k = T1c - T1j;
|
||||
T1E = T1c + T1j;
|
||||
T1t = T1p - T1s;
|
||||
T1A = T1w - T1z;
|
||||
T1B = T1t - T1A;
|
||||
T1H = T1t + T1A;
|
||||
}
|
||||
{
|
||||
E T15, T1l, T1m, T1C;
|
||||
T15 = W[18];
|
||||
T1l = T15 * T1k;
|
||||
T1m = W[19];
|
||||
T1C = T1m * T1k;
|
||||
cr[WS(rs, 10)] = FNMS(T1m, T1B, T1l);
|
||||
ci[WS(rs, 10)] = FMA(T15, T1B, T1C);
|
||||
}
|
||||
{
|
||||
E T1D, T1F, T1G, T1I;
|
||||
T1D = W[6];
|
||||
T1F = T1D * T1E;
|
||||
T1G = W[7];
|
||||
T1I = T1G * T1E;
|
||||
cr[WS(rs, 4)] = FNMS(T1G, T1H, T1F);
|
||||
ci[WS(rs, 4)] = FMA(T1D, T1H, T1I);
|
||||
}
|
||||
}
|
||||
/* Slots 2 and 8. */
{
|
||||
E T26, T2i, T2f, T2l;
|
||||
{
|
||||
E T22, T25, T2b, T2e;
|
||||
T22 = T20 + T21;
|
||||
T25 = T23 + T24;
|
||||
T26 = T22 - T25;
|
||||
T2i = T22 + T25;
|
||||
T2b = T29 - T2a;
|
||||
T2e = T2c - T2d;
|
||||
T2f = T2b - T2e;
|
||||
T2l = T2b + T2e;
|
||||
}
|
||||
{
|
||||
E T1Z, T27, T28, T2g;
|
||||
T1Z = W[2];
|
||||
T27 = T1Z * T26;
|
||||
T28 = W[3];
|
||||
T2g = T28 * T26;
|
||||
cr[WS(rs, 2)] = FNMS(T28, T2f, T27);
|
||||
ci[WS(rs, 2)] = FMA(T1Z, T2f, T2g);
|
||||
}
|
||||
{
|
||||
E T2h, T2j, T2k, T2m;
|
||||
T2h = W[14];
|
||||
T2j = T2h * T2i;
|
||||
T2k = W[15];
|
||||
T2m = T2k * T2i;
|
||||
cr[WS(rs, 8)] = FNMS(T2k, T2l, T2j);
|
||||
ci[WS(rs, 8)] = FMA(T2h, T2l, T2m);
|
||||
}
|
||||
}
|
||||
/* Slots 5 and 11. */
{
|
||||
E T2q, T2y, T2v, T2B;
|
||||
{
|
||||
E T2o, T2p, T2t, T2u;
|
||||
T2o = T20 - T21;
|
||||
T2p = T2c + T2d;
|
||||
T2q = T2o - T2p;
|
||||
T2y = T2o + T2p;
|
||||
T2t = T29 + T2a;
|
||||
T2u = T23 - T24;
|
||||
T2v = T2t + T2u;
|
||||
T2B = T2t - T2u;
|
||||
}
|
||||
{
|
||||
E T2r, T2w, T2n, T2s;
|
||||
T2n = W[8];
|
||||
T2r = T2n * T2q;
|
||||
T2w = T2n * T2v;
|
||||
T2s = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(T2s, T2v, T2r);
|
||||
ci[WS(rs, 5)] = FMA(T2s, T2q, T2w);
|
||||
}
|
||||
{
|
||||
E T2z, T2C, T2x, T2A;
|
||||
T2x = W[20];
|
||||
T2z = T2x * T2y;
|
||||
T2C = T2x * T2B;
|
||||
T2A = W[21];
|
||||
cr[WS(rs, 11)] = FNMS(T2A, T2B, T2z);
|
||||
ci[WS(rs, 11)] = FMA(T2A, T2y, T2C);
|
||||
}
|
||||
}
|
||||
/* Slots 1 and 7. */
{
|
||||
E T1M, T1U, T1R, T1X;
|
||||
{
|
||||
E T1K, T1L, T1P, T1Q;
|
||||
T1K = T18 - T1b;
|
||||
T1L = T1w + T1z;
|
||||
T1M = T1K - T1L;
|
||||
T1U = T1K + T1L;
|
||||
T1P = T1p + T1s;
|
||||
T1Q = T1f - T1i;
|
||||
T1R = T1P + T1Q;
|
||||
T1X = T1P - T1Q;
|
||||
}
|
||||
{
|
||||
E T1N, T1S, T1J, T1O;
|
||||
T1J = W[0];
|
||||
T1N = T1J * T1M;
|
||||
T1S = T1J * T1R;
|
||||
T1O = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(T1O, T1R, T1N);
|
||||
ci[WS(rs, 1)] = FMA(T1O, T1M, T1S);
|
||||
}
|
||||
{
|
||||
E T1V, T1Y, T1T, T1W;
|
||||
T1T = W[12];
|
||||
T1V = T1T * T1U;
|
||||
T1Y = T1T * T1X;
|
||||
T1W = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T1W, T1X, T1V);
|
||||
ci[WS(rs, 7)] = FMA(T1W, T1U, T1Y);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hb_12 descriptor.
 * NOTE(review): presumably TW_FULL with 12 requests the full twiddle set for
 * radix 12 and TW_NEXT terminates the list -- confirm against the tw_instr
 * definition, which is not visible here. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor: radix 12, name "hb_12", twiddle table, GENUS, and the
 * tuple 72 / 22 / 46 / 0, whose first three numbers match the additions,
 * multiplications and fused multiply/adds reported in the generator comment
 * for this variant. */
static const hc2hc_desc desc = { 12, "hb_12", twinstr, &GENUS, { 72, 22, 46, 0 } };
|
||||
|
||||
/* Registration entry point: passes the hb_12 kernel and its descriptor to
 * khc2hc_register for planner p. */
void X(codelet_hb_12) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_12, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hb_12 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 60 FP multiplications,
|
||||
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 39 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_12: radix-12 twiddle codelet emitted by gen_hc2hc (-sign 1, -dif),
 * generated without -fma.
 *
 * For every m in [mb, me) the loop reads the twelve values cr[WS(rs, k)] and
 * the twelve values ci[WS(rs, k)], k = 0..11, and overwrites those same
 * twenty-four slots. Slot 0 is stored directly; each slot k >= 1 is stored
 * after a complex multiply by the twiddle pair (W[2k-2], W[2k-1]).
 *
 * cr, ci: data pointers; cr advances by +ms and ci by -ms per iteration.
 * W:      twiddle table; offset by (mb - 1) * 22 on entry, then 22 per iteration.
 * rs:     stride handed to WS() to address element k.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm.
 */
static void hb_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T5, TH, T12, T1M, T1i, T1U, Tg, Tt, T19, T1X, T1p, T1P, Ta, TM, T15;
|
||||
E T1N, T1l, T1V, Tl, Ty, T1c, T1Y, T1s, T1Q;
|
||||
/* The next four blocks each load six inputs and combine them into three-term sums and differences. */
{
|
||||
E T1, TD, T4, T1g, TG, T11, T10, T1h;
|
||||
T1 = cr[0];
|
||||
TD = ci[WS(rs, 11)];
|
||||
{
|
||||
E T2, T3, TE, TF;
|
||||
T2 = cr[WS(rs, 4)];
|
||||
T3 = ci[WS(rs, 3)];
|
||||
T4 = T2 + T3;
|
||||
T1g = KP866025403 * (T2 - T3);
|
||||
TE = ci[WS(rs, 7)];
|
||||
TF = cr[WS(rs, 8)];
|
||||
TG = TE - TF;
|
||||
T11 = KP866025403 * (TE + TF);
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
TH = TD + TG;
|
||||
T10 = FNMS(KP500000000, T4, T1);
|
||||
T12 = T10 - T11;
|
||||
T1M = T10 + T11;
|
||||
T1h = FNMS(KP500000000, TG, TD);
|
||||
T1i = T1g + T1h;
|
||||
T1U = T1h - T1g;
|
||||
}
|
||||
{
|
||||
E Tc, Tp, Tf, T17, Ts, T1o, T18, T1n;
|
||||
Tc = cr[WS(rs, 3)];
|
||||
Tp = ci[WS(rs, 8)];
|
||||
{
|
||||
E Td, Te, Tq, Tr;
|
||||
Td = ci[WS(rs, 4)];
|
||||
Te = ci[0];
|
||||
Tf = Td + Te;
|
||||
T17 = KP866025403 * (Td - Te);
|
||||
Tq = cr[WS(rs, 7)];
|
||||
Tr = cr[WS(rs, 11)];
|
||||
Ts = Tq + Tr;
|
||||
T1o = KP866025403 * (Tq - Tr);
|
||||
}
|
||||
Tg = Tc + Tf;
|
||||
Tt = Tp - Ts;
|
||||
T18 = FMA(KP500000000, Ts, Tp);
|
||||
T19 = T17 + T18;
|
||||
T1X = T18 - T17;
|
||||
T1n = FNMS(KP500000000, Tf, Tc);
|
||||
T1p = T1n + T1o;
|
||||
T1P = T1n - T1o;
|
||||
}
|
||||
{
|
||||
E T6, TL, T9, T1j, TK, T14, T13, T1k;
|
||||
T6 = ci[WS(rs, 5)];
|
||||
TL = cr[WS(rs, 6)];
|
||||
{
|
||||
E T7, T8, TI, TJ;
|
||||
T7 = ci[WS(rs, 1)];
|
||||
T8 = cr[WS(rs, 2)];
|
||||
T9 = T7 + T8;
|
||||
T1j = KP866025403 * (T7 - T8);
|
||||
TI = ci[WS(rs, 9)];
|
||||
TJ = cr[WS(rs, 10)];
|
||||
TK = TI - TJ;
|
||||
T14 = KP866025403 * (TI + TJ);
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
TM = TK - TL;
|
||||
T13 = FNMS(KP500000000, T9, T6);
|
||||
T15 = T13 + T14;
|
||||
T1N = T13 - T14;
|
||||
T1k = FMA(KP500000000, TK, TL);
|
||||
T1l = T1j - T1k;
|
||||
T1V = T1j + T1k;
|
||||
}
|
||||
{
|
||||
E Th, Tx, Tk, T1a, Tw, T1r, T1b, T1q;
|
||||
Th = ci[WS(rs, 2)];
|
||||
Tx = cr[WS(rs, 9)];
|
||||
{
|
||||
E Ti, Tj, Tu, Tv;
|
||||
Ti = cr[WS(rs, 1)];
|
||||
Tj = cr[WS(rs, 5)];
|
||||
Tk = Ti + Tj;
|
||||
T1a = KP866025403 * (Ti - Tj);
|
||||
Tu = ci[WS(rs, 10)];
|
||||
Tv = ci[WS(rs, 6)];
|
||||
Tw = Tu + Tv;
|
||||
T1r = KP866025403 * (Tv - Tu);
|
||||
}
|
||||
Tl = Th + Tk;
|
||||
Ty = Tw - Tx;
|
||||
T1b = FMA(KP500000000, Tw, Tx);
|
||||
T1c = T1a - T1b;
|
||||
T1Y = T1a + T1b;
|
||||
T1q = FNMS(KP500000000, Tk, Th);
|
||||
T1s = T1q + T1r;
|
||||
T1Q = T1q - T1r;
|
||||
}
|
||||
/* Slot 0 (no twiddle) and slot 6 with twiddle pair W[10], W[11]. */
{
|
||||
E Tb, Tm, TU, TW, TX, TY, TT, TV;
|
||||
Tb = T5 + Ta;
|
||||
Tm = Tg + Tl;
|
||||
TU = Tb - Tm;
|
||||
TW = TH + TM;
|
||||
TX = Tt + Ty;
|
||||
TY = TW - TX;
|
||||
cr[0] = Tb + Tm;
|
||||
ci[0] = TW + TX;
|
||||
TT = W[10];
|
||||
TV = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(TV, TY, TT * TU);
|
||||
ci[WS(rs, 6)] = FMA(TV, TU, TT * TY);
|
||||
}
|
||||
/* Slots 9 and 3. */
{
|
||||
E TA, TQ, TO, TS;
|
||||
{
|
||||
E To, Tz, TC, TN;
|
||||
To = T5 - Ta;
|
||||
Tz = Tt - Ty;
|
||||
TA = To - Tz;
|
||||
TQ = To + Tz;
|
||||
TC = Tg - Tl;
|
||||
TN = TH - TM;
|
||||
TO = TC + TN;
|
||||
TS = TN - TC;
|
||||
}
|
||||
{
|
||||
E Tn, TB, TP, TR;
|
||||
Tn = W[16];
|
||||
TB = W[17];
|
||||
cr[WS(rs, 9)] = FNMS(TB, TO, Tn * TA);
|
||||
ci[WS(rs, 9)] = FMA(Tn, TO, TB * TA);
|
||||
TP = W[4];
|
||||
TR = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(TR, TS, TP * TQ);
|
||||
ci[WS(rs, 3)] = FMA(TP, TS, TR * TQ);
|
||||
}
|
||||
}
|
||||
/* Slots 5 and 11. */
{
|
||||
E T28, T2e, T2c, T2g;
|
||||
{
|
||||
E T26, T27, T2a, T2b;
|
||||
T26 = T1M - T1N;
|
||||
T27 = T1X + T1Y;
|
||||
T28 = T26 - T27;
|
||||
T2e = T26 + T27;
|
||||
T2a = T1U + T1V;
|
||||
T2b = T1P - T1Q;
|
||||
T2c = T2a + T2b;
|
||||
T2g = T2a - T2b;
|
||||
}
|
||||
{
|
||||
E T25, T29, T2d, T2f;
|
||||
T25 = W[8];
|
||||
T29 = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(T29, T2c, T25 * T28);
|
||||
ci[WS(rs, 5)] = FMA(T25, T2c, T29 * T28);
|
||||
T2d = W[20];
|
||||
T2f = W[21];
|
||||
cr[WS(rs, 11)] = FNMS(T2f, T2g, T2d * T2e);
|
||||
ci[WS(rs, 11)] = FMA(T2d, T2g, T2f * T2e);
|
||||
}
|
||||
}
|
||||
/* Slots 2 and 8. */
{
|
||||
E T1S, T22, T20, T24;
|
||||
{
|
||||
E T1O, T1R, T1W, T1Z;
|
||||
T1O = T1M + T1N;
|
||||
T1R = T1P + T1Q;
|
||||
T1S = T1O - T1R;
|
||||
T22 = T1O + T1R;
|
||||
T1W = T1U - T1V;
|
||||
T1Z = T1X - T1Y;
|
||||
T20 = T1W - T1Z;
|
||||
T24 = T1W + T1Z;
|
||||
}
|
||||
{
|
||||
E T1L, T1T, T21, T23;
|
||||
T1L = W[2];
|
||||
T1T = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(T1T, T20, T1L * T1S);
|
||||
ci[WS(rs, 2)] = FMA(T1T, T1S, T1L * T20);
|
||||
T21 = W[14];
|
||||
T23 = W[15];
|
||||
cr[WS(rs, 8)] = FNMS(T23, T24, T21 * T22);
|
||||
ci[WS(rs, 8)] = FMA(T23, T22, T21 * T24);
|
||||
}
|
||||
}
|
||||
/* Slots 10 and 4. */
{
|
||||
E T1C, T1I, T1G, T1K;
|
||||
{
|
||||
E T1A, T1B, T1E, T1F;
|
||||
T1A = T12 + T15;
|
||||
T1B = T1p + T1s;
|
||||
T1C = T1A - T1B;
|
||||
T1I = T1A + T1B;
|
||||
T1E = T1i + T1l;
|
||||
T1F = T19 + T1c;
|
||||
T1G = T1E - T1F;
|
||||
T1K = T1E + T1F;
|
||||
}
|
||||
{
|
||||
E T1z, T1D, T1H, T1J;
|
||||
T1z = W[18];
|
||||
T1D = W[19];
|
||||
cr[WS(rs, 10)] = FNMS(T1D, T1G, T1z * T1C);
|
||||
ci[WS(rs, 10)] = FMA(T1D, T1C, T1z * T1G);
|
||||
T1H = W[6];
|
||||
T1J = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(T1J, T1K, T1H * T1I);
|
||||
ci[WS(rs, 4)] = FMA(T1J, T1I, T1H * T1K);
|
||||
}
|
||||
}
|
||||
/* Slots 1 and 7. */
{
|
||||
E T1e, T1w, T1u, T1y;
|
||||
{
|
||||
E T16, T1d, T1m, T1t;
|
||||
T16 = T12 - T15;
|
||||
T1d = T19 - T1c;
|
||||
T1e = T16 - T1d;
|
||||
T1w = T16 + T1d;
|
||||
T1m = T1i - T1l;
|
||||
T1t = T1p - T1s;
|
||||
T1u = T1m + T1t;
|
||||
T1y = T1m - T1t;
|
||||
}
|
||||
{
|
||||
E TZ, T1f, T1v, T1x;
|
||||
TZ = W[0];
|
||||
T1f = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(T1f, T1u, TZ * T1e);
|
||||
ci[WS(rs, 1)] = FMA(TZ, T1u, T1f * T1e);
|
||||
T1v = W[12];
|
||||
T1x = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T1x, T1y, T1v * T1w);
|
||||
ci[WS(rs, 7)] = FMA(T1v, T1y, T1x * T1w);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hb_12 descriptor.
 * NOTE(review): presumably TW_FULL with 12 requests the full twiddle set for
 * radix 12 and TW_NEXT terminates the list -- confirm against the tw_instr
 * definition, which is not visible here. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor: radix 12, name "hb_12", twiddle table, GENUS, and the
 * tuple 88 / 30 / 30 / 0, whose first three numbers match the additions,
 * multiplications and fused multiply/adds reported in the generator comment
 * for this variant. */
static const hc2hc_desc desc = { 12, "hb_12", twinstr, &GENUS, { 88, 30, 30, 0 } };
|
||||
|
||||
/* Registration entry point: passes the hb_12 kernel and its descriptor to
 * khc2hc_register for planner p. */
void X(codelet_hb_12) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_12, &desc);
|
||||
}
|
||||
#endif
|
||||
810
fftw-3.3.10/rdft/scalar/r2cb/hb_15.c
Normal file
810
fftw-3.3.10/rdft/scalar/r2cb/hb_15.c
Normal file
@@ -0,0 +1,810 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:51 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 184 FP additions, 140 FP multiplications,
|
||||
* (or, 72 additions, 28 multiplications, 112 fused multiply/add),
|
||||
* 78 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_15: radix-15 twiddle codelet emitted by gen_hc2hc (-sign 1, -dif),
 * generated with -fma.
 *
 * For every m in [mb, me) the loop reads the fifteen values cr[WS(rs, k)] and
 * the fifteen values ci[WS(rs, k)], k = 0..14, and overwrites those same
 * thirty slots. Slot 0 is stored directly; each slot k >= 1 is stored after a
 * complex multiply by the twiddle pair (W[2k-2], W[2k-1]).
 *
 * cr, ci: data pointers; cr advances by +ms and ci by -ms per iteration.
 * W:      twiddle table; offset by (mb - 1) * 28 on entry, then 28 per iteration.
 * rs:     stride handed to WS() to address element k.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm.
 */
static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
|
||||
E T5, T11, T1C, T2U, T2f, T3f, TH, T19, T18, TS, T12, T13, T14, T3a, T3g;
|
||||
E Ts, Tv, T37, T3h, T28, T2h, T21, T2g, T2V, T2W, T2X, T2Y, T2Z, T30, T31;
|
||||
E T1F, T1I, T1J, T1M, T1P, T1Q, T1R;
|
||||
/* First six loads: cr slots 0, 5, 10 and ci slots 4, 9, 14. */
{
|
||||
E T1, TX, T4, T2e, T10, T1B, T1A, T2d;
|
||||
T1 = cr[0];
|
||||
TX = ci[WS(rs, 14)];
|
||||
{
|
||||
E T2, T3, TY, TZ;
|
||||
T2 = cr[WS(rs, 5)];
|
||||
T3 = ci[WS(rs, 4)];
|
||||
T4 = T2 + T3;
|
||||
T2e = T2 - T3;
|
||||
TY = ci[WS(rs, 9)];
|
||||
TZ = cr[WS(rs, 10)];
|
||||
T10 = TY - TZ;
|
||||
T1B = TY + TZ;
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
T11 = TX + T10;
|
||||
T1A = FNMS(KP500000000, T4, T1);
|
||||
T1C = FNMS(KP866025403, T1B, T1A);
|
||||
T2U = FMA(KP866025403, T1B, T1A);
|
||||
T2d = FNMS(KP500000000, T10, TX);
|
||||
T2f = FMA(KP866025403, T2e, T2d);
|
||||
T3f = FNMS(KP866025403, T2e, T2d);
|
||||
}
|
||||
/* Remaining twenty-four loads, three at a time, followed by their combinations. */
{
|
||||
E Ta, T1W, T1D, Tl, T23, T1K, Tf, T1Z, T1G, TR, T1Y, T1H, Tq, T26, T1N;
|
||||
E TG, T25, T1O, TM, T1V, T1E, TB, T22, T1L, T38, T39;
|
||||
{
|
||||
E T6, T7, T8, T9;
|
||||
T6 = cr[WS(rs, 3)];
|
||||
T7 = ci[WS(rs, 6)];
|
||||
T8 = ci[WS(rs, 1)];
|
||||
T9 = T7 + T8;
|
||||
Ta = T6 + T9;
|
||||
T1W = T7 - T8;
|
||||
T1D = FNMS(KP500000000, T9, T6);
|
||||
}
|
||||
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = cr[WS(rs, 6)];
|
||||
Ti = ci[WS(rs, 3)];
|
||||
Tj = cr[WS(rs, 1)];
|
||||
Tk = Ti + Tj;
|
||||
Tl = Th + Tk;
|
||||
T23 = Ti - Tj;
|
||||
T1K = FNMS(KP500000000, Tk, Th);
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Td, Te;
|
||||
Tb = ci[WS(rs, 2)];
|
||||
Tc = cr[WS(rs, 2)];
|
||||
Td = cr[WS(rs, 7)];
|
||||
Te = Tc + Td;
|
||||
Tf = Tb + Te;
|
||||
T1Z = Tc - Td;
|
||||
T1G = FNMS(KP500000000, Te, Tb);
|
||||
}
|
||||
{
|
||||
E TQ, TN, TO, TP;
|
||||
TQ = cr[WS(rs, 12)];
|
||||
TN = ci[WS(rs, 12)];
|
||||
TO = ci[WS(rs, 7)];
|
||||
TP = TN + TO;
|
||||
TR = TP - TQ;
|
||||
T1Y = FMA(KP500000000, TP, TQ);
|
||||
T1H = TO - TN;
|
||||
}
|
||||
{
|
||||
E Tm, Tn, To, Tp;
|
||||
Tm = ci[WS(rs, 5)];
|
||||
Tn = ci[0];
|
||||
To = cr[WS(rs, 4)];
|
||||
Tp = Tn + To;
|
||||
Tq = Tm + Tp;
|
||||
T26 = Tn - To;
|
||||
T1N = FNMS(KP500000000, Tp, Tm);
|
||||
}
|
||||
{
|
||||
E TF, TC, TD, TE;
|
||||
TF = cr[WS(rs, 9)];
|
||||
TC = ci[WS(rs, 10)];
|
||||
TD = cr[WS(rs, 14)];
|
||||
TE = TC - TD;
|
||||
TG = TE - TF;
|
||||
T25 = FMA(KP500000000, TE, TF);
|
||||
T1O = TC + TD;
|
||||
}
|
||||
{
|
||||
E TI, TJ, TK, TL;
|
||||
TI = ci[WS(rs, 11)];
|
||||
TJ = cr[WS(rs, 8)];
|
||||
TK = cr[WS(rs, 13)];
|
||||
TL = TJ + TK;
|
||||
TM = TI - TL;
|
||||
T1V = FMA(KP500000000, TL, TI);
|
||||
T1E = TJ - TK;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, Tz, TA;
|
||||
Tx = ci[WS(rs, 8)];
|
||||
Ty = ci[WS(rs, 13)];
|
||||
Tz = cr[WS(rs, 11)];
|
||||
TA = Ty - Tz;
|
||||
TB = Tx + TA;
|
||||
T22 = FNMS(KP500000000, TA, Tx);
|
||||
T1L = Ty + Tz;
|
||||
}
|
||||
TH = TB - TG;
|
||||
T19 = Ta - Tf;
|
||||
T18 = Tl - Tq;
|
||||
TS = TM - TR;
|
||||
T12 = TM + TR;
|
||||
T13 = TB + TG;
|
||||
T14 = T12 + T13;
|
||||
T38 = FNMS(KP866025403, T1W, T1V);
|
||||
T39 = FMA(KP866025403, T1Z, T1Y);
|
||||
T3a = T38 + T39;
|
||||
T3g = T38 - T39;
|
||||
{
|
||||
E Tg, Tr, T1X, T20;
|
||||
Tg = Ta + Tf;
|
||||
Tr = Tl + Tq;
|
||||
Ts = Tg + Tr;
|
||||
Tv = Tg - Tr;
|
||||
{
|
||||
E T35, T36, T24, T27;
|
||||
T35 = FNMS(KP866025403, T23, T22);
|
||||
T36 = FMA(KP866025403, T26, T25);
|
||||
T37 = T35 + T36;
|
||||
T3h = T35 - T36;
|
||||
T24 = FMA(KP866025403, T23, T22);
|
||||
T27 = FNMS(KP866025403, T26, T25);
|
||||
T28 = T24 + T27;
|
||||
T2h = T24 - T27;
|
||||
}
|
||||
T1X = FMA(KP866025403, T1W, T1V);
|
||||
T20 = FNMS(KP866025403, T1Z, T1Y);
|
||||
T21 = T1X + T20;
|
||||
T2g = T1X - T20;
|
||||
T2V = FNMS(KP866025403, T1E, T1D);
|
||||
T2W = FNMS(KP866025403, T1H, T1G);
|
||||
T2X = T2V + T2W;
|
||||
T2Y = FNMS(KP866025403, T1L, T1K);
|
||||
T2Z = FNMS(KP866025403, T1O, T1N);
|
||||
T30 = T2Y + T2Z;
|
||||
T31 = T2X + T30;
|
||||
T1F = FMA(KP866025403, T1E, T1D);
|
||||
T1I = FMA(KP866025403, T1H, T1G);
|
||||
T1J = T1F + T1I;
|
||||
T1M = FMA(KP866025403, T1L, T1K);
|
||||
T1P = FMA(KP866025403, T1O, T1N);
|
||||
T1Q = T1M + T1P;
|
||||
T1R = T1J + T1Q;
|
||||
}
|
||||
}
|
||||
/* Slot 0 is written without a twiddle multiply. */
cr[0] = T5 + Ts;
|
||||
ci[0] = T11 + T14;
|
||||
/* Slots 3, 9, 12 and 6. */
{
|
||||
E T1a, T1q, T17, T1p, TU, T1u, T1e, T1m, T15, T16;
|
||||
T1a = FNMS(KP618033988, T19, T18);
|
||||
T1q = FMA(KP618033988, T18, T19);
|
||||
T15 = FNMS(KP250000000, T14, T11);
|
||||
T16 = T12 - T13;
|
||||
T17 = FNMS(KP559016994, T16, T15);
|
||||
T1p = FMA(KP559016994, T16, T15);
|
||||
{
|
||||
E TT, T1l, Tw, T1k, Tu;
|
||||
TT = FNMS(KP618033988, TS, TH);
|
||||
T1l = FMA(KP618033988, TH, TS);
|
||||
Tu = FNMS(KP250000000, Ts, T5);
|
||||
Tw = FNMS(KP559016994, Tv, Tu);
|
||||
T1k = FMA(KP559016994, Tv, Tu);
|
||||
TU = FNMS(KP951056516, TT, Tw);
|
||||
T1u = FMA(KP951056516, T1l, T1k);
|
||||
T1e = FMA(KP951056516, TT, Tw);
|
||||
T1m = FNMS(KP951056516, T1l, T1k);
|
||||
}
|
||||
{
|
||||
E T1b, TW, T1c, Tt, TV;
|
||||
T1b = FMA(KP951056516, T1a, T17);
|
||||
TW = W[5];
|
||||
T1c = TW * TU;
|
||||
Tt = W[4];
|
||||
TV = Tt * TU;
|
||||
cr[WS(rs, 3)] = FNMS(TW, T1b, TV);
|
||||
ci[WS(rs, 3)] = FMA(Tt, T1b, T1c);
|
||||
}
|
||||
{
|
||||
E T1x, T1w, T1y, T1t, T1v;
|
||||
T1x = FNMS(KP951056516, T1q, T1p);
|
||||
T1w = W[17];
|
||||
T1y = T1w * T1u;
|
||||
T1t = W[16];
|
||||
T1v = T1t * T1u;
|
||||
cr[WS(rs, 9)] = FNMS(T1w, T1x, T1v);
|
||||
ci[WS(rs, 9)] = FMA(T1t, T1x, T1y);
|
||||
}
|
||||
{
|
||||
E T1h, T1g, T1i, T1d, T1f;
|
||||
T1h = FNMS(KP951056516, T1a, T17);
|
||||
T1g = W[23];
|
||||
T1i = T1g * T1e;
|
||||
T1d = W[22];
|
||||
T1f = T1d * T1e;
|
||||
cr[WS(rs, 12)] = FNMS(T1g, T1h, T1f);
|
||||
ci[WS(rs, 12)] = FMA(T1d, T1h, T1i);
|
||||
}
|
||||
{
|
||||
E T1r, T1o, T1s, T1j, T1n;
|
||||
T1r = FMA(KP951056516, T1q, T1p);
|
||||
T1o = W[11];
|
||||
T1s = T1o * T1m;
|
||||
T1j = W[10];
|
||||
T1n = T1j * T1m;
|
||||
cr[WS(rs, 6)] = FNMS(T1o, T1r, T1n);
|
||||
ci[WS(rs, 6)] = FMA(T1j, T1r, T1s);
|
||||
}
|
||||
}
|
||||
/* Slots 10, 1, 13, 7 and 4. */
{
|
||||
E T2o, T2E, T2N, T2P, T2Q, T2S, T2l, T2R, T2D, T2a, T2I, T2s, T2A;
|
||||
{
|
||||
E T2m, T2n, T2O, T2k, T2i, T2j;
|
||||
T2m = T1F - T1I;
|
||||
T2n = T1M - T1P;
|
||||
T2o = FMA(KP618033988, T2n, T2m);
|
||||
T2E = FNMS(KP618033988, T2m, T2n);
|
||||
T2O = T1C + T1R;
|
||||
T2N = W[18];
|
||||
T2P = T2N * T2O;
|
||||
T2Q = W[19];
|
||||
T2S = T2Q * T2O;
|
||||
T2k = T2g - T2h;
|
||||
T2i = T2g + T2h;
|
||||
T2j = FNMS(KP250000000, T2i, T2f);
|
||||
T2l = FMA(KP559016994, T2k, T2j);
|
||||
T2R = T2f + T2i;
|
||||
T2D = FNMS(KP559016994, T2k, T2j);
|
||||
{
|
||||
E T29, T2z, T1U, T2y, T1S, T1T;
|
||||
T29 = FMA(KP618033988, T28, T21);
|
||||
T2z = FNMS(KP618033988, T21, T28);
|
||||
T1S = FNMS(KP250000000, T1R, T1C);
|
||||
T1T = T1J - T1Q;
|
||||
T1U = FMA(KP559016994, T1T, T1S);
|
||||
T2y = FNMS(KP559016994, T1T, T1S);
|
||||
T2a = FNMS(KP951056516, T29, T1U);
|
||||
T2I = FNMS(KP951056516, T2z, T2y);
|
||||
T2s = FMA(KP951056516, T29, T1U);
|
||||
T2A = FMA(KP951056516, T2z, T2y);
|
||||
}
|
||||
}
|
||||
cr[WS(rs, 10)] = FNMS(T2Q, T2R, T2P);
|
||||
ci[WS(rs, 10)] = FMA(T2N, T2R, T2S);
|
||||
{
|
||||
E T2p, T2c, T2q, T1z, T2b;
|
||||
T2p = FMA(KP951056516, T2o, T2l);
|
||||
T2c = W[1];
|
||||
T2q = T2c * T2a;
|
||||
T1z = W[0];
|
||||
T2b = T1z * T2a;
|
||||
cr[WS(rs, 1)] = FNMS(T2c, T2p, T2b);
|
||||
ci[WS(rs, 1)] = FMA(T1z, T2p, T2q);
|
||||
}
|
||||
{
|
||||
E T2L, T2K, T2M, T2H, T2J;
|
||||
T2L = FMA(KP951056516, T2E, T2D);
|
||||
T2K = W[25];
|
||||
T2M = T2K * T2I;
|
||||
T2H = W[24];
|
||||
T2J = T2H * T2I;
|
||||
cr[WS(rs, 13)] = FNMS(T2K, T2L, T2J);
|
||||
ci[WS(rs, 13)] = FMA(T2H, T2L, T2M);
|
||||
}
|
||||
{
|
||||
E T2F, T2C, T2G, T2x, T2B;
|
||||
T2F = FNMS(KP951056516, T2E, T2D);
|
||||
T2C = W[13];
|
||||
T2G = T2C * T2A;
|
||||
T2x = W[12];
|
||||
T2B = T2x * T2A;
|
||||
cr[WS(rs, 7)] = FNMS(T2C, T2F, T2B);
|
||||
ci[WS(rs, 7)] = FMA(T2x, T2F, T2G);
|
||||
}
|
||||
{
|
||||
E T2v, T2u, T2w, T2r, T2t;
|
||||
T2v = FNMS(KP951056516, T2o, T2l);
|
||||
T2u = W[7];
|
||||
T2w = T2u * T2s;
|
||||
T2r = W[6];
|
||||
T2t = T2r * T2s;
|
||||
cr[WS(rs, 4)] = FNMS(T2u, T2v, T2t);
|
||||
ci[WS(rs, 4)] = FMA(T2r, T2v, T2w);
|
||||
}
|
||||
}
|
||||
/* Slots 5, 2, 14, 11 and 8. */
{
|
||||
E T3o, T3E, T3N, T3P, T3Q, T3S, T3l, T3R, T3D, T3c, T3I, T3s, T3A;
|
||||
{
|
||||
E T3m, T3n, T3O, T3k, T3i, T3j;
|
||||
T3m = T2Y - T2Z;
|
||||
T3n = T2V - T2W;
|
||||
T3o = FNMS(KP618033988, T3n, T3m);
|
||||
T3E = FMA(KP618033988, T3m, T3n);
|
||||
T3O = T2U + T31;
|
||||
T3N = W[8];
|
||||
T3P = T3N * T3O;
|
||||
T3Q = W[9];
|
||||
T3S = T3Q * T3O;
|
||||
T3k = T3g - T3h;
|
||||
T3i = T3g + T3h;
|
||||
T3j = FNMS(KP250000000, T3i, T3f);
|
||||
T3l = FNMS(KP559016994, T3k, T3j);
|
||||
T3R = T3f + T3i;
|
||||
T3D = FMA(KP559016994, T3k, T3j);
|
||||
{
|
||||
E T3b, T3z, T34, T3y, T32, T33;
|
||||
T3b = FNMS(KP618033988, T3a, T37);
|
||||
T3z = FMA(KP618033988, T37, T3a);
|
||||
T32 = FNMS(KP250000000, T31, T2U);
|
||||
T33 = T2X - T30;
|
||||
T34 = FNMS(KP559016994, T33, T32);
|
||||
T3y = FMA(KP559016994, T33, T32);
|
||||
T3c = FMA(KP951056516, T3b, T34);
|
||||
T3I = FMA(KP951056516, T3z, T3y);
|
||||
T3s = FNMS(KP951056516, T3b, T34);
|
||||
T3A = FNMS(KP951056516, T3z, T3y);
|
||||
}
|
||||
}
|
||||
cr[WS(rs, 5)] = FNMS(T3Q, T3R, T3P);
|
||||
ci[WS(rs, 5)] = FMA(T3N, T3R, T3S);
|
||||
{
|
||||
E T3p, T3e, T3q, T2T, T3d;
|
||||
T3p = FNMS(KP951056516, T3o, T3l);
|
||||
T3e = W[3];
|
||||
T3q = T3e * T3c;
|
||||
T2T = W[2];
|
||||
T3d = T2T * T3c;
|
||||
cr[WS(rs, 2)] = FNMS(T3e, T3p, T3d);
|
||||
ci[WS(rs, 2)] = FMA(T2T, T3p, T3q);
|
||||
}
|
||||
{
|
||||
E T3L, T3K, T3M, T3H, T3J;
|
||||
T3L = FNMS(KP951056516, T3E, T3D);
|
||||
T3K = W[27];
|
||||
T3M = T3K * T3I;
|
||||
T3H = W[26];
|
||||
T3J = T3H * T3I;
|
||||
cr[WS(rs, 14)] = FNMS(T3K, T3L, T3J);
|
||||
ci[WS(rs, 14)] = FMA(T3H, T3L, T3M);
|
||||
}
|
||||
{
|
||||
E T3F, T3C, T3G, T3x, T3B;
|
||||
T3F = FMA(KP951056516, T3E, T3D);
|
||||
T3C = W[21];
|
||||
T3G = T3C * T3A;
|
||||
T3x = W[20];
|
||||
T3B = T3x * T3A;
|
||||
cr[WS(rs, 11)] = FNMS(T3C, T3F, T3B);
|
||||
ci[WS(rs, 11)] = FMA(T3x, T3F, T3G);
|
||||
}
|
||||
{
|
||||
E T3v, T3u, T3w, T3r, T3t;
|
||||
T3v = FMA(KP951056516, T3o, T3l);
|
||||
T3u = W[15];
|
||||
T3w = T3u * T3s;
|
||||
T3r = W[14];
|
||||
T3t = T3r * T3s;
|
||||
cr[WS(rs, 8)] = FNMS(T3u, T3v, T3t);
|
||||
ci[WS(rs, 8)] = FMA(T3r, T3v, T3w);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hb_15 descriptor: one TW_FULL
 * entry with arguments (1, 15) followed by one TW_NEXT entry with (1, 0).
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * they request the full twiddle set for radix 15 and then advance to the next
 * iteration -- confirm against the twiddle headers. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of hb_15: radix 15, name "hb_15", the twiddle
 * table, the GENUS object, and a four-number tuple { 72, 28, 112, 0 }.
 * NOTE(review): the tuple is presumably the operation count
 * (additions, multiplications, fused multiply/adds, other) -- confirm against
 * the hc2hc_desc declaration. */
static const hc2hc_desc desc = { 15, "hb_15", twinstr, &GENUS, { 72, 28, 112, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hb_15 kernel and its descriptor to the
 * planner p through khc2hc_register. X() is a project macro applied to both
 * symbol names (presumably a precision/namespace prefix -- confirm). */
void X(codelet_hb_15) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_15, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 184 FP additions, 112 FP multiplications,
|
||||
* (or, 128 additions, 56 multiplications, 56 fused multiply/add),
|
||||
* 75 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_15: radix-15 twiddle codelet, non-FMA variant (emitted by genfft's
 * gen_hc2hc with -sign 1 -dif -n 15, per the generator comment above).
 *
 * cr, ci : data arrays, read and then overwritten in place; element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..14.
 * W      : twiddle table; 28 values (14 pairs) are consumed per iteration.
 * rs     : stride object passed to WS().
 * mb, me : half-open iteration range [mb, me).
 * ms     : per-iteration pointer step; cr advances by +ms, ci by -ms.
 *
 * Output 0 is stored without a twiddle. Output k (1..14) is combined with
 * the pair (W[2k-2], W[2k-1]) through FMA/FNMS before being stored.
 * NOTE(review): FMA/FNMS are macros defined elsewhere; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b, which would make every
 * store a complex multiplication by the twiddle -- confirm in the headers.
 */
static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* sqrt(5)/4 */
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* sin(2*pi/5) */
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438); /* sin(pi/5) */
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* sqrt(3)/2 */
|
||||
{
|
||||
INT m;
|
||||
/* W starts at the twiddle row for m = mb (28 values per row); each pass
 * moves cr forward, ci backward and W to the next row. */
for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
|
||||
E T5, T10, T1J, T2C, T2c, T2M, TH, T18, T17, TS, T2Q, T2R, T2S, Tg, Tr;
|
||||
E Ts, T11, T12, T13, T2N, T2O, T2P, T1u, T1x, T1y, T1W, T1Z, T28, T1P, T1S;
|
||||
E T27, T1B, T1E, T1F, T2G, T2H, T2I, T2D, T2E, T2F;
|
||||
/* Load phase: inputs are read in groups of three and combined with the
 * 1/2 and sqrt(3)/2 constants. */
{
|
||||
E T1, TW, T4, T2a, TZ, T1I, T1H, T2b;
|
||||
T1 = cr[0];
|
||||
TW = ci[WS(rs, 14)];
|
||||
{
|
||||
E T2, T3, TX, TY;
|
||||
T2 = cr[WS(rs, 5)];
|
||||
T3 = ci[WS(rs, 4)];
|
||||
T4 = T2 + T3;
|
||||
T2a = KP866025403 * (T2 - T3);
|
||||
TX = ci[WS(rs, 9)];
|
||||
TY = cr[WS(rs, 10)];
|
||||
TZ = TX - TY;
|
||||
T1I = KP866025403 * (TX + TY);
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
T10 = TW + TZ;
|
||||
T1H = FNMS(KP500000000, T4, T1);
|
||||
T1J = T1H - T1I;
|
||||
T2C = T1H + T1I;
|
||||
T2b = FNMS(KP500000000, TZ, TW);
|
||||
T2c = T2a + T2b;
|
||||
T2M = T2b - T2a;
|
||||
}
|
||||
{
|
||||
E Ta, T1N, T1s, Tl, T1U, T1z, Tf, T1Q, T1v, TG, T1R, T1w, Tq, T1X, T1C;
|
||||
E TM, T1V, T1A, TB, T1O, T1t, TR, T1Y, T1D;
|
||||
{
|
||||
E T6, T7, T8, T9;
|
||||
T6 = cr[WS(rs, 3)];
|
||||
T7 = ci[WS(rs, 6)];
|
||||
T8 = ci[WS(rs, 1)];
|
||||
T9 = T7 + T8;
|
||||
Ta = T6 + T9;
|
||||
T1N = KP866025403 * (T7 - T8);
|
||||
T1s = FNMS(KP500000000, T9, T6);
|
||||
}
|
||||
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = cr[WS(rs, 6)];
|
||||
Ti = ci[WS(rs, 3)];
|
||||
Tj = cr[WS(rs, 1)];
|
||||
Tk = Ti + Tj;
|
||||
Tl = Th + Tk;
|
||||
T1U = KP866025403 * (Ti - Tj);
|
||||
T1z = FNMS(KP500000000, Tk, Th);
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Td, Te;
|
||||
Tb = ci[WS(rs, 2)];
|
||||
Tc = cr[WS(rs, 2)];
|
||||
Td = cr[WS(rs, 7)];
|
||||
Te = Tc + Td;
|
||||
Tf = Tb + Te;
|
||||
T1Q = KP866025403 * (Tc - Td);
|
||||
T1v = FNMS(KP500000000, Te, Tb);
|
||||
}
|
||||
{
|
||||
E TF, TC, TD, TE;
|
||||
TF = cr[WS(rs, 12)];
|
||||
TC = ci[WS(rs, 12)];
|
||||
TD = ci[WS(rs, 7)];
|
||||
TE = TC + TD;
|
||||
TG = TE - TF;
|
||||
T1R = FMA(KP500000000, TE, TF);
|
||||
T1w = KP866025403 * (TD - TC);
|
||||
}
|
||||
{
|
||||
E Tm, Tn, To, Tp;
|
||||
Tm = ci[WS(rs, 5)];
|
||||
Tn = ci[0];
|
||||
To = cr[WS(rs, 4)];
|
||||
Tp = Tn + To;
|
||||
Tq = Tm + Tp;
|
||||
T1X = KP866025403 * (Tn - To);
|
||||
T1C = FNMS(KP500000000, Tp, Tm);
|
||||
}
|
||||
{
|
||||
E TI, TJ, TK, TL;
|
||||
TI = ci[WS(rs, 8)];
|
||||
TJ = ci[WS(rs, 13)];
|
||||
TK = cr[WS(rs, 11)];
|
||||
TL = TJ - TK;
|
||||
TM = TI + TL;
|
||||
T1V = FNMS(KP500000000, TL, TI);
|
||||
T1A = KP866025403 * (TJ + TK);
|
||||
}
|
||||
{
|
||||
E Tx, Ty, Tz, TA;
|
||||
Tx = ci[WS(rs, 11)];
|
||||
Ty = cr[WS(rs, 8)];
|
||||
Tz = cr[WS(rs, 13)];
|
||||
TA = Ty + Tz;
|
||||
TB = Tx - TA;
|
||||
T1O = FMA(KP500000000, TA, Tx);
|
||||
T1t = KP866025403 * (Ty - Tz);
|
||||
}
|
||||
{
|
||||
E TQ, TN, TO, TP;
|
||||
TQ = cr[WS(rs, 9)];
|
||||
TN = ci[WS(rs, 10)];
|
||||
TO = cr[WS(rs, 14)];
|
||||
TP = TN - TO;
|
||||
TR = TP - TQ;
|
||||
T1Y = FMA(KP500000000, TP, TQ);
|
||||
T1D = KP866025403 * (TN + TO);
|
||||
}
|
||||
TH = TB - TG;
|
||||
T18 = Tl - Tq;
|
||||
T17 = Ta - Tf;
|
||||
TS = TM - TR;
|
||||
T2Q = T1V - T1U;
|
||||
T2R = T1X + T1Y;
|
||||
T2S = T2Q - T2R;
|
||||
Tg = Ta + Tf;
|
||||
Tr = Tl + Tq;
|
||||
Ts = Tg + Tr;
|
||||
T11 = TB + TG;
|
||||
T12 = TM + TR;
|
||||
T13 = T11 + T12;
|
||||
T2N = T1O - T1N;
|
||||
T2O = T1Q + T1R;
|
||||
T2P = T2N - T2O;
|
||||
T1u = T1s + T1t;
|
||||
T1x = T1v + T1w;
|
||||
T1y = T1u + T1x;
|
||||
T1W = T1U + T1V;
|
||||
T1Z = T1X - T1Y;
|
||||
T28 = T1W + T1Z;
|
||||
T1P = T1N + T1O;
|
||||
T1S = T1Q - T1R;
|
||||
T27 = T1P + T1S;
|
||||
T1B = T1z + T1A;
|
||||
T1E = T1C + T1D;
|
||||
T1F = T1B + T1E;
|
||||
T2G = T1z - T1A;
|
||||
T2H = T1C - T1D;
|
||||
T2I = T2G + T2H;
|
||||
T2D = T1s - T1t;
|
||||
T2E = T1v - T1w;
|
||||
T2F = T2D + T2E;
|
||||
}
|
||||
/* Output 0 carries no twiddle factor. */
cr[0] = T5 + Ts;
|
||||
ci[0] = T10 + T13;
|
||||
/* Outputs 3, 9, 12, 6. */
{
|
||||
E TT, T19, T1k, T1h, T16, T1l, Tw, T1g;
|
||||
TT = FNMS(KP951056516, TS, KP587785252 * TH);
|
||||
T19 = FNMS(KP951056516, T18, KP587785252 * T17);
|
||||
T1k = FMA(KP951056516, T17, KP587785252 * T18);
|
||||
T1h = FMA(KP951056516, TH, KP587785252 * TS);
|
||||
{
|
||||
E T14, T15, Tu, Tv;
|
||||
T14 = FNMS(KP250000000, T13, T10);
|
||||
T15 = KP559016994 * (T11 - T12);
|
||||
T16 = T14 - T15;
|
||||
T1l = T15 + T14;
|
||||
Tu = FNMS(KP250000000, Ts, T5);
|
||||
Tv = KP559016994 * (Tg - Tr);
|
||||
Tw = Tu - Tv;
|
||||
T1g = Tv + Tu;
|
||||
}
|
||||
{
|
||||
E TU, T1a, Tt, TV;
|
||||
TU = Tw + TT;
|
||||
T1a = T16 - T19;
|
||||
Tt = W[4];
|
||||
TV = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(TV, T1a, Tt * TU);
|
||||
ci[WS(rs, 3)] = FMA(TV, TU, Tt * T1a);
|
||||
}
|
||||
{
|
||||
E T1o, T1q, T1n, T1p;
|
||||
T1o = T1g + T1h;
|
||||
T1q = T1l - T1k;
|
||||
T1n = W[16];
|
||||
T1p = W[17];
|
||||
cr[WS(rs, 9)] = FNMS(T1p, T1q, T1n * T1o);
|
||||
ci[WS(rs, 9)] = FMA(T1p, T1o, T1n * T1q);
|
||||
}
|
||||
{
|
||||
E T1c, T1e, T1b, T1d;
|
||||
T1c = Tw - TT;
|
||||
T1e = T19 + T16;
|
||||
T1b = W[22];
|
||||
T1d = W[23];
|
||||
cr[WS(rs, 12)] = FNMS(T1d, T1e, T1b * T1c);
|
||||
ci[WS(rs, 12)] = FMA(T1d, T1c, T1b * T1e);
|
||||
}
|
||||
{
|
||||
E T1i, T1m, T1f, T1j;
|
||||
T1i = T1g - T1h;
|
||||
T1m = T1k + T1l;
|
||||
T1f = W[10];
|
||||
T1j = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(T1j, T1m, T1f * T1i);
|
||||
ci[WS(rs, 6)] = FMA(T1j, T1i, T1f * T1m);
|
||||
}
|
||||
}
|
||||
/* Outputs 10, 13, 1, 4, 7. */
{
|
||||
E T21, T2n, T26, T2q, T1M, T2y, T2m, T2f, T2A, T2r, T2x, T2z;
|
||||
{
|
||||
E T1T, T20, T24, T25;
|
||||
T1T = T1P - T1S;
|
||||
T20 = T1W - T1Z;
|
||||
T21 = FMA(KP951056516, T1T, KP587785252 * T20);
|
||||
T2n = FNMS(KP951056516, T20, KP587785252 * T1T);
|
||||
T24 = T1u - T1x;
|
||||
T25 = T1B - T1E;
|
||||
T26 = FMA(KP951056516, T24, KP587785252 * T25);
|
||||
T2q = FNMS(KP951056516, T25, KP587785252 * T24);
|
||||
}
|
||||
{
|
||||
E T1G, T1K, T1L, T29, T2d, T2e;
|
||||
T1G = KP559016994 * (T1y - T1F);
|
||||
T1K = T1y + T1F;
|
||||
T1L = FNMS(KP250000000, T1K, T1J);
|
||||
T1M = T1G + T1L;
|
||||
T2y = T1J + T1K;
|
||||
T2m = T1L - T1G;
|
||||
T29 = KP559016994 * (T27 - T28);
|
||||
T2d = T27 + T28;
|
||||
T2e = FNMS(KP250000000, T2d, T2c);
|
||||
T2f = T29 + T2e;
|
||||
T2A = T2c + T2d;
|
||||
T2r = T2e - T29;
|
||||
}
|
||||
T2x = W[18];
|
||||
T2z = W[19];
|
||||
cr[WS(rs, 10)] = FNMS(T2z, T2A, T2x * T2y);
|
||||
ci[WS(rs, 10)] = FMA(T2z, T2y, T2x * T2A);
|
||||
{
|
||||
E T2u, T2w, T2t, T2v;
|
||||
T2u = T2m + T2n;
|
||||
T2w = T2r - T2q;
|
||||
T2t = W[24];
|
||||
T2v = W[25];
|
||||
cr[WS(rs, 13)] = FNMS(T2v, T2w, T2t * T2u);
|
||||
ci[WS(rs, 13)] = FMA(T2v, T2u, T2t * T2w);
|
||||
}
|
||||
{
|
||||
E T22, T2g, T1r, T23;
|
||||
T22 = T1M - T21;
|
||||
T2g = T26 + T2f;
|
||||
T1r = W[0];
|
||||
T23 = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(T23, T2g, T1r * T22);
|
||||
ci[WS(rs, 1)] = FMA(T23, T22, T1r * T2g);
|
||||
}
|
||||
{
|
||||
E T2i, T2k, T2h, T2j;
|
||||
T2i = T1M + T21;
|
||||
T2k = T2f - T26;
|
||||
T2h = W[6];
|
||||
T2j = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(T2j, T2k, T2h * T2i);
|
||||
ci[WS(rs, 4)] = FMA(T2j, T2i, T2h * T2k);
|
||||
}
|
||||
{
|
||||
E T2o, T2s, T2l, T2p;
|
||||
T2o = T2m - T2n;
|
||||
T2s = T2q + T2r;
|
||||
T2l = W[12];
|
||||
T2p = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T2p, T2s, T2l * T2o);
|
||||
ci[WS(rs, 7)] = FMA(T2p, T2o, T2l * T2s);
|
||||
}
|
||||
}
|
||||
/* Outputs 5, 14, 2, 8, 11. */
{
|
||||
E T31, T3h, T36, T3k, T2K, T3g, T2Y, T2U, T3l, T39, T2B, T2L;
|
||||
{
|
||||
E T2Z, T30, T34, T35;
|
||||
T2Z = T2N + T2O;
|
||||
T30 = T2Q + T2R;
|
||||
T31 = FNMS(KP951056516, T30, KP587785252 * T2Z);
|
||||
T3h = FMA(KP951056516, T2Z, KP587785252 * T30);
|
||||
T34 = T2D - T2E;
|
||||
T35 = T2G - T2H;
|
||||
T36 = FNMS(KP951056516, T35, KP587785252 * T34);
|
||||
T3k = FMA(KP951056516, T34, KP587785252 * T35);
|
||||
}
|
||||
{
|
||||
E T2X, T2J, T2W, T38, T2T, T37;
|
||||
T2X = KP559016994 * (T2F - T2I);
|
||||
T2J = T2F + T2I;
|
||||
T2W = FNMS(KP250000000, T2J, T2C);
|
||||
T2K = T2C + T2J;
|
||||
T3g = T2X + T2W;
|
||||
T2Y = T2W - T2X;
|
||||
T38 = KP559016994 * (T2P - T2S);
|
||||
T2T = T2P + T2S;
|
||||
T37 = FNMS(KP250000000, T2T, T2M);
|
||||
T2U = T2M + T2T;
|
||||
T3l = T38 + T37;
|
||||
T39 = T37 - T38;
|
||||
}
|
||||
T2B = W[8];
|
||||
T2L = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(T2L, T2U, T2B * T2K);
|
||||
ci[WS(rs, 5)] = FMA(T2L, T2K, T2B * T2U);
|
||||
{
|
||||
E T3o, T3q, T3n, T3p;
|
||||
T3o = T3g + T3h;
|
||||
T3q = T3l - T3k;
|
||||
T3n = W[26];
|
||||
T3p = W[27];
|
||||
cr[WS(rs, 14)] = FNMS(T3p, T3q, T3n * T3o);
|
||||
ci[WS(rs, 14)] = FMA(T3n, T3q, T3p * T3o);
|
||||
}
|
||||
{
|
||||
E T32, T3a, T2V, T33;
|
||||
T32 = T2Y - T31;
|
||||
T3a = T36 + T39;
|
||||
T2V = W[2];
|
||||
T33 = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(T33, T3a, T2V * T32);
|
||||
ci[WS(rs, 2)] = FMA(T2V, T3a, T33 * T32);
|
||||
}
|
||||
{
|
||||
E T3c, T3e, T3b, T3d;
|
||||
T3c = T2Y + T31;
|
||||
T3e = T39 - T36;
|
||||
T3b = W[14];
|
||||
T3d = W[15];
|
||||
cr[WS(rs, 8)] = FNMS(T3d, T3e, T3b * T3c);
|
||||
ci[WS(rs, 8)] = FMA(T3b, T3e, T3d * T3c);
|
||||
}
|
||||
{
|
||||
E T3i, T3m, T3f, T3j;
|
||||
T3i = T3g - T3h;
|
||||
T3m = T3k + T3l;
|
||||
T3f = W[20];
|
||||
T3j = W[21];
|
||||
cr[WS(rs, 11)] = FNMS(T3j, T3m, T3f * T3i);
|
||||
ci[WS(rs, 11)] = FMA(T3f, T3m, T3j * T3i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hb_15 descriptor: one TW_FULL
 * entry with arguments (1, 15) followed by one TW_NEXT entry with (1, 0).
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * they request the full twiddle set for radix 15 and then advance to the next
 * iteration -- confirm against the twiddle headers. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA build of hb_15: radix 15, name "hb_15", the
 * twiddle table, the GENUS object, and the tuple { 128, 56, 56, 0 }, whose
 * first three numbers equal the additions / multiplications / fused
 * multiply-adds quoted in this variant's generator comment. */
static const hc2hc_desc desc = { 15, "hb_15", twinstr, &GENUS, { 128, 56, 56, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hb_15 kernel and its descriptor to the
 * planner p through khc2hc_register. X() is a project macro applied to both
 * symbol names (presumably a precision/namespace prefix -- confirm). */
void X(codelet_hb_15) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_15, &desc);
|
||||
}
|
||||
#endif
|
||||
833
fftw-3.3.10/rdft/scalar/r2cb/hb_16.c
Normal file
833
fftw-3.3.10/rdft/scalar/r2cb/hb_16.c
Normal file
@@ -0,0 +1,833 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:51 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 100 FP multiplications,
|
||||
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
|
||||
* 63 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_16: radix-16 twiddle codelet, FMA variant (emitted by genfft's
 * gen_hc2hc with -fma -sign 1 -dif -n 16, per the generator comment above).
 *
 * cr, ci : data arrays, read and then overwritten in place; element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..15.
 * W      : twiddle table; 30 values (15 pairs) are consumed per iteration.
 * rs     : stride object passed to WS().
 * mb, me : half-open iteration range [mb, me).
 * ms     : per-iteration pointer step; cr advances by +ms, ci by -ms.
 *
 * Output 0 is stored without a twiddle. Output k (1..15) is combined with
 * the pair (W[2k-2], W[2k-1]): one product is formed first and the second
 * is folded in by the FNMS/FMA in the store statement.
 * NOTE(review): FMA/FNMS are macros defined elsewhere; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm in the headers.
 */
static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626); /* cos(pi/8) */
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* sqrt(2)/2 */
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875); /* sqrt(2) - 1 = tan(pi/8) */
|
||||
{
|
||||
INT m;
|
||||
/* W starts at the twiddle row for m = mb (30 values per row); each pass
 * moves cr forward, ci backward and W to the next row. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E TA, T1O, T21, T1h, T2P, T2S, T3b, T3p, T3q, T3D, T1k, T1P, Tf, T3y, T2A;
|
||||
E T36, TL, T22, T3s, T3t, T3z, T2F, T2U, T2K, T2V, Tu, T3E, TX, T1n, T1T;
|
||||
E T24, T1W, T25, T18, T1m;
|
||||
/* Load phase, first half: even-indexed cr inputs and their ci partners,
 * combined pairwise into sums and differences. */
{
|
||||
E T3, Tw, TJ, T2x, T1g, T2Q, T6, T1d, Ta, TB, Tz, T2R, TE, T2y, Td;
|
||||
E TG;
|
||||
{
|
||||
E T1, T2, TH, TI;
|
||||
T1 = cr[0];
|
||||
T2 = ci[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
Tw = T1 - T2;
|
||||
TH = ci[WS(rs, 9)];
|
||||
TI = cr[WS(rs, 14)];
|
||||
TJ = TH + TI;
|
||||
T2x = TH - TI;
|
||||
}
|
||||
{
|
||||
E T1e, T1f, T4, T5;
|
||||
T1e = ci[WS(rs, 15)];
|
||||
T1f = cr[WS(rs, 8)];
|
||||
T1g = T1e + T1f;
|
||||
T2Q = T1e - T1f;
|
||||
T4 = cr[WS(rs, 4)];
|
||||
T5 = ci[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
T1d = T4 - T5;
|
||||
}
|
||||
{
|
||||
E T8, T9, Tx, Ty;
|
||||
T8 = cr[WS(rs, 2)];
|
||||
T9 = ci[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
TB = T8 - T9;
|
||||
Tx = ci[WS(rs, 11)];
|
||||
Ty = cr[WS(rs, 12)];
|
||||
Tz = Tx + Ty;
|
||||
T2R = Tx - Ty;
|
||||
}
|
||||
{
|
||||
E TC, TD, Tb, Tc;
|
||||
TC = ci[WS(rs, 13)];
|
||||
TD = cr[WS(rs, 10)];
|
||||
TE = TC + TD;
|
||||
T2y = TC - TD;
|
||||
Tb = ci[WS(rs, 1)];
|
||||
Tc = cr[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
TG = Tb - Tc;
|
||||
}
|
||||
TA = Tw - Tz;
|
||||
T1O = Tw + Tz;
|
||||
T21 = T1g - T1d;
|
||||
T1h = T1d + T1g;
|
||||
T2P = Ta - Td;
|
||||
T2S = T2Q - T2R;
|
||||
T3b = T2S - T2P;
|
||||
{
|
||||
E T1i, T1j, T7, Te;
|
||||
T3p = T2Q + T2R;
|
||||
T3q = T2y + T2x;
|
||||
T3D = T3p - T3q;
|
||||
T1i = TB + TE;
|
||||
T1j = TG + TJ;
|
||||
T1k = T1i - T1j;
|
||||
T1P = T1i + T1j;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 + Te;
|
||||
T3y = T7 - Te;
|
||||
{
|
||||
E T2w, T2z, TF, TK;
|
||||
T2w = T3 - T6;
|
||||
T2z = T2x - T2y;
|
||||
T2A = T2w + T2z;
|
||||
T36 = T2w - T2z;
|
||||
TF = TB - TE;
|
||||
TK = TG - TJ;
|
||||
TL = TF + TK;
|
||||
T22 = TF - TK;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Load phase, second half: odd-indexed cr inputs and their ci partners. */
{
|
||||
E Ti, T13, T11, T2C, T16, T2D, Tl, TY, Tp, TS, TQ, T2H, TV, T2I, Ts;
|
||||
E TN, T2B, T2E;
|
||||
{
|
||||
E Tg, Th, TZ, T10;
|
||||
Tg = cr[WS(rs, 1)];
|
||||
Th = ci[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
T13 = Tg - Th;
|
||||
TZ = ci[WS(rs, 14)];
|
||||
T10 = cr[WS(rs, 9)];
|
||||
T11 = TZ + T10;
|
||||
T2C = TZ - T10;
|
||||
}
|
||||
{
|
||||
E T14, T15, Tj, Tk;
|
||||
T14 = ci[WS(rs, 10)];
|
||||
T15 = cr[WS(rs, 13)];
|
||||
T16 = T14 + T15;
|
||||
T2D = T14 - T15;
|
||||
Tj = cr[WS(rs, 5)];
|
||||
Tk = ci[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
TY = Tj - Tk;
|
||||
}
|
||||
{
|
||||
E Tn, To, TO, TP;
|
||||
Tn = ci[0];
|
||||
To = cr[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
TS = Tn - To;
|
||||
TO = ci[WS(rs, 8)];
|
||||
TP = cr[WS(rs, 15)];
|
||||
TQ = TO + TP;
|
||||
T2H = TO - TP;
|
||||
}
|
||||
{
|
||||
E TT, TU, Tq, Tr;
|
||||
TT = ci[WS(rs, 12)];
|
||||
TU = cr[WS(rs, 11)];
|
||||
TV = TT + TU;
|
||||
T2I = TT - TU;
|
||||
Tq = cr[WS(rs, 3)];
|
||||
Tr = ci[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
TN = Tq - Tr;
|
||||
}
|
||||
T3s = T2C + T2D;
|
||||
T3t = T2H + T2I;
|
||||
T3z = T3t - T3s;
|
||||
T2B = Ti - Tl;
|
||||
T2E = T2C - T2D;
|
||||
T2F = T2B - T2E;
|
||||
T2U = T2B + T2E;
|
||||
{
|
||||
E T2G, T2J, Tm, Tt;
|
||||
T2G = Tp - Ts;
|
||||
T2J = T2H - T2I;
|
||||
T2K = T2G + T2J;
|
||||
T2V = T2J - T2G;
|
||||
Tm = Ti + Tl;
|
||||
Tt = Tp + Ts;
|
||||
Tu = Tm + Tt;
|
||||
T3E = Tm - Tt;
|
||||
}
|
||||
{
|
||||
E TR, TW, T1R, T1S;
|
||||
TR = TN - TQ;
|
||||
TW = TS - TV;
|
||||
TX = FNMS(KP414213562, TW, TR);
|
||||
T1n = FMA(KP414213562, TR, TW);
|
||||
T1R = T11 - TY;
|
||||
T1S = T13 + T16;
|
||||
T1T = FNMS(KP414213562, T1S, T1R);
|
||||
T24 = FMA(KP414213562, T1R, T1S);
|
||||
}
|
||||
{
|
||||
E T1U, T1V, T12, T17;
|
||||
T1U = TN + TQ;
|
||||
T1V = TS + TV;
|
||||
T1W = FNMS(KP414213562, T1V, T1U);
|
||||
T25 = FMA(KP414213562, T1U, T1V);
|
||||
T12 = TY + T11;
|
||||
T17 = T13 - T16;
|
||||
T18 = FMA(KP414213562, T17, T12);
|
||||
T1m = FNMS(KP414213562, T12, T17);
|
||||
}
|
||||
}
|
||||
/* Output 0 carries no twiddle factor (its ci half is stored in the next block). */
cr[0] = Tf + Tu;
|
||||
/* Outputs 0 (ci) and 8. */
{
|
||||
E T3r, T3u, T3v, T3l, T3n, T3o, T3w, T3m;
|
||||
T3r = T3p + T3q;
|
||||
T3u = T3s + T3t;
|
||||
T3v = T3r - T3u;
|
||||
T3m = Tf - Tu;
|
||||
T3l = W[14];
|
||||
T3n = T3l * T3m;
|
||||
T3o = W[15];
|
||||
T3w = T3o * T3m;
|
||||
ci[0] = T3r + T3u;
|
||||
ci[WS(rs, 8)] = FMA(T3l, T3v, T3w);
|
||||
cr[WS(rs, 8)] = FNMS(T3o, T3v, T3n);
|
||||
}
|
||||
/* Output 12. */
{
|
||||
E T3A, T3F, T3B, T3G, T3x, T3C;
|
||||
T3A = T3y - T3z;
|
||||
T3F = T3D - T3E;
|
||||
T3x = W[22];
|
||||
T3B = T3x * T3A;
|
||||
T3G = T3x * T3F;
|
||||
T3C = W[23];
|
||||
cr[WS(rs, 12)] = FNMS(T3C, T3F, T3B);
|
||||
ci[WS(rs, 12)] = FMA(T3C, T3A, T3G);
|
||||
}
|
||||
/* Output 4. */
{
|
||||
E T3I, T3L, T3J, T3M, T3H, T3K;
|
||||
T3I = T3y + T3z;
|
||||
T3L = T3E + T3D;
|
||||
T3H = W[6];
|
||||
T3J = T3H * T3I;
|
||||
T3M = T3H * T3L;
|
||||
T3K = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(T3K, T3L, T3J);
|
||||
ci[WS(rs, 4)] = FMA(T3K, T3I, T3M);
|
||||
}
|
||||
/* Outputs 14 and 6. */
{
|
||||
E T38, T3g, T3d, T3j, T37, T3c;
|
||||
T37 = T2V - T2U;
|
||||
T38 = FNMS(KP707106781, T37, T36);
|
||||
T3g = FMA(KP707106781, T37, T36);
|
||||
T3c = T2F - T2K;
|
||||
T3d = FNMS(KP707106781, T3c, T3b);
|
||||
T3j = FMA(KP707106781, T3c, T3b);
|
||||
{
|
||||
E T39, T3e, T35, T3a;
|
||||
T35 = W[26];
|
||||
T39 = T35 * T38;
|
||||
T3e = T35 * T3d;
|
||||
T3a = W[27];
|
||||
cr[WS(rs, 14)] = FNMS(T3a, T3d, T39);
|
||||
ci[WS(rs, 14)] = FMA(T3a, T38, T3e);
|
||||
}
|
||||
{
|
||||
E T3h, T3k, T3f, T3i;
|
||||
T3f = W[10];
|
||||
T3h = T3f * T3g;
|
||||
T3k = T3f * T3j;
|
||||
T3i = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(T3i, T3j, T3h);
|
||||
ci[WS(rs, 6)] = FMA(T3i, T3g, T3k);
|
||||
}
|
||||
}
|
||||
/* Outputs 10 and 2. */
{
|
||||
E T2M, T30, T2X, T33, T2L, T2T, T2W;
|
||||
T2L = T2F + T2K;
|
||||
T2M = FNMS(KP707106781, T2L, T2A);
|
||||
T30 = FMA(KP707106781, T2L, T2A);
|
||||
T2T = T2P + T2S;
|
||||
T2W = T2U + T2V;
|
||||
T2X = FNMS(KP707106781, T2W, T2T);
|
||||
T33 = FMA(KP707106781, T2W, T2T);
|
||||
{
|
||||
E T2v, T2N, T2O, T2Y;
|
||||
T2v = W[18];
|
||||
T2N = T2v * T2M;
|
||||
T2O = W[19];
|
||||
T2Y = T2O * T2M;
|
||||
cr[WS(rs, 10)] = FNMS(T2O, T2X, T2N);
|
||||
ci[WS(rs, 10)] = FMA(T2v, T2X, T2Y);
|
||||
}
|
||||
{
|
||||
E T2Z, T31, T32, T34;
|
||||
T2Z = W[2];
|
||||
T31 = T2Z * T30;
|
||||
T32 = W[3];
|
||||
T34 = T32 * T30;
|
||||
cr[WS(rs, 2)] = FNMS(T32, T33, T31);
|
||||
ci[WS(rs, 2)] = FMA(T2Z, T33, T34);
|
||||
}
|
||||
}
|
||||
/* Outputs 11 and 3. */
{
|
||||
E T1Y, T2a, T27, T2d;
|
||||
{
|
||||
E T1Q, T1X, T23, T26;
|
||||
T1Q = FNMS(KP707106781, T1P, T1O);
|
||||
T1X = T1T + T1W;
|
||||
T1Y = FMA(KP923879532, T1X, T1Q);
|
||||
T2a = FNMS(KP923879532, T1X, T1Q);
|
||||
T23 = FMA(KP707106781, T22, T21);
|
||||
T26 = T24 - T25;
|
||||
T27 = FNMS(KP923879532, T26, T23);
|
||||
T2d = FMA(KP923879532, T26, T23);
|
||||
}
|
||||
{
|
||||
E T1N, T1Z, T20, T28;
|
||||
T1N = W[20];
|
||||
T1Z = T1N * T1Y;
|
||||
T20 = W[21];
|
||||
T28 = T20 * T1Y;
|
||||
cr[WS(rs, 11)] = FNMS(T20, T27, T1Z);
|
||||
ci[WS(rs, 11)] = FMA(T1N, T27, T28);
|
||||
}
|
||||
{
|
||||
E T29, T2b, T2c, T2e;
|
||||
T29 = W[4];
|
||||
T2b = T29 * T2a;
|
||||
T2c = W[5];
|
||||
T2e = T2c * T2a;
|
||||
cr[WS(rs, 3)] = FNMS(T2c, T2d, T2b);
|
||||
ci[WS(rs, 3)] = FMA(T29, T2d, T2e);
|
||||
}
|
||||
}
|
||||
/* Outputs 13 and 5. */
{
|
||||
E T1a, T1s, T1p, T1v;
|
||||
{
|
||||
E TM, T19, T1l, T1o;
|
||||
TM = FNMS(KP707106781, TL, TA);
|
||||
T19 = TX - T18;
|
||||
T1a = FNMS(KP923879532, T19, TM);
|
||||
T1s = FMA(KP923879532, T19, TM);
|
||||
T1l = FNMS(KP707106781, T1k, T1h);
|
||||
T1o = T1m - T1n;
|
||||
T1p = FNMS(KP923879532, T1o, T1l);
|
||||
T1v = FMA(KP923879532, T1o, T1l);
|
||||
}
|
||||
{
|
||||
E Tv, T1b, T1c, T1q;
|
||||
Tv = W[24];
|
||||
T1b = Tv * T1a;
|
||||
T1c = W[25];
|
||||
T1q = T1c * T1a;
|
||||
cr[WS(rs, 13)] = FNMS(T1c, T1p, T1b);
|
||||
ci[WS(rs, 13)] = FMA(Tv, T1p, T1q);
|
||||
}
|
||||
{
|
||||
E T1r, T1t, T1u, T1w;
|
||||
T1r = W[8];
|
||||
T1t = T1r * T1s;
|
||||
T1u = W[9];
|
||||
T1w = T1u * T1s;
|
||||
cr[WS(rs, 5)] = FNMS(T1u, T1v, T1t);
|
||||
ci[WS(rs, 5)] = FMA(T1r, T1v, T1w);
|
||||
}
|
||||
}
|
||||
/* Outputs 7 and 15. */
{
|
||||
E T2i, T2q, T2n, T2t;
|
||||
{
|
||||
E T2g, T2h, T2l, T2m;
|
||||
T2g = FMA(KP707106781, T1P, T1O);
|
||||
T2h = T24 + T25;
|
||||
T2i = FNMS(KP923879532, T2h, T2g);
|
||||
T2q = FMA(KP923879532, T2h, T2g);
|
||||
T2l = FNMS(KP707106781, T22, T21);
|
||||
T2m = T1W - T1T;
|
||||
T2n = FMA(KP923879532, T2m, T2l);
|
||||
T2t = FNMS(KP923879532, T2m, T2l);
|
||||
}
|
||||
{
|
||||
E T2j, T2o, T2f, T2k;
|
||||
T2f = W[12];
|
||||
T2j = T2f * T2i;
|
||||
T2o = T2f * T2n;
|
||||
T2k = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T2k, T2n, T2j);
|
||||
ci[WS(rs, 7)] = FMA(T2k, T2i, T2o);
|
||||
}
|
||||
{
|
||||
E T2r, T2u, T2p, T2s;
|
||||
T2p = W[28];
|
||||
T2r = T2p * T2q;
|
||||
T2u = T2p * T2t;
|
||||
T2s = W[29];
|
||||
cr[WS(rs, 15)] = FNMS(T2s, T2t, T2r);
|
||||
ci[WS(rs, 15)] = FMA(T2s, T2q, T2u);
|
||||
}
|
||||
}
|
||||
/* Outputs 9 and 1. */
{
|
||||
E T1A, T1I, T1F, T1L;
|
||||
{
|
||||
E T1y, T1z, T1D, T1E;
|
||||
T1y = FMA(KP707106781, TL, TA);
|
||||
T1z = T1m + T1n;
|
||||
T1A = FNMS(KP923879532, T1z, T1y);
|
||||
T1I = FMA(KP923879532, T1z, T1y);
|
||||
T1D = FMA(KP707106781, T1k, T1h);
|
||||
T1E = T18 + TX;
|
||||
T1F = FNMS(KP923879532, T1E, T1D);
|
||||
T1L = FMA(KP923879532, T1E, T1D);
|
||||
}
|
||||
{
|
||||
E T1B, T1G, T1x, T1C;
|
||||
T1x = W[16];
|
||||
T1B = T1x * T1A;
|
||||
T1G = T1x * T1F;
|
||||
T1C = W[17];
|
||||
cr[WS(rs, 9)] = FNMS(T1C, T1F, T1B);
|
||||
ci[WS(rs, 9)] = FMA(T1C, T1A, T1G);
|
||||
}
|
||||
{
|
||||
E T1J, T1M, T1H, T1K;
|
||||
T1H = W[0];
|
||||
T1J = T1H * T1I;
|
||||
T1M = T1H * T1L;
|
||||
T1K = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(T1K, T1L, T1J);
|
||||
ci[WS(rs, 1)] = FMA(T1K, T1I, T1M);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hb_16 descriptor: one TW_FULL
 * entry with arguments (1, 16) followed by one TW_NEXT entry with (1, 0).
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * they request the full twiddle set for radix 16 and then advance to the next
 * iteration -- confirm against the twiddle headers. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of hb_16: radix 16, name "hb_16", the twiddle
 * table, the GENUS object, and the tuple { 104, 30, 70, 0 }, whose first
 * three numbers equal the additions / multiplications / fused multiply-adds
 * quoted in this variant's generator comment. */
static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, { 104, 30, 70, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hb_16 kernel and its descriptor to the
 * planner p through khc2hc_register. X() is a project macro applied to both
 * symbol names (presumably a precision/namespace prefix -- confirm). */
void X(codelet_hb_16) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_16, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 84 FP multiplications,
|
||||
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
|
||||
* 50 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_16: radix-16 twiddle codelet, non-FMA variant (emitted by genfft's
 * gen_hc2hc with -sign 1 -dif -n 16, per the generator comment above).
 *
 * cr, ci : data arrays, read and then overwritten in place; element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..15.
 * W      : twiddle table; 30 values (15 pairs) are consumed per iteration.
 * rs     : stride object passed to WS().
 * mb, me : half-open iteration range [mb, me).
 * ms     : per-iteration pointer step; cr advances by +ms, ci by -ms.
 *
 * Output 0 is stored without a twiddle. Output k (1..15) is combined with
 * the pair (W[2k-2], W[2k-1]) through FMA/FNMS before being stored.
 * NOTE(review): FMA/FNMS are macros defined elsewhere; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b, which would make every
 * store a complex multiplication by the twiddle -- confirm in the headers.
 */
static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562); /* sin(pi/8) */
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626); /* cos(pi/8) */
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* sqrt(2)/2 */
|
||||
{
|
||||
INT m;
|
||||
/* W starts at the twiddle row for m = mb (30 values per row); each pass
 * moves cr forward, ci backward and W to the next row. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T7, T2K, T2W, Tw, T17, T1S, T2k, T1w, Te, TD, T1x, T10, T2n, T2L, T1Z;
|
||||
E T2X, Tm, T1z, TN, T19, T2e, T2p, T2P, T2Z, Tt, T1A, TW, T1a, T27, T2q;
|
||||
E T2S, T30;
|
||||
/* Load phase: four groups of eight reads, each reduced to sums and
 * differences before the outputs are formed. */
{
|
||||
E T3, T1Q, T16, T1R, T6, T2i, T13, T2j;
|
||||
{
|
||||
E T1, T2, T14, T15;
|
||||
T1 = cr[0];
|
||||
T2 = ci[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
T1Q = T1 - T2;
|
||||
T14 = ci[WS(rs, 11)];
|
||||
T15 = cr[WS(rs, 12)];
|
||||
T16 = T14 - T15;
|
||||
T1R = T14 + T15;
|
||||
}
|
||||
{
|
||||
E T4, T5, T11, T12;
|
||||
T4 = cr[WS(rs, 4)];
|
||||
T5 = ci[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
T2i = T4 - T5;
|
||||
T11 = ci[WS(rs, 15)];
|
||||
T12 = cr[WS(rs, 8)];
|
||||
T13 = T11 - T12;
|
||||
T2j = T11 + T12;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T2K = T1Q + T1R;
|
||||
T2W = T2j - T2i;
|
||||
Tw = T3 - T6;
|
||||
T17 = T13 - T16;
|
||||
T1S = T1Q - T1R;
|
||||
T2k = T2i + T2j;
|
||||
T1w = T13 + T16;
|
||||
}
|
||||
{
|
||||
E Ta, T1T, TC, T1U, Td, T1W, Tz, T1X;
|
||||
{
|
||||
E T8, T9, TA, TB;
|
||||
T8 = cr[WS(rs, 2)];
|
||||
T9 = ci[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
T1T = T8 - T9;
|
||||
TA = ci[WS(rs, 13)];
|
||||
TB = cr[WS(rs, 10)];
|
||||
TC = TA - TB;
|
||||
T1U = TA + TB;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Tx, Ty;
|
||||
Tb = ci[WS(rs, 1)];
|
||||
Tc = cr[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
T1W = Tb - Tc;
|
||||
Tx = ci[WS(rs, 9)];
|
||||
Ty = cr[WS(rs, 14)];
|
||||
Tz = Tx - Ty;
|
||||
T1X = Tx + Ty;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
TD = Tz - TC;
|
||||
T1x = TC + Tz;
|
||||
T10 = Ta - Td;
|
||||
{
|
||||
E T2l, T2m, T1V, T1Y;
|
||||
T2l = T1T + T1U;
|
||||
T2m = T1W + T1X;
|
||||
T2n = KP707106781 * (T2l - T2m);
|
||||
T2L = KP707106781 * (T2l + T2m);
|
||||
T1V = T1T - T1U;
|
||||
T1Y = T1W - T1X;
|
||||
T1Z = KP707106781 * (T1V + T1Y);
|
||||
T2X = KP707106781 * (T1V - T1Y);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Ti, T2b, TL, T2c, Tl, T28, TI, T29, TF, TM;
|
||||
{
|
||||
E Tg, Th, TJ, TK;
|
||||
Tg = cr[WS(rs, 1)];
|
||||
Th = ci[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
T2b = Tg - Th;
|
||||
TJ = ci[WS(rs, 10)];
|
||||
TK = cr[WS(rs, 13)];
|
||||
TL = TJ - TK;
|
||||
T2c = TJ + TK;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, TG, TH;
|
||||
Tj = cr[WS(rs, 5)];
|
||||
Tk = ci[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
T28 = Tj - Tk;
|
||||
TG = ci[WS(rs, 14)];
|
||||
TH = cr[WS(rs, 9)];
|
||||
TI = TG - TH;
|
||||
T29 = TG + TH;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T1z = TI + TL;
|
||||
TF = Ti - Tl;
|
||||
TM = TI - TL;
|
||||
TN = TF - TM;
|
||||
T19 = TF + TM;
|
||||
{
|
||||
E T2a, T2d, T2N, T2O;
|
||||
T2a = T28 + T29;
|
||||
T2d = T2b - T2c;
|
||||
T2e = FMA(KP923879532, T2a, KP382683432 * T2d);
|
||||
T2p = FNMS(KP382683432, T2a, KP923879532 * T2d);
|
||||
T2N = T2b + T2c;
|
||||
T2O = T29 - T28;
|
||||
T2P = FNMS(KP923879532, T2O, KP382683432 * T2N);
|
||||
T2Z = FMA(KP382683432, T2O, KP923879532 * T2N);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tp, T24, TU, T25, Ts, T21, TR, T22, TO, TV;
|
||||
{
|
||||
E Tn, To, TS, TT;
|
||||
Tn = ci[0];
|
||||
To = cr[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T24 = Tn - To;
|
||||
TS = ci[WS(rs, 12)];
|
||||
TT = cr[WS(rs, 11)];
|
||||
TU = TS - TT;
|
||||
T25 = TS + TT;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, TP, TQ;
|
||||
Tq = cr[WS(rs, 3)];
|
||||
Tr = ci[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
T21 = Tq - Tr;
|
||||
TP = ci[WS(rs, 8)];
|
||||
TQ = cr[WS(rs, 15)];
|
||||
TR = TP - TQ;
|
||||
T22 = TP + TQ;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T1A = TR + TU;
|
||||
TO = Tp - Ts;
|
||||
TV = TR - TU;
|
||||
TW = TO + TV;
|
||||
T1a = TV - TO;
|
||||
{
|
||||
E T23, T26, T2Q, T2R;
|
||||
T23 = T21 - T22;
|
||||
T26 = T24 - T25;
|
||||
T27 = FNMS(KP382683432, T26, KP923879532 * T23);
|
||||
T2q = FMA(KP382683432, T23, KP923879532 * T26);
|
||||
T2Q = T24 + T25;
|
||||
T2R = T21 + T22;
|
||||
T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
|
||||
T30 = FMA(KP382683432, T2R, KP923879532 * T2Q);
|
||||
}
|
||||
}
|
||||
/* Outputs 0 (no twiddle) and 8. */
{
|
||||
E Tf, Tu, T1u, T1y, T1B, T1C, T1t, T1v;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
T1u = Tf - Tu;
|
||||
T1y = T1w + T1x;
|
||||
T1B = T1z + T1A;
|
||||
T1C = T1y - T1B;
|
||||
cr[0] = Tf + Tu;
|
||||
ci[0] = T1y + T1B;
|
||||
T1t = W[14];
|
||||
T1v = W[15];
|
||||
cr[WS(rs, 8)] = FNMS(T1v, T1C, T1t * T1u);
|
||||
ci[WS(rs, 8)] = FMA(T1v, T1u, T1t * T1C);
|
||||
}
|
||||
/* Outputs 11 and 3. */
{
|
||||
E T2U, T34, T32, T36;
|
||||
{
|
||||
E T2M, T2T, T2Y, T31;
|
||||
T2M = T2K - T2L;
|
||||
T2T = T2P + T2S;
|
||||
T2U = T2M - T2T;
|
||||
T34 = T2M + T2T;
|
||||
T2Y = T2W + T2X;
|
||||
T31 = T2Z - T30;
|
||||
T32 = T2Y - T31;
|
||||
T36 = T2Y + T31;
|
||||
}
|
||||
{
|
||||
E T2J, T2V, T33, T35;
|
||||
T2J = W[20];
|
||||
T2V = W[21];
|
||||
cr[WS(rs, 11)] = FNMS(T2V, T32, T2J * T2U);
|
||||
ci[WS(rs, 11)] = FMA(T2V, T2U, T2J * T32);
|
||||
T33 = W[4];
|
||||
T35 = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(T35, T36, T33 * T34);
|
||||
ci[WS(rs, 3)] = FMA(T35, T34, T33 * T36);
|
||||
}
|
||||
}
|
||||
/* Outputs 7 and 15. */
{
|
||||
E T3a, T3g, T3e, T3i;
|
||||
{
|
||||
E T38, T39, T3c, T3d;
|
||||
T38 = T2K + T2L;
|
||||
T39 = T2Z + T30;
|
||||
T3a = T38 - T39;
|
||||
T3g = T38 + T39;
|
||||
T3c = T2W - T2X;
|
||||
T3d = T2P - T2S;
|
||||
T3e = T3c + T3d;
|
||||
T3i = T3c - T3d;
|
||||
}
|
||||
{
|
||||
E T37, T3b, T3f, T3h;
|
||||
T37 = W[12];
|
||||
T3b = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T3b, T3e, T37 * T3a);
|
||||
ci[WS(rs, 7)] = FMA(T37, T3e, T3b * T3a);
|
||||
T3f = W[28];
|
||||
T3h = W[29];
|
||||
cr[WS(rs, 15)] = FNMS(T3h, T3i, T3f * T3g);
|
||||
ci[WS(rs, 15)] = FMA(T3f, T3i, T3h * T3g);
|
||||
}
|
||||
}
|
||||
/* Outputs 10 and 2. */
{
|
||||
E TY, T1e, T1c, T1g;
|
||||
{
|
||||
E TE, TX, T18, T1b;
|
||||
TE = Tw + TD;
|
||||
TX = KP707106781 * (TN + TW);
|
||||
TY = TE - TX;
|
||||
T1e = TE + TX;
|
||||
T18 = T10 + T17;
|
||||
T1b = KP707106781 * (T19 + T1a);
|
||||
T1c = T18 - T1b;
|
||||
T1g = T18 + T1b;
|
||||
}
|
||||
{
|
||||
E Tv, TZ, T1d, T1f;
|
||||
Tv = W[18];
|
||||
TZ = W[19];
|
||||
cr[WS(rs, 10)] = FNMS(TZ, T1c, Tv * TY);
|
||||
ci[WS(rs, 10)] = FMA(TZ, TY, Tv * T1c);
|
||||
T1d = W[2];
|
||||
T1f = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
|
||||
ci[WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
|
||||
}
|
||||
}
|
||||
/* Outputs 14 and 6. */
{
|
||||
E T1k, T1q, T1o, T1s;
|
||||
{
|
||||
E T1i, T1j, T1m, T1n;
|
||||
T1i = Tw - TD;
|
||||
T1j = KP707106781 * (T1a - T19);
|
||||
T1k = T1i - T1j;
|
||||
T1q = T1i + T1j;
|
||||
T1m = T17 - T10;
|
||||
T1n = KP707106781 * (TN - TW);
|
||||
T1o = T1m - T1n;
|
||||
T1s = T1m + T1n;
|
||||
}
|
||||
{
|
||||
E T1h, T1l, T1p, T1r;
|
||||
T1h = W[26];
|
||||
T1l = W[27];
|
||||
cr[WS(rs, 14)] = FNMS(T1l, T1o, T1h * T1k);
|
||||
ci[WS(rs, 14)] = FMA(T1h, T1o, T1l * T1k);
|
||||
T1p = W[10];
|
||||
T1r = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(T1r, T1s, T1p * T1q);
|
||||
ci[WS(rs, 6)] = FMA(T1p, T1s, T1r * T1q);
|
||||
}
|
||||
}
|
||||
/* Outputs 13 and 5. */
{
|
||||
E T2g, T2u, T2s, T2w;
|
||||
{
|
||||
E T20, T2f, T2o, T2r;
|
||||
T20 = T1S - T1Z;
|
||||
T2f = T27 - T2e;
|
||||
T2g = T20 - T2f;
|
||||
T2u = T20 + T2f;
|
||||
T2o = T2k - T2n;
|
||||
T2r = T2p - T2q;
|
||||
T2s = T2o - T2r;
|
||||
T2w = T2o + T2r;
|
||||
}
|
||||
{
|
||||
E T1P, T2h, T2t, T2v;
|
||||
T1P = W[24];
|
||||
T2h = W[25];
|
||||
cr[WS(rs, 13)] = FNMS(T2h, T2s, T1P * T2g);
|
||||
ci[WS(rs, 13)] = FMA(T2h, T2g, T1P * T2s);
|
||||
T2t = W[8];
|
||||
T2v = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(T2v, T2w, T2t * T2u);
|
||||
ci[WS(rs, 5)] = FMA(T2v, T2u, T2t * T2w);
|
||||
}
|
||||
}
|
||||
/* Outputs 9 and 1. */
{
|
||||
E T2A, T2G, T2E, T2I;
|
||||
{
|
||||
E T2y, T2z, T2C, T2D;
|
||||
T2y = T1S + T1Z;
|
||||
T2z = T2p + T2q;
|
||||
T2A = T2y - T2z;
|
||||
T2G = T2y + T2z;
|
||||
T2C = T2k + T2n;
|
||||
T2D = T2e + T27;
|
||||
T2E = T2C - T2D;
|
||||
T2I = T2C + T2D;
|
||||
}
|
||||
{
|
||||
E T2x, T2B, T2F, T2H;
|
||||
T2x = W[16];
|
||||
T2B = W[17];
|
||||
cr[WS(rs, 9)] = FNMS(T2B, T2E, T2x * T2A);
|
||||
ci[WS(rs, 9)] = FMA(T2x, T2E, T2B * T2A);
|
||||
T2F = W[0];
|
||||
T2H = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(T2H, T2I, T2F * T2G);
|
||||
ci[WS(rs, 1)] = FMA(T2F, T2I, T2H * T2G);
|
||||
}
|
||||
}
|
||||
/* Outputs 12 and 4. */
{
|
||||
E T1G, T1M, T1K, T1O;
|
||||
{
|
||||
E T1E, T1F, T1I, T1J;
|
||||
T1E = T7 - Te;
|
||||
T1F = T1A - T1z;
|
||||
T1G = T1E - T1F;
|
||||
T1M = T1E + T1F;
|
||||
T1I = T1w - T1x;
|
||||
T1J = Tm - Tt;
|
||||
T1K = T1I - T1J;
|
||||
T1O = T1J + T1I;
|
||||
}
|
||||
{
|
||||
E T1D, T1H, T1L, T1N;
|
||||
T1D = W[22];
|
||||
T1H = W[23];
|
||||
cr[WS(rs, 12)] = FNMS(T1H, T1K, T1D * T1G);
|
||||
ci[WS(rs, 12)] = FMA(T1D, T1K, T1H * T1G);
|
||||
T1L = W[6];
|
||||
T1N = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(T1N, T1O, T1L * T1M);
|
||||
ci[WS(rs, 4)] = FMA(T1L, T1O, T1N * T1M);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, { 136, 46, 38, 0 } };
|
||||
|
||||
void X(codelet_hb_16) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_16, &desc);
|
||||
}
|
||||
#endif
|
||||
117
fftw-3.3.10/rdft/scalar/r2cb/hb_2.c
Normal file
117
fftw-3.3.10/rdft/scalar/r2cb/hb_2.c
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hb_2 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 11 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_2 -- size-2 codelet, FMA-oriented schedule (this copy is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the routine reads cr[0], ci[0], cr[WS(rs, 1)]
 * and ci[WS(rs, 1)], stores their plain sum/difference into cr[0]/ci[0],
 * and stores the W[0]/W[1]-weighted combination into slot WS(rs, 1).
 * After each iteration cr advances by ms, ci retreats by ms and W
 * advances by 2 entries.
 *
 * E, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Move W forward by (mb - 1) * 2 entries before the first iteration. */
     W += (mb - 1) * 2;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
          E cr0 = cr[0];
          E ci0 = ci[0];
          E dif0 = cr0 - ci0;
          E ci1 = ci[WS(rs, 1)];
          E cr1 = cr[WS(rs, 1)];
          E sum1 = ci1 + cr1;

          cr[0] = cr0 + ci0;
          ci[0] = ci1 - cr1;
          {
               /* Both products use dif0; sum1 enters through the fused op. */
               E w0 = W[0];
               E w0_dif = w0 * dif0;
               E w1 = W[1];
               E w1_dif = w1 * dif0;

               cr[WS(rs, 1)] = FNMS(w1, sum1, w0_dif);
               ci[WS(rs, 1)] = FMA(w0, sum1, w1_dif);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 2, "hb_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
|
||||
|
||||
void X(codelet_hb_2) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_2, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hb_2 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 9 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_2 -- size-2 codelet, schedule used when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the routine reads cr[0], ci[0], cr[WS(rs, 1)]
 * and ci[WS(rs, 1)], stores their plain sum/difference into cr[0]/ci[0],
 * and stores the W[0]/W[1]-weighted combination into slot WS(rs, 1).
 * After each iteration cr advances by ms, ci retreats by ms and W
 * advances by 2 entries.
 *
 * E, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Move W forward by (mb - 1) * 2 entries before the first iteration. */
     W += (mb - 1) * 2;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
          E cr0 = cr[0];
          E ci0 = ci[0];
          E dif0 = cr0 - ci0;
          E ci1 = ci[WS(rs, 1)];
          E cr1 = cr[WS(rs, 1)];
          E sum1 = ci1 + cr1;

          cr[0] = cr0 + ci0;
          ci[0] = ci1 - cr1;
          {
               E w0 = W[0];
               E w1 = W[1];

               cr[WS(rs, 1)] = FNMS(w1, sum1, w0 * dif0);
               ci[WS(rs, 1)] = FMA(w1, dif0, w0 * sum1);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 2, "hb_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
|
||||
|
||||
void X(codelet_hb_2) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_2, &desc);
|
||||
}
|
||||
#endif
|
||||
1064
fftw-3.3.10/rdft/scalar/r2cb/hb_20.c
Normal file
1064
fftw-3.3.10/rdft/scalar/r2cb/hb_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1609
fftw-3.3.10/rdft/scalar/r2cb/hb_25.c
Normal file
1609
fftw-3.3.10/rdft/scalar/r2cb/hb_25.c
Normal file
File diff suppressed because it is too large
Load Diff
166
fftw-3.3.10/rdft/scalar/r2cb/hb_3.c
Normal file
166
fftw-3.3.10/rdft/scalar/r2cb/hb_3.c
Normal file
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -dif -name hb_3 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 14 FP multiplications,
|
||||
* (or, 6 additions, 4 multiplications, 10 fused multiply/add),
|
||||
* 17 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_3 -- size-3 codelet, FMA-oriented schedule (this copy is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the routine reads the six values at slots
 * 0, WS(rs, 1) and WS(rs, 2) of cr and ci, stores unweighted sums into
 * cr[0]/ci[0], and stores combinations weighted by W[0..1] and W[2..3]
 * into slots WS(rs, 1) and WS(rs, 2).  After each iteration cr advances
 * by ms, ci retreats by ms and W advances by 4 entries.
 *
 * E, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);

     /* Move W forward by (mb - 1) * 4 entries before the first iteration. */
     W += (mb - 1) * 4;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
          E cr0 = cr[0];
          E cr1 = cr[WS(rs, 1)];
          E ci0 = ci[0];
          E sum_a = cr1 + ci0;
          E mid_a = FNMS(KP500000000, sum_a, cr0);
          E dif_a = cr1 - ci0;
          E ci2 = ci[WS(rs, 2)];
          E ci1 = ci[WS(rs, 1)];
          E cr2 = cr[WS(rs, 2)];
          E dif_b = ci1 - cr2;
          E sum_b = ci1 + cr2;
          E mid_b = FNMS(KP500000000, dif_b, ci2);

          cr[0] = cr0 + sum_a;
          ci[0] = ci2 + dif_b;
          {
               /* Slot 1: weighted by W[0], W[1]. */
               E y = FMA(KP866025403, dif_a, mid_b);
               E x = FNMS(KP866025403, sum_b, mid_a);
               E w0 = W[0];
               E w0x = w0 * x;
               E w1 = W[1];
               E w1x = w1 * x;

               cr[WS(rs, 1)] = FNMS(w1, y, w0x);
               ci[WS(rs, 1)] = FMA(w0, y, w1x);
          }
          {
               /* Slot 2: weighted by W[2], W[3]. */
               E y = FNMS(KP866025403, dif_a, mid_b);
               E x = FMA(KP866025403, sum_b, mid_a);
               E w2 = W[2];
               E w2x = w2 * x;
               E w3 = W[3];
               E w3x = w3 * x;

               cr[WS(rs, 2)] = FNMS(w3, y, w2x);
               ci[WS(rs, 2)] = FMA(w2, y, w3x);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 3, "hb_3", twinstr, &GENUS, { 6, 4, 10, 0 } };
|
||||
|
||||
void X(codelet_hb_3) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_3, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -dif -name hb_3 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 12 FP multiplications,
|
||||
* (or, 10 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_3 -- size-3 codelet, schedule used when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the routine reads the six values at slots
 * 0, WS(rs, 1) and WS(rs, 2) of cr and ci, stores unweighted sums into
 * cr[0]/ci[0], and stores combinations weighted by W[0..1] and W[2..3]
 * into slots WS(rs, 1) and WS(rs, 2).  After each iteration cr advances
 * by ms, ci retreats by ms and W advances by 4 entries.
 *
 * E, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);

     /* Move W forward by (mb - 1) * 4 entries before the first iteration. */
     W += (mb - 1) * 4;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
          E cr0 = cr[0];
          E cr1 = cr[WS(rs, 1)];
          E ci0 = ci[0];
          E sum_a = cr1 + ci0;
          E mid_a = FNMS(KP500000000, sum_a, cr0);
          E rot_a = KP866025403 * (cr1 - ci0);
          E ci2 = ci[WS(rs, 2)];
          E ci1 = ci[WS(rs, 1)];
          E cr2 = cr[WS(rs, 2)];
          E dif_b = ci1 - cr2;
          E rot_b = KP866025403 * (ci1 + cr2);
          E mid_b = FNMS(KP500000000, dif_b, ci2);

          cr[0] = cr0 + sum_a;
          ci[0] = ci2 + dif_b;
          {
               /* Slot 1: weighted by W[0], W[1]. */
               E x = mid_a - rot_b;
               E y = rot_a + mid_b;
               E w0 = W[0];
               E w1 = W[1];

               cr[WS(rs, 1)] = FNMS(w1, y, w0 * x);
               ci[WS(rs, 1)] = FMA(w0, y, w1 * x);
          }
          {
               /* Slot 2: weighted by W[2], W[3]. */
               E x = mid_a + rot_b;
               E y = mid_b - rot_a;
               E w2 = W[2];
               E w3 = W[3];

               cr[WS(rs, 2)] = FNMS(w3, y, w2 * x);
               ci[WS(rs, 2)] = FMA(w2, y, w3 * x);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 3, "hb_3", twinstr, &GENUS, { 10, 6, 6, 0 } };
|
||||
|
||||
void X(codelet_hb_3) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_3, &desc);
|
||||
}
|
||||
#endif
|
||||
1843
fftw-3.3.10/rdft/scalar/r2cb/hb_32.c
Normal file
1843
fftw-3.3.10/rdft/scalar/r2cb/hb_32.c
Normal file
File diff suppressed because it is too large
Load Diff
196
fftw-3.3.10/rdft/scalar/r2cb/hb_4.c
Normal file
196
fftw-3.3.10/rdft/scalar/r2cb/hb_4.c
Normal file
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hb_4 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 22 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_4 -- size-4 codelet, FMA-oriented schedule (this copy is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the routine reads the eight values at slots
 * 0 and WS(rs, 1..3) of cr and ci, stores unweighted sums into
 * cr[0]/ci[0], and stores combinations weighted by W[0..1], W[2..3] and
 * W[4..5] into slots WS(rs, 1), WS(rs, 2) and WS(rs, 3).  After each
 * iteration cr advances by ms, ci retreats by ms and W advances by 6.
 *
 * E, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Move W forward by (mb - 1) * 6 entries before the first iteration. */
     W += (mb - 1) * 6;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Pairwise sums (sumK) and differences (difK) of the inputs. */
          E ci3 = ci[WS(rs, 3)];
          E cr2 = cr[WS(rs, 2)];
          E sum2 = ci3 + cr2;
          E ci2 = ci[WS(rs, 2)];
          E cr3 = cr[WS(rs, 3)];
          E sum3 = ci2 + cr3;
          E cr0 = cr[0];
          E ci1 = ci[WS(rs, 1)];
          E sum0 = cr0 + ci1;
          E dif0 = cr0 - ci1;
          E cr1 = cr[WS(rs, 1)];
          E ci0 = ci[0];
          E sum1 = cr1 + ci0;
          E dif1 = cr1 - ci0;
          /* Values that get multiplied by the twiddles below. */
          E x2 = sum0 - sum1;
          E dif2 = ci3 - cr2;
          E y3 = sum2 - dif1;
          E x3 = dif0 + sum3;
          E x1 = dif0 - sum3;
          E dif3 = ci2 - cr3;
          E y1 = dif1 + sum2;

          cr[0] = sum0 + sum1;
          ci[0] = dif2 + dif3;
          {
               /* Slot 1: weighted by W[0], W[1]. */
               E w0 = W[0];
               E w0x = w0 * x1;
               E w0y = w0 * y1;
               E w1 = W[1];

               cr[WS(rs, 1)] = FNMS(w1, y1, w0x);
               ci[WS(rs, 1)] = FMA(w1, x1, w0y);
          }
          {
               /* Slot 3: weighted by W[4], W[5]. */
               E w4 = W[4];
               E w4x = w4 * x3;
               E w4y = w4 * y3;
               E w5 = W[5];

               cr[WS(rs, 3)] = FNMS(w5, y3, w4x);
               ci[WS(rs, 3)] = FMA(w5, x3, w4y);
          }
          {
               /* Slot 2: weighted by W[2], W[3]. */
               E y2 = dif2 - dif3;
               E w3 = W[3];
               E w3x = w3 * x2;
               E w2 = W[2];
               E w2x = w2 * x2;

               cr[WS(rs, 2)] = FNMS(w3, y2, w2x);
               ci[WS(rs, 2)] = FMA(w2, y2, w3x);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 4, "hb_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
|
||||
|
||||
void X(codelet_hb_4) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_4, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hb_4 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_4 -- size-4 codelet, schedule used when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the routine reads the eight values at slots
 * 0 and WS(rs, 1..3) of cr and ci, stores unweighted sums into
 * cr[0]/ci[0], and stores combinations weighted by W[0..1], W[2..3] and
 * W[4..5] into slots WS(rs, 1), WS(rs, 2) and WS(rs, 3).  After each
 * iteration cr advances by ms, ci retreats by ms and W advances by 6.
 *
 * E, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Move W forward by (mb - 1) * 6 entries before the first iteration. */
     W += (mb - 1) * 6;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Pairwise sums (sumK) and differences (difK) of the inputs. */
          E cr0 = cr[0];
          E ci1 = ci[WS(rs, 1)];
          E sum0 = cr0 + ci1;
          E dif0 = cr0 - ci1;
          E cr1 = cr[WS(rs, 1)];
          E ci0 = ci[0];
          E sum1 = cr1 + ci0;
          E dif1 = cr1 - ci0;
          E ci3 = ci[WS(rs, 3)];
          E cr2 = cr[WS(rs, 2)];
          E dif2 = ci3 - cr2;
          E sum2 = ci3 + cr2;
          E ci2 = ci[WS(rs, 2)];
          E cr3 = cr[WS(rs, 3)];
          E dif3 = ci2 - cr3;
          E sum3 = ci2 + cr3;

          cr[0] = sum0 + sum1;
          ci[0] = dif2 + dif3;
          {
               /* Slot 2: weighted by W[2], W[3]. */
               E x2 = sum0 - sum1;
               E y2 = dif2 - dif3;
               E w2 = W[2];
               E w3 = W[3];

               cr[WS(rs, 2)] = FNMS(w3, y2, w2 * x2);
               ci[WS(rs, 2)] = FMA(w3, x2, w2 * y2);
          }
          {
               /* Slot 1: weighted by W[0], W[1]. */
               E x1 = dif0 - sum3;
               E y1 = dif1 + sum2;
               E w0 = W[0];
               E w1 = W[1];

               cr[WS(rs, 1)] = FNMS(w1, y1, w0 * x1);
               ci[WS(rs, 1)] = FMA(w0, y1, w1 * x1);
          }
          {
               /* Slot 3: weighted by W[4], W[5]. */
               E x3 = dif0 + sum3;
               E y3 = sum2 - dif1;
               E w4 = W[4];
               E w5 = W[5];

               cr[WS(rs, 3)] = FNMS(w5, y3, w4 * x3);
               ci[WS(rs, 3)] = FMA(w4, y3, w5 * x3);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 4, "hb_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
|
||||
|
||||
void X(codelet_hb_4) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_4, &desc);
|
||||
}
|
||||
#endif
|
||||
274
fftw-3.3.10/rdft/scalar/r2cb/hb_5.c
Normal file
274
fftw-3.3.10/rdft/scalar/r2cb/hb_5.c
Normal file
@@ -0,0 +1,274 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -dif -name hb_5 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 40 FP additions, 34 FP multiplications,
|
||||
* (or, 14 additions, 8 multiplications, 26 fused multiply/add),
|
||||
* 27 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_5 -- size-5 codelet, FMA-oriented schedule (this copy is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the routine reads the ten values at slots
 * 0 and WS(rs, 1..4) of cr and ci, stores unweighted sums into
 * cr[0]/ci[0], and stores combinations weighted by W[2k-2], W[2k-1] into
 * slot WS(rs, k) for k = 1..4.  After each iteration cr advances by ms,
 * ci retreats by ms and W advances by 8 entries.
 *
 * E, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);

     /* Move W forward by (mb - 1) * 8 entries before the first iteration. */
     W += (mb - 1) * 8;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 8, MAKE_VOLATILE_STRIDE(10, rs)) {
          /* Group A: cr[0], cr[1]/ci[0], cr[2]/ci[1]. */
          E cr0 = cr[0];
          E cr1 = cr[WS(rs, 1)];
          E ci0 = ci[0];
          E a_sum1 = cr1 + ci0;
          E a_dif1 = cr1 - ci0;
          E cr2 = cr[WS(rs, 2)];
          E ci1 = ci[WS(rs, 1)];
          E a_sum2 = cr2 + ci1;
          E a_dif2 = cr2 - ci1;
          E a_gap = a_sum1 - a_sum2;
          E a_mix_lo = FNMS(KP618033988, a_dif1, a_dif2);
          E a_mix_hi = FMA(KP618033988, a_dif2, a_dif1);
          E a_total = a_sum1 + a_sum2;
          E a_base = FNMS(KP250000000, a_total, cr0);
          /* Group B: ci[4], ci[3]/cr[4], ci[2]/cr[3]. */
          E ci4 = ci[WS(rs, 4)];
          E ci3 = ci[WS(rs, 3)];
          E cr4 = cr[WS(rs, 4)];
          E b_sum1 = ci3 + cr4;
          E b_dif1 = ci3 - cr4;
          E ci2 = ci[WS(rs, 2)];
          E cr3 = cr[WS(rs, 3)];
          E b_sum2 = ci2 + cr3;
          E b_dif2 = ci2 - cr3;
          E b_mix_hi = FMA(KP618033988, b_sum2, b_sum1);
          E b_mix_lo = FNMS(KP618033988, b_sum1, b_sum2);
          E b_gap = b_dif1 - b_dif2;
          E b_total = b_dif1 + b_dif2;
          E b_base = FNMS(KP250000000, b_total, ci4);

          cr[0] = cr0 + a_total;
          ci[0] = ci4 + b_total;
          {
               /* Slots 1 and 4 share the "+KP559016994" intermediates. */
               E a_plus = FMA(KP559016994, a_gap, a_base);
               E x1 = FNMS(KP951056516, b_mix_hi, a_plus);
               E x4 = FMA(KP951056516, b_mix_hi, a_plus);
               E b_plus = FMA(KP559016994, b_gap, b_base);
               E y1 = FMA(KP951056516, a_mix_hi, b_plus);
               E y4 = FNMS(KP951056516, a_mix_hi, b_plus);

               {
                    E w0 = W[0];
                    E w0x = w0 * x1;
                    E w1 = W[1];
                    E w1x = w1 * x1;

                    cr[WS(rs, 1)] = FNMS(w1, y1, w0x);
                    ci[WS(rs, 1)] = FMA(w0, y1, w1x);
               }
               {
                    E w6 = W[6];
                    E w6x = w6 * x4;
                    E w7 = W[7];
                    E w7x = w7 * x4;

                    cr[WS(rs, 4)] = FNMS(w7, y4, w6x);
                    ci[WS(rs, 4)] = FMA(w6, y4, w7x);
               }
          }
          {
               /* Slots 2 and 3 share the "-KP559016994" intermediates. */
               E a_minus = FNMS(KP559016994, a_gap, a_base);
               E x2 = FMA(KP951056516, b_mix_lo, a_minus);
               E x3 = FNMS(KP951056516, b_mix_lo, a_minus);
               E b_minus = FNMS(KP559016994, b_gap, b_base);
               E y2 = FNMS(KP951056516, a_mix_lo, b_minus);
               E y3 = FMA(KP951056516, a_mix_lo, b_minus);

               {
                    E w2 = W[2];
                    E w2x = w2 * x2;
                    E w3 = W[3];
                    E w3x = w3 * x2;

                    cr[WS(rs, 2)] = FNMS(w3, y2, w2x);
                    ci[WS(rs, 2)] = FMA(w2, y2, w3x);
               }
               {
                    E w4 = W[4];
                    E w4x = w4 * x3;
                    E w5 = W[5];
                    E w5x = w5 * x3;

                    cr[WS(rs, 3)] = FNMS(w5, y3, w4x);
                    ci[WS(rs, 3)] = FMA(w4, y3, w5x);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 5, "hb_5", twinstr, &GENUS, { 14, 8, 26, 0 } };
|
||||
|
||||
void X(codelet_hb_5) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_5, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -dif -name hb_5 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 40 FP additions, 28 FP multiplications,
|
||||
* (or, 26 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 27 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_5 -- size-5 codelet, schedule used when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the routine reads the ten values at slots
 * 0 and WS(rs, 1..4) of cr and ci, stores unweighted sums into
 * cr[0]/ci[0], and stores combinations weighted by W[2k-2], W[2k-1] into
 * slot WS(rs, k) for k = 1..4.  After each iteration cr advances by ms,
 * ci retreats by ms and W advances by 8 entries.
 *
 * E, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);

     /* Move W forward by (mb - 1) * 8 entries before the first iteration. */
     W += (mb - 1) * 8;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 8, MAKE_VOLATILE_STRIDE(10, rs)) {
          /* Group A: cr[0], cr[1]/ci[0], cr[2]/ci[1]. */
          E cr0 = cr[0];
          E cr1 = cr[WS(rs, 1)];
          E ci0 = ci[0];
          E a_sum1 = cr1 + ci0;
          E a_dif1 = cr1 - ci0;
          E cr2 = cr[WS(rs, 2)];
          E ci1 = ci[WS(rs, 1)];
          E a_sum2 = cr2 + ci1;
          E a_dif2 = cr2 - ci1;
          E a_scaled = KP559016994 * (a_sum1 - a_sum2);
          E a_rot_hi = FMA(KP951056516, a_dif1, KP587785252 * a_dif2);
          E a_rot_lo = FNMS(KP951056516, a_dif2, KP587785252 * a_dif1);
          E a_total = a_sum1 + a_sum2;
          E a_base = FNMS(KP250000000, a_total, cr0);
          /* Group B: ci[4], ci[3]/cr[4], ci[2]/cr[3]. */
          E ci4 = ci[WS(rs, 4)];
          E ci3 = ci[WS(rs, 3)];
          E cr4 = cr[WS(rs, 4)];
          E b_dif1 = ci3 - cr4;
          E b_sum1 = ci3 + cr4;
          E ci2 = ci[WS(rs, 2)];
          E cr3 = cr[WS(rs, 3)];
          E b_dif2 = ci2 - cr3;
          E b_sum2 = ci2 + cr3;
          E b_rot_lo = FNMS(KP951056516, b_sum2, KP587785252 * b_sum1);
          E b_rot_hi = FMA(KP951056516, b_sum1, KP587785252 * b_sum2);
          E b_scaled = KP559016994 * (b_dif1 - b_dif2);
          E b_total = b_dif1 + b_dif2;
          E b_base = FNMS(KP250000000, b_total, ci4);

          cr[0] = cr0 + a_total;
          ci[0] = ci4 + b_total;
          {
               /* Slots 2 and 3 share the "base - scaled" intermediates. */
               E a_minus = a_base - a_scaled;
               E x2 = a_minus - b_rot_lo;
               E x3 = a_minus + b_rot_lo;
               E b_minus = b_base - b_scaled;
               E y2 = a_rot_lo + b_minus;
               E y3 = b_minus - a_rot_lo;

               {
                    E w2 = W[2];
                    E w3 = W[3];

                    cr[WS(rs, 2)] = FNMS(w3, y2, w2 * x2);
                    ci[WS(rs, 2)] = FMA(w2, y2, w3 * x2);
               }
               {
                    E w4 = W[4];
                    E w5 = W[5];

                    cr[WS(rs, 3)] = FNMS(w5, y3, w4 * x3);
                    ci[WS(rs, 3)] = FMA(w4, y3, w5 * x3);
               }
          }
          {
               /* Slots 1 and 4 share the "scaled + base" intermediates. */
               E a_plus = a_scaled + a_base;
               E x1 = a_plus - b_rot_hi;
               E x4 = a_plus + b_rot_hi;
               E b_plus = b_scaled + b_base;
               E y1 = a_rot_hi + b_plus;
               E y4 = b_plus - a_rot_hi;

               {
                    E w0 = W[0];
                    E w1 = W[1];

                    cr[WS(rs, 1)] = FNMS(w1, y1, w0 * x1);
                    ci[WS(rs, 1)] = FMA(w0, y1, w1 * x1);
               }
               {
                    E w6 = W[6];
                    E w7 = W[7];

                    cr[WS(rs, 4)] = FNMS(w7, y4, w6 * x4);
                    ci[WS(rs, 4)] = FMA(w6, y4, w7 * x4);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 5, "hb_5", twinstr, &GENUS, { 26, 14, 14, 0 } };
|
||||
|
||||
void X(codelet_hb_5) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_5, &desc);
|
||||
}
|
||||
#endif
|
||||
292
fftw-3.3.10/rdft/scalar/r2cb/hb_6.c
Normal file
292
fftw-3.3.10/rdft/scalar/r2cb/hb_6.c
Normal file
@@ -0,0 +1,292 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hb_6 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 32 FP multiplications,
|
||||
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
|
||||
* 31 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_6 -- size-6 codelet, FMA-oriented schedule (this copy is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the routine reads the twelve values at slots
 * 0 and WS(rs, 1..5) of cr and ci, stores unweighted sums into
 * cr[0]/ci[0], and stores combinations weighted by W[2k-2], W[2k-1] into
 * slot WS(rs, k) for k = 1..5.  After each iteration cr advances by ms,
 * ci retreats by ms and W advances by 10 entries.
 *
 * Local naming: uK* come from the ci[5]/cr[3], ci[3]/cr[5], ci[4]/cr[4]
 * pairs, lK* from the cr[0]/ci[2], cr[2]/ci[0], ci[1]/cr[1] pairs; a
 * trailing p marks a sum and m a difference.
 *
 * E, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied by the FFTW
 * headers.  NOTE(review): FMA(a, b, c) is presumably a*b + c and
 * FNMS(a, b, c) presumably c - a*b -- confirm against those headers.
 */
static void hb_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);

     /* Move W forward by (mb - 1) * 10 entries before the first iteration. */
     W += (mb - 1) * 10;
     for (m = mb; m < me;
          ++m, cr += ms, ci -= ms, W += 10, MAKE_VOLATILE_STRIDE(12, rs)) {
          /* Upper pairs. */
          E ci5 = ci[WS(rs, 5)];
          E cr3 = cr[WS(rs, 3)];
          E u0m = ci5 - cr3;
          E ci3 = ci[WS(rs, 3)];
          E cr5 = cr[WS(rs, 5)];
          E u1m = ci3 - cr5;
          E u1p = ci3 + cr5;
          E ci4 = ci[WS(rs, 4)];
          E cr4 = cr[WS(rs, 4)];
          E u2m = ci4 - cr4;
          E u2p = ci4 + cr4;
          E um_dif = u2m - u1m;
          E up_dif = u1p - u2p;
          E up_sum = u1p + u2p;
          E u0p = ci5 + cr3;
          E um_sum = u1m + u2m;
          E um_mid = FNMS(KP500000000, um_sum, u0m);
          /* Lower pairs. */
          E cr0 = cr[0];
          E ci2 = ci[WS(rs, 2)];
          E l0p = cr0 + ci2;
          E l0m = cr0 - ci2;
          E cr2 = cr[WS(rs, 2)];
          E ci0 = ci[0];
          E l1p = cr2 + ci0;
          E l1m = cr2 - ci0;
          E ci1 = ci[WS(rs, 1)];
          E cr1 = cr[WS(rs, 1)];
          E l2p = ci1 + cr1;
          E l2m = ci1 - cr1;
          E lp_dif = l1p - l2p;
          E lm_dif = l1m - l2m;
          E lp_sum = l1p + l2p;
          E lp_mid = FNMS(KP500000000, lp_sum, l0p);
          E lm_sum = l1m + l2m;
          E lm_mid = FNMS(KP500000000, lm_sum, l0m);

          cr[0] = l0p + lp_sum;
          ci[0] = u0m + um_sum;
          {
               /* Slot 2: weighted by W[2], W[3]. */
               E x = FNMS(KP866025403, um_dif, lp_mid);
               E y = FNMS(KP866025403, lp_dif, um_mid);
               E w2 = W[2];
               E w2x = w2 * x;
               E w2y = w2 * y;
               E w3 = W[3];

               cr[WS(rs, 2)] = FNMS(w3, y, w2x);
               ci[WS(rs, 2)] = FMA(w3, x, w2y);
          }
          {
               /* Slot 3: weighted by W[4], W[5]. */
               E y = u0p + up_dif;
               E x = l0m + lm_sum;
               E w4 = W[4];
               E w4x = w4 * x;
               E w5 = W[5];
               E w5x = w5 * x;

               cr[WS(rs, 3)] = FNMS(w5, y, w4x);
               ci[WS(rs, 3)] = FMA(w4, y, w5x);
          }
          {
               /* Slot 4: weighted by W[6], W[7]. */
               E x = FMA(KP866025403, um_dif, lp_mid);
               E y = FMA(KP866025403, lp_dif, um_mid);
               E w6 = W[6];
               E w6x = w6 * x;
               E w6y = w6 * y;
               E w7 = W[7];

               cr[WS(rs, 4)] = FNMS(w7, y, w6x);
               ci[WS(rs, 4)] = FMA(w7, x, w6y);
          }
          {
               /* Slots 1 and 5: weighted by W[0], W[1] and W[8], W[9]. */
               E up_mid = FNMS(KP500000000, up_dif, u0p);
               E y1 = FMA(KP866025403, lm_dif, up_mid);
               E y5 = FNMS(KP866025403, lm_dif, up_mid);
               E x5 = FMA(KP866025403, up_sum, lm_mid);
               E w8 = W[8];
               E w8x = w8 * x5;
               E w9 = W[9];
               E w9x = w9 * x5;
               E x1 = FNMS(KP866025403, up_sum, lm_mid);
               E w0 = W[0];
               E w0x = w0 * x1;
               E w1 = W[1];
               E w1x = w1 * x1;

               cr[WS(rs, 1)] = FNMS(w1, y1, w0x);
               ci[WS(rs, 1)] = FMA(w0, y1, w1x);
               cr[WS(rs, 5)] = FNMS(w9, y5, w8x);
               ci[WS(rs, 5)] = FMA(w8, y5, w9x);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 6, "hb_6", twinstr, &GENUS, { 24, 10, 22, 0 } };
|
||||
|
||||
void X(codelet_hb_6) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_6, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hb_6 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 28 FP multiplications,
|
||||
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 27 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_6 (variant generated without -fma): straight-line size-6 kernel
 * emitted by gen_hc2hc; see the "Generated by" comment above for the
 * generator options (-sign 1 -n 6 -dif).
 *
 * cr, ci : data arrays, read and overwritten in place. Element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..5.
 * W      : read-only twiddle table; each iteration reads W[0]..W[9].
 * rs     : stride passed to WS() for element addressing.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration pointer step (cr advances by +ms, ci by -ms).
 *
 * Within one iteration every input is loaded into a temporary before the
 * first store, so the in-place update does not read clobbered values.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk. Presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b, which makes each
 * twiddled output pair a complex multiply by (W[2k-2], W[2k-1]) --
 * confirm against the macro definitions.
 */
static void hb_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
/* numerically sqrt(3)/2 */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 10 entries, then steps by 10 per iteration. */
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
|
||||
E T3, Ty, Ta, TO, Tr, TB, Td, TE, Tk, TL, Tn, TH;
|
||||
/* Load phase: read all 12 inputs and form their sums and differences. */
{
|
||||
E T1, T2, Tb, Tc;
|
||||
T1 = cr[0];
|
||||
T2 = ci[WS(rs, 2)];
|
||||
T3 = T1 + T2;
|
||||
Ty = T1 - T2;
|
||||
{
|
||||
E T6, Tz, T9, TA;
|
||||
{
|
||||
E T4, T5, T7, T8;
|
||||
T4 = cr[WS(rs, 2)];
|
||||
T5 = ci[0];
|
||||
T6 = T4 + T5;
|
||||
Tz = T4 - T5;
|
||||
T7 = ci[WS(rs, 1)];
|
||||
T8 = cr[WS(rs, 1)];
|
||||
T9 = T7 + T8;
|
||||
TA = T7 - T8;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
TO = KP866025403 * (Tz - TA);
|
||||
Tr = KP866025403 * (T6 - T9);
|
||||
TB = Tz + TA;
|
||||
}
|
||||
Tb = ci[WS(rs, 5)];
|
||||
Tc = cr[WS(rs, 3)];
|
||||
Td = Tb - Tc;
|
||||
TE = Tb + Tc;
|
||||
{
|
||||
E Tg, TG, Tj, TF;
|
||||
{
|
||||
E Te, Tf, Th, Ti;
|
||||
Te = ci[WS(rs, 3)];
|
||||
Tf = cr[WS(rs, 5)];
|
||||
Tg = Te - Tf;
|
||||
TG = Te + Tf;
|
||||
Th = ci[WS(rs, 4)];
|
||||
Ti = cr[WS(rs, 4)];
|
||||
Tj = Th - Ti;
|
||||
TF = Th + Ti;
|
||||
}
|
||||
Tk = Tg + Tj;
|
||||
TL = KP866025403 * (TG + TF);
|
||||
Tn = KP866025403 * (Tj - Tg);
|
||||
TH = TF - TG;
|
||||
}
|
||||
}
|
||||
/* Output 0 is stored without any twiddle factor. */
cr[0] = T3 + Ta;
|
||||
ci[0] = Td + Tk;
|
||||
/* Output 3, using twiddle pair W[4], W[5]. */
{
|
||||
E TC, TI, Tx, TD;
|
||||
TC = Ty + TB;
|
||||
TI = TE - TH;
|
||||
Tx = W[4];
|
||||
TD = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(TD, TI, Tx * TC);
|
||||
ci[WS(rs, 3)] = FMA(TD, TC, Tx * TI);
|
||||
}
|
||||
/* Outputs 2 and 4, using W[2], W[3] and W[6], W[7]. */
{
|
||||
E To, Tu, Ts, Tw, Tm, Tq;
|
||||
Tm = FNMS(KP500000000, Ta, T3);
|
||||
To = Tm - Tn;
|
||||
Tu = Tm + Tn;
|
||||
Tq = FNMS(KP500000000, Tk, Td);
|
||||
Ts = Tq - Tr;
|
||||
Tw = Tr + Tq;
|
||||
{
|
||||
E Tl, Tp, Tt, Tv;
|
||||
Tl = W[2];
|
||||
Tp = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(Tp, Ts, Tl * To);
|
||||
ci[WS(rs, 2)] = FMA(Tl, Ts, Tp * To);
|
||||
Tt = W[6];
|
||||
Tv = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(Tv, Tw, Tt * Tu);
|
||||
ci[WS(rs, 4)] = FMA(Tt, Tw, Tv * Tu);
|
||||
}
|
||||
}
|
||||
/* Outputs 1 and 5, using W[0], W[1] and W[8], W[9]. */
{
|
||||
E TM, TS, TQ, TU, TK, TP;
|
||||
TK = FNMS(KP500000000, TB, Ty);
|
||||
TM = TK - TL;
|
||||
TS = TK + TL;
|
||||
TP = FMA(KP500000000, TH, TE);
|
||||
TQ = TO + TP;
|
||||
TU = TP - TO;
|
||||
{
|
||||
E TJ, TN, TR, TT;
|
||||
TJ = W[0];
|
||||
TN = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(TN, TQ, TJ * TM);
|
||||
ci[WS(rs, 1)] = FMA(TN, TM, TJ * TQ);
|
||||
TR = W[8];
|
||||
TT = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(TT, TU, TR * TS);
|
||||
ci[WS(rs, 5)] = FMA(TT, TS, TR * TU);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by the hb_6 descriptor `desc`.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * chunk; presumably the TW_FULL entry requests the full twiddle set for a
 * size-6 kernel and TW_NEXT terminates the list -- confirm in the header.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to the planner together with hb_6 (non-FMA variant).
 * The first three numbers of the trailing group equal the "additions,
 * multiplications, fused multiply/add" figures in this variant's
 * op-count comment (32, 14, 14).
 * NOTE(review): hc2hc_desc is declared outside this chunk; the remaining
 * field meanings (presumably radix, name, twiddle table, genus) should be
 * confirmed against its definition.
 */
static const hc2hc_desc desc = { 6, "hb_6", twinstr, &GENUS, { 32, 14, 14, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands the hb_6 kernel and its descriptor to
 * planner `p` through X(khc2hc_register).
 * NOTE(review): X() is a macro defined outside this chunk (presumably the
 * library's name-mangling prefix macro).
 */
void X(codelet_hb_6) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_6, &desc);
|
||||
}
|
||||
#endif
|
||||
4025
fftw-3.3.10/rdft/scalar/r2cb/hb_64.c
Normal file
4025
fftw-3.3.10/rdft/scalar/r2cb/hb_64.c
Normal file
File diff suppressed because it is too large
Load Diff
356
fftw-3.3.10/rdft/scalar/r2cb/hb_7.c
Normal file
356
fftw-3.3.10/rdft/scalar/r2cb/hb_7.c
Normal file
@@ -0,0 +1,356 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -dif -name hb_7 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 66 FP multiplications,
|
||||
* (or, 18 additions, 12 multiplications, 54 fused multiply/add),
|
||||
* 41 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_7 (variant generated with -fma): straight-line size-7 kernel emitted
 * by gen_hc2hc; see the "Generated by" comment above for the generator
 * options (-fma -sign 1 -n 7 -dif).
 *
 * cr, ci : data arrays, read and overwritten in place. Element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..6.
 * W      : read-only twiddle table; each iteration reads W[0]..W[11].
 * rs     : stride passed to WS() for element addressing.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration pointer step (cr advances by +ms, ci by -ms).
 *
 * Within one iteration every input is loaded into a temporary before the
 * first store, so the in-place update does not read clobbered values.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk. Presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm against the
 * macro definitions.
 */
static void hb_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
|
||||
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
|
||||
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
|
||||
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 12 entries, then steps by 12 per iteration. */
for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
|
||||
E T1, T4, TC, T7, TB, Ta, TA, TD, TZ, T1l, T1b, TP, Td, Tt, Tw;
|
||||
E Tv, Tu, Tp, Ty, T1j, T1e, TX, TS;
|
||||
/* Load phase, part 1: cr[0..3] and ci[0..2]. */
T1 = cr[0];
|
||||
{
|
||||
E T2, T3, T1a, TO, Tc;
|
||||
T2 = cr[WS(rs, 1)];
|
||||
T3 = ci[0];
|
||||
T4 = T2 + T3;
|
||||
TC = T2 - T3;
|
||||
{
|
||||
E T5, T6, T8, T9;
|
||||
T5 = cr[WS(rs, 2)];
|
||||
T6 = ci[WS(rs, 1)];
|
||||
T7 = T5 + T6;
|
||||
TB = T5 - T6;
|
||||
T8 = cr[WS(rs, 3)];
|
||||
T9 = ci[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
TA = T8 - T9;
|
||||
}
|
||||
TD = FNMS(KP554958132, TC, TB);
|
||||
TZ = FMA(KP554958132, TB, TA);
|
||||
T1l = FMA(KP554958132, TA, TC);
|
||||
T1a = FNMS(KP356895867, T7, T4);
|
||||
T1b = FNMS(KP692021471, T1a, Ta);
|
||||
TO = FNMS(KP356895867, T4, Ta);
|
||||
TP = FNMS(KP692021471, TO, T7);
|
||||
Tc = FNMS(KP356895867, Ta, T7);
|
||||
Td = FNMS(KP692021471, Tc, T4);
|
||||
}
|
||||
/* Load phase, part 2: ci[3..6] and cr[4..6]. */
Tt = ci[WS(rs, 6)];
|
||||
{
|
||||
E Th, Tk, Tn, Tf, Tg;
|
||||
Tf = ci[WS(rs, 3)];
|
||||
Tg = cr[WS(rs, 4)];
|
||||
Th = Tf + Tg;
|
||||
Tw = Tf - Tg;
|
||||
{
|
||||
E Ti, Tj, Tl, Tm;
|
||||
Ti = ci[WS(rs, 4)];
|
||||
Tj = cr[WS(rs, 5)];
|
||||
Tk = Ti + Tj;
|
||||
Tv = Ti - Tj;
|
||||
Tl = ci[WS(rs, 5)];
|
||||
Tm = cr[WS(rs, 6)];
|
||||
Tn = Tl + Tm;
|
||||
Tu = Tl - Tm;
|
||||
}
|
||||
{
|
||||
E To, Tx, T1i, T1d, TW, TR;
|
||||
To = FNMS(KP554958132, Tn, Tk);
|
||||
Tp = FNMS(KP801937735, To, Th);
|
||||
Tx = FNMS(KP356895867, Tw, Tv);
|
||||
Ty = FNMS(KP692021471, Tx, Tu);
|
||||
T1i = FNMS(KP356895867, Tv, Tu);
|
||||
T1j = FNMS(KP692021471, T1i, Tw);
|
||||
T1d = FMA(KP554958132, Th, Tn);
|
||||
T1e = FMA(KP801937735, T1d, Tk);
|
||||
TW = FNMS(KP356895867, Tu, Tw);
|
||||
TX = FNMS(KP692021471, TW, Tv);
|
||||
TR = FMA(KP554958132, Tk, Th);
|
||||
TS = FNMS(KP801937735, TR, Tn);
|
||||
}
|
||||
}
|
||||
/* Output 0 is stored without any twiddle factor. */
cr[0] = T1 + T4 + T7 + Ta;
|
||||
ci[0] = Tt + Tu + Tv + Tw;
|
||||
/* Outputs 3 and 4, using W[4], W[5] and W[6], W[7]. */
{
|
||||
E Tq, TI, TF, TL, Te, Tz, TE;
|
||||
Te = FNMS(KP900968867, Td, T1);
|
||||
Tq = FNMS(KP974927912, Tp, Te);
|
||||
TI = FMA(KP974927912, Tp, Te);
|
||||
Tz = FNMS(KP900968867, Ty, Tt);
|
||||
TE = FNMS(KP801937735, TD, TA);
|
||||
TF = FMA(KP974927912, TE, Tz);
|
||||
TL = FNMS(KP974927912, TE, Tz);
|
||||
{
|
||||
E Tb, Tr, Ts, TG;
|
||||
Tb = W[4];
|
||||
Tr = Tb * Tq;
|
||||
Ts = W[5];
|
||||
TG = Ts * Tq;
|
||||
cr[WS(rs, 3)] = FNMS(Ts, TF, Tr);
|
||||
ci[WS(rs, 3)] = FMA(Tb, TF, TG);
|
||||
}
|
||||
{
|
||||
E TH, TJ, TK, TM;
|
||||
TH = W[6];
|
||||
TJ = TH * TI;
|
||||
TK = W[7];
|
||||
TM = TK * TI;
|
||||
cr[WS(rs, 4)] = FNMS(TK, TL, TJ);
|
||||
ci[WS(rs, 4)] = FMA(TH, TL, TM);
|
||||
}
|
||||
}
|
||||
/* Outputs 2 and 5, using W[2], W[3] and W[8], W[9]. */
{
|
||||
E TT, T14, T11, T17, TQ, TY, T10;
|
||||
TQ = FNMS(KP900968867, TP, T1);
|
||||
TT = FNMS(KP974927912, TS, TQ);
|
||||
T14 = FMA(KP974927912, TS, TQ);
|
||||
TY = FNMS(KP900968867, TX, Tt);
|
||||
T10 = FNMS(KP801937735, TZ, TC);
|
||||
T11 = FMA(KP974927912, T10, TY);
|
||||
T17 = FNMS(KP974927912, T10, TY);
|
||||
{
|
||||
E TN, TU, TV, T12;
|
||||
TN = W[2];
|
||||
TU = TN * TT;
|
||||
TV = W[3];
|
||||
T12 = TV * TT;
|
||||
cr[WS(rs, 2)] = FNMS(TV, T11, TU);
|
||||
ci[WS(rs, 2)] = FMA(TN, T11, T12);
|
||||
}
|
||||
{
|
||||
E T13, T15, T16, T18;
|
||||
T13 = W[8];
|
||||
T15 = T13 * T14;
|
||||
T16 = W[9];
|
||||
T18 = T16 * T14;
|
||||
cr[WS(rs, 5)] = FNMS(T16, T17, T15);
|
||||
ci[WS(rs, 5)] = FMA(T13, T17, T18);
|
||||
}
|
||||
}
|
||||
/* Outputs 1 and 6, using W[0], W[1] and W[10], W[11]. */
{
|
||||
E T1f, T1q, T1n, T1t, T1c, T1k, T1m;
|
||||
T1c = FNMS(KP900968867, T1b, T1);
|
||||
T1f = FNMS(KP974927912, T1e, T1c);
|
||||
T1q = FMA(KP974927912, T1e, T1c);
|
||||
T1k = FNMS(KP900968867, T1j, Tt);
|
||||
T1m = FMA(KP801937735, T1l, TB);
|
||||
T1n = FMA(KP974927912, T1m, T1k);
|
||||
T1t = FNMS(KP974927912, T1m, T1k);
|
||||
{
|
||||
E T19, T1g, T1h, T1o;
|
||||
T19 = W[0];
|
||||
T1g = T19 * T1f;
|
||||
T1h = W[1];
|
||||
T1o = T1h * T1f;
|
||||
cr[WS(rs, 1)] = FNMS(T1h, T1n, T1g);
|
||||
ci[WS(rs, 1)] = FMA(T19, T1n, T1o);
|
||||
}
|
||||
{
|
||||
E T1p, T1r, T1s, T1u;
|
||||
T1p = W[10];
|
||||
T1r = T1p * T1q;
|
||||
T1s = W[11];
|
||||
T1u = T1s * T1q;
|
||||
cr[WS(rs, 6)] = FNMS(T1s, T1t, T1r);
|
||||
ci[WS(rs, 6)] = FMA(T1p, T1t, T1u);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by the hb_7 descriptor `desc`.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * chunk; presumably the TW_FULL entry requests the full twiddle set for a
 * size-7 kernel and TW_NEXT terminates the list -- confirm in the header.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to the planner together with hb_7 (FMA variant).
 * The first three numbers of the trailing group equal the "additions,
 * multiplications, fused multiply/add" figures in this variant's
 * op-count comment (18, 12, 54).
 * NOTE(review): hc2hc_desc is declared outside this chunk; the remaining
 * field meanings (presumably radix, name, twiddle table, genus) should be
 * confirmed against its definition.
 */
static const hc2hc_desc desc = { 7, "hb_7", twinstr, &GENUS, { 18, 12, 54, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands the hb_7 kernel and its descriptor to
 * planner `p` through X(khc2hc_register).
 * NOTE(review): X() is a macro defined outside this chunk (presumably the
 * library's name-mangling prefix macro).
 */
void X(codelet_hb_7) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_7, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -dif -name hb_7 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 60 FP multiplications,
|
||||
* (or, 36 additions, 24 multiplications, 36 fused multiply/add),
|
||||
* 36 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_7 (variant generated without -fma): straight-line size-7 kernel
 * emitted by gen_hc2hc; see the "Generated by" comment above for the
 * generator options (-sign 1 -n 7 -dif).
 *
 * cr, ci : data arrays, read and overwritten in place. Element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..6.
 * W      : read-only twiddle table; each iteration reads W[0]..W[11].
 * rs     : stride passed to WS() for element addressing.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration pointer step (cr advances by +ms, ci by -ms).
 *
 * Within one iteration every input is loaded into a temporary before the
 * first store, so the in-place update does not read clobbered values.
 *
 * The six constants are numerically |cos| and sin of 2*pi/7, 4*pi/7 and
 * 6*pi/7 (0.623.., 0.222.., 0.900.. and 0.781.., 0.974.., 0.433..).
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS, FNMA and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk. Presumably
 * FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FNMA(a,b,c) = -(a*b + c) -- confirm against the macro definitions.
 */
static void hb_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
|
||||
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 12 entries, then steps by 12 per iteration. */
for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
|
||||
E T1, T4, T7, Ta, Tx, TI, TV, TQ, TE, Tm, Tb, Te, Th, Tk, Tq;
|
||||
E TF, TR, TU, TJ, Tt;
|
||||
/* Load phase, part 1: cr[0..3] and ci[0..2]. */
{
|
||||
E Tu, Tw, Tv, T2, T3;
|
||||
T1 = cr[0];
|
||||
T2 = cr[WS(rs, 1)];
|
||||
T3 = ci[0];
|
||||
T4 = T2 + T3;
|
||||
Tu = T2 - T3;
|
||||
{
|
||||
E T5, T6, T8, T9;
|
||||
T5 = cr[WS(rs, 2)];
|
||||
T6 = ci[WS(rs, 1)];
|
||||
T7 = T5 + T6;
|
||||
Tw = T5 - T6;
|
||||
T8 = cr[WS(rs, 3)];
|
||||
T9 = ci[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
Tv = T8 - T9;
|
||||
}
|
||||
Tx = FMA(KP433883739, Tu, KP974927912 * Tv) - (KP781831482 * Tw);
|
||||
TI = FMA(KP781831482, Tu, KP974927912 * Tw) + (KP433883739 * Tv);
|
||||
TV = FNMS(KP781831482, Tv, KP974927912 * Tu) - (KP433883739 * Tw);
|
||||
TQ = FMA(KP623489801, Ta, T1) + FNMA(KP900968867, T7, KP222520933 * T4);
|
||||
TE = FMA(KP623489801, T4, T1) + FNMA(KP900968867, Ta, KP222520933 * T7);
|
||||
Tm = FMA(KP623489801, T7, T1) + FNMA(KP222520933, Ta, KP900968867 * T4);
|
||||
}
|
||||
/* Load phase, part 2: ci[3..6] and cr[4..6]. */
{
|
||||
E Tp, Tn, To, Tc, Td;
|
||||
Tb = ci[WS(rs, 6)];
|
||||
Tc = ci[WS(rs, 5)];
|
||||
Td = cr[WS(rs, 6)];
|
||||
Te = Tc - Td;
|
||||
Tp = Tc + Td;
|
||||
{
|
||||
E Tf, Tg, Ti, Tj;
|
||||
Tf = ci[WS(rs, 4)];
|
||||
Tg = cr[WS(rs, 5)];
|
||||
Th = Tf - Tg;
|
||||
Tn = Tf + Tg;
|
||||
Ti = ci[WS(rs, 3)];
|
||||
Tj = cr[WS(rs, 4)];
|
||||
Tk = Ti - Tj;
|
||||
To = Ti + Tj;
|
||||
}
|
||||
Tq = FNMS(KP974927912, To, KP781831482 * Tn) - (KP433883739 * Tp);
|
||||
TF = FMA(KP781831482, Tp, KP974927912 * Tn) + (KP433883739 * To);
|
||||
TR = FMA(KP433883739, Tn, KP781831482 * To) - (KP974927912 * Tp);
|
||||
TU = FMA(KP623489801, Tk, Tb) + FNMA(KP900968867, Th, KP222520933 * Te);
|
||||
TJ = FMA(KP623489801, Te, Tb) + FNMA(KP900968867, Tk, KP222520933 * Th);
|
||||
Tt = FMA(KP623489801, Th, Tb) + FNMA(KP222520933, Tk, KP900968867 * Te);
|
||||
}
|
||||
/* Output 0 is stored without any twiddle factor. */
cr[0] = T1 + T4 + T7 + Ta;
|
||||
ci[0] = Tb + Te + Th + Tk;
|
||||
/* Output 4, using twiddle pair W[6], W[7]. */
{
|
||||
E Tr, Ty, Tl, Ts;
|
||||
Tr = Tm - Tq;
|
||||
Ty = Tt - Tx;
|
||||
Tl = W[6];
|
||||
Ts = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(Ts, Ty, Tl * Tr);
|
||||
ci[WS(rs, 4)] = FMA(Tl, Ty, Ts * Tr);
|
||||
}
|
||||
/* Output 2, using twiddle pair W[2], W[3]. */
{
|
||||
E TY, T10, TX, TZ;
|
||||
TY = TQ + TR;
|
||||
T10 = TV + TU;
|
||||
TX = W[2];
|
||||
TZ = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(TZ, T10, TX * TY);
|
||||
ci[WS(rs, 2)] = FMA(TX, T10, TZ * TY);
|
||||
}
|
||||
/* Output 3, using twiddle pair W[4], W[5]. */
{
|
||||
E TA, TC, Tz, TB;
|
||||
TA = Tm + Tq;
|
||||
TC = Tx + Tt;
|
||||
Tz = W[4];
|
||||
TB = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(TB, TC, Tz * TA);
|
||||
ci[WS(rs, 3)] = FMA(Tz, TC, TB * TA);
|
||||
}
|
||||
/* Output 6, using twiddle pair W[10], W[11]. */
{
|
||||
E TM, TO, TL, TN;
|
||||
TM = TE + TF;
|
||||
TO = TJ - TI;
|
||||
TL = W[10];
|
||||
TN = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(TN, TO, TL * TM);
|
||||
ci[WS(rs, 6)] = FMA(TL, TO, TN * TM);
|
||||
}
|
||||
/* Output 5, using twiddle pair W[8], W[9]. */
{
|
||||
E TS, TW, TP, TT;
|
||||
TS = TQ - TR;
|
||||
TW = TU - TV;
|
||||
TP = W[8];
|
||||
TT = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(TT, TW, TP * TS);
|
||||
ci[WS(rs, 5)] = FMA(TP, TW, TT * TS);
|
||||
}
|
||||
/* Output 1, using twiddle pair W[0], W[1]. */
{
|
||||
E TG, TK, TD, TH;
|
||||
TG = TE - TF;
|
||||
TK = TI + TJ;
|
||||
TD = W[0];
|
||||
TH = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(TH, TK, TD * TG);
|
||||
ci[WS(rs, 1)] = FMA(TD, TK, TH * TG);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by the hb_7 descriptor `desc`.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * chunk; presumably the TW_FULL entry requests the full twiddle set for a
 * size-7 kernel and TW_NEXT terminates the list -- confirm in the header.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to the planner together with hb_7 (non-FMA variant).
 * The first three numbers of the trailing group equal the "additions,
 * multiplications, fused multiply/add" figures in this variant's
 * op-count comment (36, 24, 36).
 * NOTE(review): hc2hc_desc is declared outside this chunk; the remaining
 * field meanings (presumably radix, name, twiddle table, genus) should be
 * confirmed against its definition.
 */
static const hc2hc_desc desc = { 7, "hb_7", twinstr, &GENUS, { 36, 24, 36, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands the hb_7 kernel and its descriptor to
 * planner `p` through X(khc2hc_register).
 * NOTE(review): X() is a macro defined outside this chunk (presumably the
 * library's name-mangling prefix macro).
 */
void X(codelet_hb_7) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_7, &desc);
|
||||
}
|
||||
#endif
|
||||
373
fftw-3.3.10/rdft/scalar/r2cb/hb_8.c
Normal file
373
fftw-3.3.10/rdft/scalar/r2cb/hb_8.c
Normal file
@@ -0,0 +1,373 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hb_8 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 36 FP multiplications,
|
||||
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
|
||||
* 33 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_8 (variant generated with -fma): straight-line size-8 kernel emitted
 * by gen_hc2hc; see the "Generated by" comment above for the generator
 * options (-fma -sign 1 -n 8 -dif).
 *
 * cr, ci : data arrays, read and overwritten in place. Element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..7.
 * W      : read-only twiddle table; each iteration reads W[0]..W[13].
 * rs     : stride passed to WS() for element addressing.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration pointer step (cr advances by +ms, ci by -ms).
 *
 * Within one iteration every input is loaded into a temporary before the
 * first store, so the in-place update does not read clobbered values.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk. Presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm against the
 * macro definitions.
 */
static void hb_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* numerically 1/sqrt(2) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 14 entries, then steps by 14 per iteration. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T7, T1i, T1n, Tk, TD, TV, T1b, TQ, Te, T1e, T1o, T1j, TE, TF, TR;
|
||||
E Tv, TW;
|
||||
/* Load phase, part 1: cr[0], cr[2], cr[4], cr[6] with ci[1], ci[3], ci[5], ci[7]. */
{
|
||||
E T3, Tg, TC, T19, T6, Tz, Tj, T1a;
|
||||
{
|
||||
E T1, T2, TA, TB;
|
||||
T1 = cr[0];
|
||||
T2 = ci[WS(rs, 3)];
|
||||
T3 = T1 + T2;
|
||||
Tg = T1 - T2;
|
||||
TA = ci[WS(rs, 7)];
|
||||
TB = cr[WS(rs, 4)];
|
||||
TC = TA + TB;
|
||||
T19 = TA - TB;
|
||||
}
|
||||
{
|
||||
E T4, T5, Th, Ti;
|
||||
T4 = cr[WS(rs, 2)];
|
||||
T5 = ci[WS(rs, 1)];
|
||||
T6 = T4 + T5;
|
||||
Tz = T4 - T5;
|
||||
Th = ci[WS(rs, 5)];
|
||||
Ti = cr[WS(rs, 6)];
|
||||
Tj = Th + Ti;
|
||||
T1a = Th - Ti;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1i = T3 - T6;
|
||||
T1n = T19 - T1a;
|
||||
Tk = Tg - Tj;
|
||||
TD = Tz + TC;
|
||||
TV = TC - Tz;
|
||||
T1b = T19 + T1a;
|
||||
TQ = Tg + Tj;
|
||||
}
|
||||
/* Load phase, part 2: cr[1], cr[3], cr[5], cr[7] with ci[0], ci[2], ci[4], ci[6]. */
{
|
||||
E Ta, Tl, Tt, T1d, Td, Tq, To, T1c, Tp, Tu;
|
||||
{
|
||||
E T8, T9, Tr, Ts;
|
||||
T8 = cr[WS(rs, 1)];
|
||||
T9 = ci[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
Tl = T8 - T9;
|
||||
Tr = ci[WS(rs, 4)];
|
||||
Ts = cr[WS(rs, 7)];
|
||||
Tt = Tr + Ts;
|
||||
T1d = Tr - Ts;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Tm, Tn;
|
||||
Tb = ci[0];
|
||||
Tc = cr[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
Tq = Tb - Tc;
|
||||
Tm = ci[WS(rs, 6)];
|
||||
Tn = cr[WS(rs, 5)];
|
||||
To = Tm + Tn;
|
||||
T1c = Tm - Tn;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T1e = T1c + T1d;
|
||||
T1o = Ta - Td;
|
||||
T1j = T1d - T1c;
|
||||
TE = Tl + To;
|
||||
TF = Tq + Tt;
|
||||
TR = TE + TF;
|
||||
Tp = Tl - To;
|
||||
Tu = Tq - Tt;
|
||||
Tv = Tp + Tu;
|
||||
TW = Tp - Tu;
|
||||
}
|
||||
/* Output 0 is stored without any twiddle factor. */
cr[0] = T7 + Te;
|
||||
ci[0] = T1b + T1e;
|
||||
/* Output 3, using twiddle pair W[4], W[5]. */
{
|
||||
E TS, TX, TT, TY, TP, TU;
|
||||
TS = FNMS(KP707106781, TR, TQ);
|
||||
TX = FMA(KP707106781, TW, TV);
|
||||
TP = W[4];
|
||||
TT = TP * TS;
|
||||
TY = TP * TX;
|
||||
TU = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(TU, TX, TT);
|
||||
ci[WS(rs, 3)] = FMA(TU, TS, TY);
|
||||
}
|
||||
/* Output 2, using twiddle pair W[2], W[3]. */
{
|
||||
E T1s, T1v, T1t, T1w, T1r, T1u;
|
||||
T1s = T1i + T1j;
|
||||
T1v = T1o + T1n;
|
||||
T1r = W[2];
|
||||
T1t = T1r * T1s;
|
||||
T1w = T1r * T1v;
|
||||
T1u = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(T1u, T1v, T1t);
|
||||
ci[WS(rs, 2)] = FMA(T1u, T1s, T1w);
|
||||
}
|
||||
/* Output 7, using twiddle pair W[12], W[13]. */
{
|
||||
E T10, T13, T11, T14, TZ, T12;
|
||||
T10 = FMA(KP707106781, TR, TQ);
|
||||
T13 = FNMS(KP707106781, TW, TV);
|
||||
TZ = W[12];
|
||||
T11 = TZ * T10;
|
||||
T14 = TZ * T13;
|
||||
T12 = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T12, T13, T11);
|
||||
ci[WS(rs, 7)] = FMA(T12, T10, T14);
|
||||
}
|
||||
/* Output 4, using twiddle pair W[6], W[7]. */
{
|
||||
E T1f, T15, T17, T18, T1g, T16;
|
||||
T1f = T1b - T1e;
|
||||
T16 = T7 - Te;
|
||||
T15 = W[6];
|
||||
T17 = T15 * T16;
|
||||
T18 = W[7];
|
||||
T1g = T18 * T16;
|
||||
cr[WS(rs, 4)] = FNMS(T18, T1f, T17);
|
||||
ci[WS(rs, 4)] = FMA(T15, T1f, T1g);
|
||||
}
|
||||
/* Output 6, using twiddle pair W[10], W[11]. */
{
|
||||
E T1k, T1p, T1l, T1q, T1h, T1m;
|
||||
T1k = T1i - T1j;
|
||||
T1p = T1n - T1o;
|
||||
T1h = W[10];
|
||||
T1l = T1h * T1k;
|
||||
T1q = T1h * T1p;
|
||||
T1m = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(T1m, T1p, T1l);
|
||||
ci[WS(rs, 6)] = FMA(T1m, T1k, T1q);
|
||||
}
|
||||
/* Outputs 5 and 1, using W[8], W[9] and W[0], W[1]. */
{
|
||||
E TH, TN, TJ, TL, TM, TO, Tf, Tx, Ty, TI, TG, TK, Tw;
|
||||
TG = TE - TF;
|
||||
TH = FNMS(KP707106781, TG, TD);
|
||||
TN = FMA(KP707106781, TG, TD);
|
||||
TK = FMA(KP707106781, Tv, Tk);
|
||||
TJ = W[0];
|
||||
TL = TJ * TK;
|
||||
TM = W[1];
|
||||
TO = TM * TK;
|
||||
Tw = FNMS(KP707106781, Tv, Tk);
|
||||
Tf = W[8];
|
||||
Tx = Tf * Tw;
|
||||
Ty = W[9];
|
||||
TI = Ty * Tw;
|
||||
cr[WS(rs, 5)] = FNMS(Ty, TH, Tx);
|
||||
ci[WS(rs, 5)] = FMA(Tf, TH, TI);
|
||||
cr[WS(rs, 1)] = FNMS(TM, TN, TL);
|
||||
ci[WS(rs, 1)] = FMA(TJ, TN, TO);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by the hb_8 descriptor `desc`.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * chunk; presumably the TW_FULL entry requests the full twiddle set for a
 * size-8 kernel and TW_NEXT terminates the list -- confirm in the header.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to the planner together with hb_8 (FMA variant).
 * The first three numbers of the trailing group equal the "additions,
 * multiplications, fused multiply/add" figures in this variant's
 * op-count comment (44, 14, 22).
 * NOTE(review): hc2hc_desc is declared outside this chunk; the remaining
 * field meanings (presumably radix, name, twiddle table, genus) should be
 * confirmed against its definition.
 */
static const hc2hc_desc desc = { 8, "hb_8", twinstr, &GENUS, { 44, 14, 22, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands the hb_8 kernel and its descriptor to
 * planner `p` through X(khc2hc_register).
 * NOTE(review): X() is a macro defined outside this chunk (presumably the
 * library's name-mangling prefix macro).
 */
void X(codelet_hb_8) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_8, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hb_8 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 32 FP multiplications,
|
||||
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 30 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_8 (variant generated without -fma): straight-line size-8 kernel
 * emitted by gen_hc2hc; see the "Generated by" comment above for the
 * generator options (-sign 1 -n 8 -dif).
 *
 * cr, ci : data arrays, read and overwritten in place. Element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..7.
 * W      : read-only twiddle table; each iteration reads W[0]..W[13].
 * rs     : stride passed to WS() for element addressing.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration pointer step (cr advances by +ms, ci by -ms).
 *
 * Within one iteration every input is loaded into a temporary before the
 * first store, so the in-place update does not read clobbered values.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk. Presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm against the
 * macro definitions.
 */
static void hb_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* numerically 1/sqrt(2) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 14 entries, then steps by 14 per iteration. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
|
||||
E TD;
|
||||
/* Load phase, part 1: cr[0], cr[2], cr[4], cr[6] with ci[1], ci[3], ci[5], ci[7]. */
{
|
||||
E T3, TK, Tn, TL, T6, TW, Tk, TX;
|
||||
{
|
||||
E T1, T2, Tl, Tm;
|
||||
T1 = cr[0];
|
||||
T2 = ci[WS(rs, 3)];
|
||||
T3 = T1 + T2;
|
||||
TK = T1 - T2;
|
||||
Tl = ci[WS(rs, 5)];
|
||||
Tm = cr[WS(rs, 6)];
|
||||
Tn = Tl - Tm;
|
||||
TL = Tl + Tm;
|
||||
}
|
||||
{
|
||||
E T4, T5, Ti, Tj;
|
||||
T4 = cr[WS(rs, 2)];
|
||||
T5 = ci[WS(rs, 1)];
|
||||
T6 = T4 + T5;
|
||||
TW = T4 - T5;
|
||||
Ti = ci[WS(rs, 7)];
|
||||
Tj = cr[WS(rs, 4)];
|
||||
Tk = Ti - Tj;
|
||||
TX = Ti + Tj;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T18 = TK + TL;
|
||||
T1c = TX - TW;
|
||||
To = Tk + Tn;
|
||||
Ty = T3 - T6;
|
||||
TM = TK - TL;
|
||||
TY = TW + TX;
|
||||
TC = Tk - Tn;
|
||||
}
|
||||
/* Load phase, part 2: cr[1], cr[3], cr[5], cr[7] with ci[0], ci[2], ci[4], ci[6]. */
{
|
||||
E Ta, TN, Tu, TR, Td, TQ, Tr, TO;
|
||||
{
|
||||
E T8, T9, Ts, Tt;
|
||||
T8 = cr[WS(rs, 1)];
|
||||
T9 = ci[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
TN = T8 - T9;
|
||||
Ts = ci[WS(rs, 4)];
|
||||
Tt = cr[WS(rs, 7)];
|
||||
Tu = Ts - Tt;
|
||||
TR = Ts + Tt;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Tp, Tq;
|
||||
Tb = ci[0];
|
||||
Tc = cr[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
TQ = Tb - Tc;
|
||||
Tp = ci[WS(rs, 6)];
|
||||
Tq = cr[WS(rs, 5)];
|
||||
Tr = Tp - Tq;
|
||||
TO = Tp + Tq;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
TZ = TN + TO;
|
||||
T10 = TQ + TR;
|
||||
Tv = Tr + Tu;
|
||||
Tz = Tu - Tr;
|
||||
TP = TN - TO;
|
||||
TS = TQ - TR;
|
||||
TD = Ta - Td;
|
||||
}
|
||||
/* Output 0 is stored without any twiddle factor. */
cr[0] = T7 + Te;
|
||||
ci[0] = To + Tv;
|
||||
/* Output 4, using twiddle pair W[6], W[7]. */
{
|
||||
E Tg, Tw, Tf, Th;
|
||||
Tg = T7 - Te;
|
||||
Tw = To - Tv;
|
||||
Tf = W[6];
|
||||
Th = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(Th, Tw, Tf * Tg);
|
||||
ci[WS(rs, 4)] = FMA(Th, Tg, Tf * Tw);
|
||||
}
|
||||
/* Output 2, using twiddle pair W[2], W[3]. */
{
|
||||
E TG, TI, TF, TH;
|
||||
TG = Ty + Tz;
|
||||
TI = TD + TC;
|
||||
TF = W[2];
|
||||
TH = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(TH, TI, TF * TG);
|
||||
ci[WS(rs, 2)] = FMA(TF, TI, TH * TG);
|
||||
}
|
||||
/* Output 6, using twiddle pair W[10], W[11]. */
{
|
||||
E TA, TE, Tx, TB;
|
||||
TA = Ty - Tz;
|
||||
TE = TC - TD;
|
||||
Tx = W[10];
|
||||
TB = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(TB, TE, Tx * TA);
|
||||
ci[WS(rs, 6)] = FMA(Tx, TE, TB * TA);
|
||||
}
|
||||
/* Outputs 3 and 7, using W[4], W[5] and W[12], W[13]. */
{
|
||||
E T1a, T1g, T1e, T1i, T19, T1d;
|
||||
T19 = KP707106781 * (TZ + T10);
|
||||
T1a = T18 - T19;
|
||||
T1g = T18 + T19;
|
||||
T1d = KP707106781 * (TP - TS);
|
||||
T1e = T1c + T1d;
|
||||
T1i = T1c - T1d;
|
||||
{
|
||||
E T17, T1b, T1f, T1h;
|
||||
T17 = W[4];
|
||||
T1b = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(T1b, T1e, T17 * T1a);
|
||||
ci[WS(rs, 3)] = FMA(T17, T1e, T1b * T1a);
|
||||
T1f = W[12];
|
||||
T1h = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T1h, T1i, T1f * T1g);
|
||||
ci[WS(rs, 7)] = FMA(T1f, T1i, T1h * T1g);
|
||||
}
|
||||
}
|
||||
/* Outputs 5 and 1, using W[8], W[9] and W[0], W[1]. */
{
|
||||
E TU, T14, T12, T16, TT, T11;
|
||||
TT = KP707106781 * (TP + TS);
|
||||
TU = TM - TT;
|
||||
T14 = TM + TT;
|
||||
T11 = KP707106781 * (TZ - T10);
|
||||
T12 = TY - T11;
|
||||
T16 = TY + T11;
|
||||
{
|
||||
E TJ, TV, T13, T15;
|
||||
TJ = W[8];
|
||||
TV = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(TV, T12, TJ * TU);
|
||||
ci[WS(rs, 5)] = FMA(TV, TU, TJ * T12);
|
||||
T13 = W[0];
|
||||
T15 = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(T15, T16, T13 * T14);
|
||||
ci[WS(rs, 1)] = FMA(T15, T14, T13 * T16);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by the hb_8 descriptor `desc`.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * chunk; presumably the TW_FULL entry requests the full twiddle set for a
 * size-8 kernel and TW_NEXT terminates the list -- confirm in the header.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to the planner together with hb_8 (non-FMA variant).
 * The first three numbers of the trailing group equal the "additions,
 * multiplications, fused multiply/add" figures in this variant's
 * op-count comment (52, 18, 14).
 * NOTE(review): hc2hc_desc is declared outside this chunk; the remaining
 * field meanings (presumably radix, name, twiddle table, genus) should be
 * confirmed against its definition.
 */
static const hc2hc_desc desc = { 8, "hb_8", twinstr, &GENUS, { 52, 18, 14, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands the hb_8 kernel and its descriptor to
 * planner `p` through X(khc2hc_register).
 * NOTE(review): X() is a macro defined outside this chunk (presumably the
 * library's name-mangling prefix macro).
 */
void X(codelet_hb_8) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_8, &desc);
|
||||
}
|
||||
#endif
|
||||
497
fftw-3.3.10/rdft/scalar/r2cb/hb_9.c
Normal file
497
fftw-3.3.10/rdft/scalar/r2cb/hb_9.c
Normal file
@@ -0,0 +1,497 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -dif -name hb_9 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 88 FP multiplications,
|
||||
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
|
||||
* 53 stack variables, 10 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_9 (variant generated with -fma): straight-line size-9 kernel emitted
 * by gen_hc2hc; see the "Generated by" comment above for the generator
 * options (-fma -sign 1 -n 9 -dif).
 *
 * cr, ci : data arrays, read and overwritten in place. Element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..8.
 * W      : read-only twiddle table; each iteration reads W[0]..W[15].
 * rs     : stride passed to WS() for element addressing.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration pointer step (cr advances by +ms, ci by -ms).
 *
 * Within one iteration every input is loaded into a temporary before the
 * first store, so the in-place update does not read clobbered values.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FMS, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk. Presumably
 * FMA(a,b,c) = a*b + c, FMS(a,b,c) = a*b - c and FNMS(a,b,c) = c - a*b --
 * confirm against the macro definitions.
 */
static void hb_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
|
||||
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
|
||||
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
|
||||
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
|
||||
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
|
||||
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
|
||||
/* numerically sqrt(3)/2 */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 16 entries, then steps by 16 per iteration. */
for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
|
||||
E T5, Tl, TQ, T1y, T1b, T1J, Tg, TE, Tw, Tz, T1E, T1L, T1B, T1K, T14;
|
||||
E T1d, TX, T1c;
|
||||
/* Load phase, part 1: cr[0], cr[3], cr[6] with ci[2], ci[5], ci[8]. */
{
|
||||
E T1, Th, T4, T1a, Tk, TP, TO, T19;
|
||||
T1 = cr[0];
|
||||
Th = ci[WS(rs, 8)];
|
||||
{
|
||||
E T2, T3, Ti, Tj;
|
||||
T2 = cr[WS(rs, 3)];
|
||||
T3 = ci[WS(rs, 2)];
|
||||
T4 = T2 + T3;
|
||||
T1a = T2 - T3;
|
||||
Ti = ci[WS(rs, 5)];
|
||||
Tj = cr[WS(rs, 6)];
|
||||
Tk = Ti - Tj;
|
||||
TP = Ti + Tj;
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
Tl = Th + Tk;
|
||||
TO = FNMS(KP500000000, T4, T1);
|
||||
TQ = FNMS(KP866025403, TP, TO);
|
||||
T1y = FMA(KP866025403, TP, TO);
|
||||
T19 = FNMS(KP500000000, Tk, Th);
|
||||
T1b = FMA(KP866025403, T1a, T19);
|
||||
T1J = FNMS(KP866025403, T1a, T19);
|
||||
}
|
||||
/* Load phase, part 2: the remaining twelve inputs. */
{
|
||||
E T6, T9, TY, T12, Tm, Tp, TZ, T11, Tb, Te, TS, TU, Tr, Tu, TR;
|
||||
E TV;
|
||||
{
|
||||
E T7, T8, Tn, To;
|
||||
T6 = cr[WS(rs, 1)];
|
||||
T7 = cr[WS(rs, 4)];
|
||||
T8 = ci[WS(rs, 1)];
|
||||
T9 = T7 + T8;
|
||||
TY = FNMS(KP500000000, T9, T6);
|
||||
T12 = T7 - T8;
|
||||
Tm = ci[WS(rs, 7)];
|
||||
Tn = ci[WS(rs, 4)];
|
||||
To = cr[WS(rs, 7)];
|
||||
Tp = Tn - To;
|
||||
TZ = Tn + To;
|
||||
T11 = FMS(KP500000000, Tp, Tm);
|
||||
}
|
||||
{
|
||||
E Tc, Td, Ts, Tt;
|
||||
Tb = cr[WS(rs, 2)];
|
||||
Tc = ci[WS(rs, 3)];
|
||||
Td = ci[0];
|
||||
Te = Tc + Td;
|
||||
TS = Td - Tc;
|
||||
TU = FNMS(KP500000000, Te, Tb);
|
||||
Tr = ci[WS(rs, 6)];
|
||||
Ts = cr[WS(rs, 5)];
|
||||
Tt = cr[WS(rs, 8)];
|
||||
Tu = Ts + Tt;
|
||||
TR = FMA(KP500000000, Tu, Tr);
|
||||
TV = Ts - Tt;
|
||||
}
|
||||
{
|
||||
E Ta, Tf, T1z, T1A;
|
||||
Ta = T6 + T9;
|
||||
Tf = Tb + Te;
|
||||
Tg = Ta + Tf;
|
||||
TE = Ta - Tf;
|
||||
{
|
||||
E Tq, Tv, T1C, T1D;
|
||||
Tq = Tm + Tp;
|
||||
Tv = Tr - Tu;
|
||||
Tw = Tq + Tv;
|
||||
Tz = Tv - Tq;
|
||||
T1C = FNMS(KP866025403, TV, TU);
|
||||
T1D = FMA(KP866025403, TS, TR);
|
||||
T1E = FMA(KP363970234, T1D, T1C);
|
||||
T1L = FNMS(KP363970234, T1C, T1D);
|
||||
}
|
||||
T1z = FMA(KP866025403, T12, T11);
|
||||
T1A = FMA(KP866025403, TZ, TY);
|
||||
T1B = FMA(KP176326980, T1A, T1z);
|
||||
T1K = FNMS(KP176326980, T1z, T1A);
|
||||
{
|
||||
E T10, T13, TT, TW;
|
||||
T10 = FNMS(KP866025403, TZ, TY);
|
||||
T13 = FNMS(KP866025403, T12, T11);
|
||||
T14 = FMA(KP839099631, T13, T10);
|
||||
T1d = FNMS(KP839099631, T10, T13);
|
||||
TT = FNMS(KP866025403, TS, TR);
|
||||
TW = FMA(KP866025403, TV, TU);
|
||||
TX = FNMS(KP176326980, TW, TT);
|
||||
T1c = FMA(KP176326980, TT, TW);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Output 0 is stored without any twiddle factor. */
cr[0] = T5 + Tg;
|
||||
ci[0] = Tl + Tw;
|
||||
/* Outputs 6 and 3, using W[10], W[11] and W[4], W[5]. */
{
|
||||
E TA, TI, TF, TL, Ty, TD;
|
||||
Ty = FNMS(KP500000000, Tg, T5);
|
||||
TA = FNMS(KP866025403, Tz, Ty);
|
||||
TI = FMA(KP866025403, Tz, Ty);
|
||||
TD = FNMS(KP500000000, Tw, Tl);
|
||||
TF = FNMS(KP866025403, TE, TD);
|
||||
TL = FMA(KP866025403, TE, TD);
|
||||
{
|
||||
E TB, TG, Tx, TC;
|
||||
Tx = W[10];
|
||||
TB = Tx * TA;
|
||||
TG = Tx * TF;
|
||||
TC = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(TC, TF, TB);
|
||||
ci[WS(rs, 6)] = FMA(TC, TA, TG);
|
||||
}
|
||||
{
|
||||
E TJ, TM, TH, TK;
|
||||
TH = W[4];
|
||||
TJ = TH * TI;
|
||||
TM = TH * TL;
|
||||
TK = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(TK, TL, TJ);
|
||||
ci[WS(rs, 3)] = FMA(TK, TI, TM);
|
||||
}
|
||||
}
|
||||
/* Outputs 1, 4 and 7, using W[0], W[1], W[6], W[7] and W[12], W[13]. */
{
|
||||
E T16, T1s, T1k, T1f, T1v, T1p;
|
||||
{
|
||||
E T1j, T15, T1i, T1o, T1e, T1n;
|
||||
T1j = FMA(KP777861913, T1d, T1c);
|
||||
T15 = FNMS(KP777861913, T14, TX);
|
||||
T1i = FMA(KP492403876, T15, TQ);
|
||||
T16 = FNMS(KP984807753, T15, TQ);
|
||||
T1s = FMA(KP852868531, T1j, T1i);
|
||||
T1k = FNMS(KP852868531, T1j, T1i);
|
||||
T1o = FMA(KP777861913, T14, TX);
|
||||
T1e = FNMS(KP777861913, T1d, T1c);
|
||||
T1n = FNMS(KP492403876, T1e, T1b);
|
||||
T1f = FMA(KP984807753, T1e, T1b);
|
||||
T1v = FMA(KP852868531, T1o, T1n);
|
||||
T1p = FNMS(KP852868531, T1o, T1n);
|
||||
}
|
||||
{
|
||||
E TN, T17, T18, T1g;
|
||||
TN = W[0];
|
||||
T17 = TN * T16;
|
||||
T18 = W[1];
|
||||
T1g = T18 * T16;
|
||||
cr[WS(rs, 1)] = FNMS(T18, T1f, T17);
|
||||
ci[WS(rs, 1)] = FMA(TN, T1f, T1g);
|
||||
}
|
||||
{
|
||||
E T1t, T1w, T1r, T1u;
|
||||
T1r = W[6];
|
||||
T1t = T1r * T1s;
|
||||
T1w = T1r * T1v;
|
||||
T1u = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(T1u, T1v, T1t);
|
||||
ci[WS(rs, 4)] = FMA(T1u, T1s, T1w);
|
||||
}
|
||||
{
|
||||
E T1l, T1q, T1h, T1m;
|
||||
T1h = W[12];
|
||||
T1l = T1h * T1k;
|
||||
T1q = T1h * T1p;
|
||||
T1m = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T1m, T1p, T1l);
|
||||
ci[WS(rs, 7)] = FMA(T1m, T1k, T1q);
|
||||
}
|
||||
}
|
||||
/* Outputs 2, 8 and 5, using W[2], W[3], W[14], W[15] and W[8], W[9]. */
{
|
||||
E T1W, T1N, T1V, T1G, T20, T1S;
|
||||
T1W = FMA(KP954188894, T1E, T1B);
|
||||
{
|
||||
E T1M, T1R, T1F, T1Q;
|
||||
T1M = FNMS(KP954188894, T1L, T1K);
|
||||
T1N = FMA(KP984807753, T1M, T1J);
|
||||
T1V = FNMS(KP492403876, T1M, T1J);
|
||||
T1R = FMA(KP954188894, T1L, T1K);
|
||||
T1F = FNMS(KP954188894, T1E, T1B);
|
||||
T1Q = FNMS(KP492403876, T1F, T1y);
|
||||
T1G = FMA(KP984807753, T1F, T1y);
|
||||
T20 = FMA(KP852868531, T1R, T1Q);
|
||||
T1S = FNMS(KP852868531, T1R, T1Q);
|
||||
}
|
||||
{
|
||||
E T1H, T1O, T1x, T1I;
|
||||
T1x = W[2];
|
||||
T1H = T1x * T1G;
|
||||
T1O = T1x * T1N;
|
||||
T1I = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(T1I, T1N, T1H);
|
||||
ci[WS(rs, 2)] = FMA(T1I, T1G, T1O);
|
||||
}
|
||||
{
|
||||
E T23, T22, T24, T1Z, T21;
|
||||
T23 = FNMS(KP852868531, T1W, T1V);
|
||||
T22 = W[15];
|
||||
T24 = T22 * T20;
|
||||
T1Z = W[14];
|
||||
T21 = T1Z * T20;
|
||||
cr[WS(rs, 8)] = FNMS(T22, T23, T21);
|
||||
ci[WS(rs, 8)] = FMA(T1Z, T23, T24);
|
||||
}
|
||||
{
|
||||
E T1X, T1U, T1Y, T1P, T1T;
|
||||
T1X = FMA(KP852868531, T1W, T1V);
|
||||
T1U = W[9];
|
||||
T1Y = T1U * T1S;
|
||||
T1P = W[8];
|
||||
T1T = T1P * T1S;
|
||||
cr[WS(rs, 5)] = FNMS(T1U, T1X, T1T);
|
||||
ci[WS(rs, 5)] = FMA(T1P, T1X, T1Y);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table for the hb_9 codelet; it is referenced by the
 * descriptor `desc` defined right after it.  The table consists of one
 * TW_FULL entry with arguments (1, 9) followed by a TW_NEXT entry that
 * closes the list.
 * NOTE(review): the TW_* opcodes are defined outside this file.  TW_FULL
 * presumably requests the complete twiddle set of a size-9 transform,
 * which would be consistent with hb_9 indexing W[0]..W[15] -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor registered together with the hb_9 kernel.  As initialised
 * here it carries: the number 9, the name string "hb_9", the twiddle table
 * `twinstr`, the address of GENUS (declared outside this file) and the
 * four numbers { 24, 16, 72, 0 }.
 * NOTE(review): the four numbers are presumably the add / multiply /
 * fused-multiply-add / other operation counts of this (FMA) variant --
 * confirm against the hc2hc_desc definition.
 */
static const hc2hc_desc desc = { 9, "hb_9", twinstr, &GENUS, { 24, 16, 72, 0 } };
|
||||
|
||||
/*
 * Registration entry point of this codelet: hands the planner `p`, the
 * hb_9 kernel and its descriptor `desc` to X(khc2hc_register).
 * X() is a macro defined outside this file (presumably it adds the
 * library's symbol prefix -- confirm).
 */
void X(codelet_hb_9) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_9, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -dif -name hb_9 -include rdft/scalar/hb.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 72 FP multiplications,
|
||||
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
|
||||
* 53 stack variables, 8 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hb.h"
|
||||
|
||||
/*
 * hb_9 -- size-9 twiddle codelet, non-FMA code path (generated with
 * "-n 9 -sign 1 -dif", see the generator command line recorded above).
 *
 * The kernel works in place.  For every m in [mb, me) it loads nine values
 * from cr (cr[0] and cr[WS(rs, 1..8)]) and nine from ci (ci[0] and
 * ci[WS(rs, 1..8)]), combines them with the constants declared below and
 * stores 18 results back into the same slots:
 *   - cr[0] and ci[0] receive plain sums (no twiddle factor);
 *   - for k = 1..8 the pair cr[WS(rs, k)], ci[WS(rs, k)] is written as an
 *     FNMS/FMA combination of two intermediate results with the pair
 *     (W[2k-2], W[2k-1]), i.e. a complex multiplication, assuming
 *     FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b (macros defined
 *     outside this file).
 *
 * Parameters:
 *   cr, ci  data arrays, read and overwritten; after each iteration cr
 *           moves forward by ms while ci moves backward by ms.
 *   W       twiddle table, read only; 16 values are used per iteration.
 *   rs      stride object passed to WS() to address element k.
 *   mb, me  half-open range of the iteration counter m.
 *   ms      pointer step applied to cr / ci between iterations.
 *
 * R, E, INT, stride, WS and DK come from headers outside this file.
 */
static void hb_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/*
 * Constants, named after their leading decimal digits:
 *   KP984807753 = sin(4*pi/9)    KP173648177 = cos(4*pi/9)
 *   KP342020143 = sin(pi/9)      KP939692620 = cos(pi/9)
 *   KP642787609 = sin(2*pi/9)    KP766044443 = cos(2*pi/9)
 *   KP500000000 = 1/2            KP866025403 = sqrt(3)/2
 */
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
|
||||
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
|
||||
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
|
||||
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
|
||||
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/*
 * Loop setup: W is first moved to W + (mb - 1) * 16, so the row used for
 * m == 1 sits at the start of the table.  Each iteration then advances m
 * by 1, cr by +ms, ci by -ms and W by 16.
 * MAKE_VOLATILE_STRIDE(18, rs) is a macro defined outside this file that
 * is evaluated once per iteration.
 * NOTE(review): presumably a compiler workaround with no numerical
 * effect -- confirm.
 */
for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
|
||||
E T5, Tl, TM, T1o, T16, T1y, Ta, Tf, Tg, Tq, Tv, Tw, TT, T17, T1u;
|
||||
E T1A, T1r, T1z, T10, T18;
|
||||
{
|
||||
E T1, Th, T4, T14, Tk, TL, TK, T15;
|
||||
T1 = cr[0];
|
||||
Th = ci[WS(rs, 8)];
|
||||
{
|
||||
E T2, T3, Ti, Tj;
|
||||
T2 = cr[WS(rs, 3)];
|
||||
T3 = ci[WS(rs, 2)];
|
||||
T4 = T2 + T3;
|
||||
T14 = KP866025403 * (T2 - T3);
|
||||
Ti = ci[WS(rs, 5)];
|
||||
Tj = cr[WS(rs, 6)];
|
||||
Tk = Ti - Tj;
|
||||
TL = KP866025403 * (Ti + Tj);
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
Tl = Th + Tk;
|
||||
TK = FNMS(KP500000000, T4, T1);
|
||||
TM = TK - TL;
|
||||
T1o = TK + TL;
|
||||
T15 = FNMS(KP500000000, Tk, Th);
|
||||
T16 = T14 + T15;
|
||||
T1y = T15 - T14;
|
||||
}
|
||||
{
|
||||
E T6, T9, TN, TQ, Tm, Tp, TO, TR, Tb, Te, TU, TX, Tr, Tu, TV;
|
||||
E TY;
|
||||
{
|
||||
E T7, T8, Tn, To;
|
||||
T6 = cr[WS(rs, 1)];
|
||||
T7 = cr[WS(rs, 4)];
|
||||
T8 = ci[WS(rs, 1)];
|
||||
T9 = T7 + T8;
|
||||
TN = FNMS(KP500000000, T9, T6);
|
||||
TQ = KP866025403 * (T7 - T8);
|
||||
Tm = ci[WS(rs, 7)];
|
||||
Tn = ci[WS(rs, 4)];
|
||||
To = cr[WS(rs, 7)];
|
||||
Tp = Tn - To;
|
||||
TO = KP866025403 * (Tn + To);
|
||||
TR = FNMS(KP500000000, Tp, Tm);
|
||||
}
|
||||
{
|
||||
E Tc, Td, Ts, Tt;
|
||||
Tb = cr[WS(rs, 2)];
|
||||
Tc = ci[WS(rs, 3)];
|
||||
Td = ci[0];
|
||||
Te = Tc + Td;
|
||||
TU = FNMS(KP500000000, Te, Tb);
|
||||
TX = KP866025403 * (Tc - Td);
|
||||
Tr = ci[WS(rs, 6)];
|
||||
Ts = cr[WS(rs, 5)];
|
||||
Tt = cr[WS(rs, 8)];
|
||||
Tu = Ts + Tt;
|
||||
TV = KP866025403 * (Ts - Tt);
|
||||
TY = FMA(KP500000000, Tu, Tr);
|
||||
}
|
||||
{
|
||||
E TP, TS, T1s, T1t;
|
||||
Ta = T6 + T9;
|
||||
Tf = Tb + Te;
|
||||
Tg = Ta + Tf;
|
||||
Tq = Tm + Tp;
|
||||
Tv = Tr - Tu;
|
||||
Tw = Tq + Tv;
|
||||
TP = TN - TO;
|
||||
TS = TQ + TR;
|
||||
TT = FNMS(KP642787609, TS, KP766044443 * TP);
|
||||
T17 = FMA(KP766044443, TS, KP642787609 * TP);
|
||||
T1s = TU - TV;
|
||||
T1t = TY - TX;
|
||||
T1u = FMA(KP939692620, T1s, KP342020143 * T1t);
|
||||
T1A = FNMS(KP939692620, T1t, KP342020143 * T1s);
|
||||
{
|
||||
E T1p, T1q, TW, TZ;
|
||||
T1p = TN + TO;
|
||||
T1q = TR - TQ;
|
||||
T1r = FNMS(KP984807753, T1q, KP173648177 * T1p);
|
||||
T1z = FMA(KP173648177, T1q, KP984807753 * T1p);
|
||||
TW = TU + TV;
|
||||
TZ = TX + TY;
|
||||
T10 = FNMS(KP984807753, TZ, KP173648177 * TW);
|
||||
T18 = FMA(KP984807753, TW, KP173648177 * TZ);
|
||||
}
|
||||
}
|
||||
}
|
||||
cr[0] = T5 + Tg;
|
||||
ci[0] = Tl + Tw;
|
||||
{
|
||||
E TA, TG, TE, TI;
|
||||
{
|
||||
E Ty, Tz, TC, TD;
|
||||
Ty = FNMS(KP500000000, Tg, T5);
|
||||
Tz = KP866025403 * (Tv - Tq);
|
||||
TA = Ty - Tz;
|
||||
TG = Ty + Tz;
|
||||
TC = FNMS(KP500000000, Tw, Tl);
|
||||
TD = KP866025403 * (Ta - Tf);
|
||||
TE = TC - TD;
|
||||
TI = TD + TC;
|
||||
}
|
||||
{
|
||||
E Tx, TB, TF, TH;
|
||||
Tx = W[10];
|
||||
TB = W[11];
|
||||
cr[WS(rs, 6)] = FNMS(TB, TE, Tx * TA);
|
||||
ci[WS(rs, 6)] = FMA(Tx, TE, TB * TA);
|
||||
TF = W[4];
|
||||
TH = W[5];
|
||||
cr[WS(rs, 3)] = FNMS(TH, TI, TF * TG);
|
||||
ci[WS(rs, 3)] = FMA(TF, TI, TH * TG);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1d, T1h, T12, T1c, T1a, T1g, T11, T19, TJ, T13;
|
||||
T1d = KP866025403 * (T18 - T17);
|
||||
T1h = KP866025403 * (TT - T10);
|
||||
T11 = TT + T10;
|
||||
T12 = TM + T11;
|
||||
T1c = FNMS(KP500000000, T11, TM);
|
||||
T19 = T17 + T18;
|
||||
T1a = T16 + T19;
|
||||
T1g = FNMS(KP500000000, T19, T16);
|
||||
TJ = W[0];
|
||||
T13 = W[1];
|
||||
cr[WS(rs, 1)] = FNMS(T13, T1a, TJ * T12);
|
||||
ci[WS(rs, 1)] = FMA(T13, T12, TJ * T1a);
|
||||
{
|
||||
E T1k, T1m, T1j, T1l;
|
||||
T1k = T1c + T1d;
|
||||
T1m = T1h + T1g;
|
||||
T1j = W[6];
|
||||
T1l = W[7];
|
||||
cr[WS(rs, 4)] = FNMS(T1l, T1m, T1j * T1k);
|
||||
ci[WS(rs, 4)] = FMA(T1j, T1m, T1l * T1k);
|
||||
}
|
||||
{
|
||||
E T1e, T1i, T1b, T1f;
|
||||
T1e = T1c - T1d;
|
||||
T1i = T1g - T1h;
|
||||
T1b = W[12];
|
||||
T1f = W[13];
|
||||
cr[WS(rs, 7)] = FNMS(T1f, T1i, T1b * T1e);
|
||||
ci[WS(rs, 7)] = FMA(T1b, T1i, T1f * T1e);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1F, T1J, T1w, T1E, T1C, T1I, T1v, T1B, T1n, T1x;
|
||||
T1F = KP866025403 * (T1A - T1z);
|
||||
T1J = KP866025403 * (T1r + T1u);
|
||||
T1v = T1r - T1u;
|
||||
T1w = T1o + T1v;
|
||||
T1E = FNMS(KP500000000, T1v, T1o);
|
||||
T1B = T1z + T1A;
|
||||
T1C = T1y + T1B;
|
||||
T1I = FNMS(KP500000000, T1B, T1y);
|
||||
T1n = W[2];
|
||||
T1x = W[3];
|
||||
cr[WS(rs, 2)] = FNMS(T1x, T1C, T1n * T1w);
|
||||
ci[WS(rs, 2)] = FMA(T1n, T1C, T1x * T1w);
|
||||
{
|
||||
E T1M, T1O, T1L, T1N;
|
||||
T1M = T1F + T1E;
|
||||
T1O = T1I + T1J;
|
||||
T1L = W[8];
|
||||
T1N = W[9];
|
||||
cr[WS(rs, 5)] = FNMS(T1N, T1O, T1L * T1M);
|
||||
ci[WS(rs, 5)] = FMA(T1N, T1M, T1L * T1O);
|
||||
}
|
||||
{
|
||||
E T1G, T1K, T1D, T1H;
|
||||
T1G = T1E - T1F;
|
||||
T1K = T1I - T1J;
|
||||
T1D = W[14];
|
||||
T1H = W[15];
|
||||
cr[WS(rs, 8)] = FNMS(T1H, T1K, T1D * T1G);
|
||||
ci[WS(rs, 8)] = FMA(T1H, T1G, T1D * T1K);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table for the hb_9 codelet; it is referenced by the
 * descriptor `desc` defined right after it.  The table consists of one
 * TW_FULL entry with arguments (1, 9) followed by a TW_NEXT entry that
 * closes the list.
 * NOTE(review): the TW_* opcodes are defined outside this file.  TW_FULL
 * presumably requests the complete twiddle set of a size-9 transform,
 * which would match the 16 values of W that hb_9 consumes per
 * iteration -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor registered together with the hb_9 kernel.  As initialised
 * here it carries: the number 9, the name string "hb_9", the twiddle table
 * `twinstr`, the address of GENUS (declared outside this file) and the
 * four numbers { 60, 36, 36, 0 }.  The first three equal the operation
 * counts quoted in the generator comment of this variant (60 additions,
 * 36 multiplications, 36 fused multiply/add); the role of the trailing 0
 * is not visible in this file.
 */
static const hc2hc_desc desc = { 9, "hb_9", twinstr, &GENUS, { 60, 36, 36, 0 } };
|
||||
|
||||
/*
 * Registration entry point of this codelet: hands the planner `p`, the
 * hb_9 kernel and its descriptor `desc` to X(khc2hc_register).
 * X() is a macro defined outside this file (presumably it adds the
 * library's symbol prefix -- confirm).
 */
void X(codelet_hb_9) (planner *p) {
|
||||
X(khc2hc_register) (p, hb_9, &desc);
|
||||
}
|
||||
#endif
|
||||
858
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_16.c
Normal file
858
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_16.c
Normal file
@@ -0,0 +1,858 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:09 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 134 FP multiplications,
|
||||
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
|
||||
* 93 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb2_16 -- size-16 twiddle codelet, FMA code path (generated with
 * "-n 16 -sign 1 -dif -twiddle-log3 -precompute-twiddles", see the
 * generator command line recorded above).
 *
 * The kernel works in place on four arrays.  For every m in [mb, me) it
 * loads eight values from each of Rp, Ip, Rm and Im (index 0 and
 * WS(rs, 1..7)) and stores eight results back into each of them.
 * Rp[0] and Rm[0] receive plain sums; every other output pair
 * (Rp[k], Rm[k]) or (Ip[k], Im[k]) is written as an FMA/FNMS combination
 * of two intermediate results with a twiddle pair.
 *
 * Only the eight values W[0]..W[7] are loaded per iteration.  They are
 * applied directly to the outputs Ip/Im[0] (W[0], W[1]), Ip/Im[WS(rs, 1)]
 * (W[2], W[3]), Ip/Im[WS(rs, 4)] (W[4], W[5]) and Ip/Im[WS(rs, 7)]
 * (W[6], W[7]); all remaining twiddle pairs are formed at the top of the
 * loop body as products of the loaded ones.
 *
 * Parameters:
 *   Rp, Ip  data arrays, read and overwritten; moved forward by ms after
 *           each iteration.
 *   Rm, Im  data arrays, read and overwritten; moved backward by ms after
 *           each iteration.
 *   W       twiddle table, read only; 8 values are used per iteration.
 *   rs      stride object passed to WS() to address element k.
 *   mb, me  half-open range of the iteration counter m.
 *   ms      pointer step applied to the data arrays between iterations.
 *
 * R, E, INT, stride, WS, DK, FMA and FNMS come from headers outside this
 * file.
 */
static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/*
 * Constants, named after their leading decimal digits:
 *   KP923879532 = cos(pi/8)
 *   KP707106781 = sqrt(2)/2
 *   KP414213562 = sqrt(2) - 1 = tan(pi/8)
 */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
{
|
||||
INT m;
|
||||
/*
 * Loop setup: W is first moved to W + (mb - 1) * 8, so the row used for
 * m == 1 sits at the start of the table.  Each iteration then advances m
 * by 1, Rp and Ip by +ms, Rm and Im by -ms, and W by 8.
 * MAKE_VOLATILE_STRIDE(64, rs) is a macro defined outside this file that
 * is evaluated once per iteration.
 * NOTE(review): presumably a compiler workaround with no numerical
 * effect -- confirm.
 */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E Tv, Tw, T2z, T2C, TB, TF, Ty, Tz, T1V, TA, T2G, T3Q, T3C, T3g, T3L;
|
||||
E T30, T3m, T3z, T3w, T3s, T1X, T1Y, T2u, T2c, T2p, TE, TG, T1G, T1o, T1D;
|
||||
{
|
||||
E T3f, T3l, T2F, T3r, T2Z, T3v, TD, Tx;
|
||||
Tv = W[0];
|
||||
Tw = W[2];
|
||||
Tx = Tv * Tw;
|
||||
T2z = W[6];
|
||||
T3f = Tv * T2z;
|
||||
T2C = W[7];
|
||||
T3l = Tv * T2C;
|
||||
TB = W[4];
|
||||
T2F = Tv * TB;
|
||||
T3r = Tw * TB;
|
||||
TF = W[5];
|
||||
T2Z = Tv * TF;
|
||||
T3v = Tw * TF;
|
||||
Ty = W[1];
|
||||
Tz = W[3];
|
||||
TD = Tv * Tz;
|
||||
T1V = FMA(Ty, Tz, Tx);
|
||||
TA = FNMS(Ty, Tz, Tx);
|
||||
T2G = FNMS(Ty, TF, T2F);
|
||||
T3Q = FMA(Tz, TB, T3v);
|
||||
T3C = FNMS(Ty, TB, T2Z);
|
||||
T3g = FMA(Ty, T2C, T3f);
|
||||
T3L = FNMS(Tz, TF, T3r);
|
||||
T30 = FMA(Ty, TB, T2Z);
|
||||
T3m = FNMS(Ty, T2z, T3l);
|
||||
T3z = FMA(Ty, TF, T2F);
|
||||
T3w = FNMS(Tz, TB, T3v);
|
||||
T3s = FMA(Tz, TF, T3r);
|
||||
{
|
||||
E T1W, T2b, TC, T1n;
|
||||
T1W = T1V * TB;
|
||||
T2b = T1V * TF;
|
||||
T1X = FNMS(Ty, Tw, TD);
|
||||
T1Y = FNMS(T1X, TF, T1W);
|
||||
T2u = FNMS(T1X, TB, T2b);
|
||||
T2c = FMA(T1X, TB, T2b);
|
||||
T2p = FMA(T1X, TF, T1W);
|
||||
TC = TA * TB;
|
||||
T1n = TA * TF;
|
||||
TE = FMA(Ty, Tw, TD);
|
||||
TG = FNMS(TE, TF, TC);
|
||||
T1G = FNMS(TE, TB, T1n);
|
||||
T1o = FMA(TE, TB, T1n);
|
||||
T1D = FMA(TE, TF, TC);
|
||||
}
|
||||
}
|
||||
{
|
||||
E TL, T1Z, T2d, T1t, T31, T34, T3n, T3D, T3E, T3R, T1w, T20, Tf, T3M, T2L;
|
||||
E T3h, TW, T2e, T3G, T3H, T3N, T2Q, T36, T2V, T37, Tu, T3S, T18, T1z, T24;
|
||||
E T2g, T27, T2h, T1j, T1y;
|
||||
{
|
||||
E T3, TH, T1s, T32, T6, T1p, TK, T33, Ta, TM, TP, T2J, Td, TR, TU;
|
||||
E T2I;
|
||||
{
|
||||
E T1, T2, T1q, T1r;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
TH = T1 - T2;
|
||||
T1q = Ip[0];
|
||||
T1r = Im[WS(rs, 7)];
|
||||
T1s = T1q + T1r;
|
||||
T32 = T1q - T1r;
|
||||
}
|
||||
{
|
||||
E T4, T5, TI, TJ;
|
||||
T4 = Rp[WS(rs, 4)];
|
||||
T5 = Rm[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
T1p = T4 - T5;
|
||||
TI = Ip[WS(rs, 4)];
|
||||
TJ = Im[WS(rs, 3)];
|
||||
TK = TI + TJ;
|
||||
T33 = TI - TJ;
|
||||
}
|
||||
{
|
||||
E T8, T9, TN, TO;
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = Rm[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
TM = T8 - T9;
|
||||
TN = Ip[WS(rs, 2)];
|
||||
TO = Im[WS(rs, 5)];
|
||||
TP = TN + TO;
|
||||
T2J = TN - TO;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TS, TT;
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
Tc = Rp[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
TR = Tb - Tc;
|
||||
TS = Ip[WS(rs, 6)];
|
||||
TT = Im[WS(rs, 1)];
|
||||
TU = TS + TT;
|
||||
T2I = TS - TT;
|
||||
}
|
||||
TL = TH - TK;
|
||||
T1Z = TH + TK;
|
||||
T2d = T1s - T1p;
|
||||
T1t = T1p + T1s;
|
||||
T31 = Ta - Td;
|
||||
T34 = T32 - T33;
|
||||
T3n = T34 - T31;
|
||||
{
|
||||
E T1u, T1v, T7, Te;
|
||||
T3D = T32 + T33;
|
||||
T3E = T2J + T2I;
|
||||
T3R = T3D - T3E;
|
||||
T1u = TM + TP;
|
||||
T1v = TR + TU;
|
||||
T1w = T1u - T1v;
|
||||
T20 = T1u + T1v;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 + Te;
|
||||
T3M = T7 - Te;
|
||||
{
|
||||
E T2H, T2K, TQ, TV;
|
||||
T2H = T3 - T6;
|
||||
T2K = T2I - T2J;
|
||||
T2L = T2H + T2K;
|
||||
T3h = T2H - T2K;
|
||||
TQ = TM - TP;
|
||||
TV = TR - TU;
|
||||
TW = TQ + TV;
|
||||
T2e = TQ - TV;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
E Ti, T1e, T1c, T2N, Tl, T19, T1h, T2O, Tp, T13, T11, T2S, Ts, TY, T16;
|
||||
E T2T, T2M, T2P;
|
||||
{
|
||||
E Tg, Th, T1a, T1b;
|
||||
Tg = Rp[WS(rs, 1)];
|
||||
Th = Rm[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
T1e = Tg - Th;
|
||||
T1a = Ip[WS(rs, 1)];
|
||||
T1b = Im[WS(rs, 6)];
|
||||
T1c = T1a + T1b;
|
||||
T2N = T1a - T1b;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, T1f, T1g;
|
||||
Tj = Rp[WS(rs, 5)];
|
||||
Tk = Rm[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
T19 = Tj - Tk;
|
||||
T1f = Ip[WS(rs, 5)];
|
||||
T1g = Im[WS(rs, 2)];
|
||||
T1h = T1f + T1g;
|
||||
T2O = T1f - T1g;
|
||||
}
|
||||
{
|
||||
E Tn, To, TZ, T10;
|
||||
Tn = Rm[0];
|
||||
To = Rp[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T13 = Tn - To;
|
||||
TZ = Ip[WS(rs, 7)];
|
||||
T10 = Im[0];
|
||||
T11 = TZ + T10;
|
||||
T2S = TZ - T10;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T14, T15;
|
||||
Tq = Rp[WS(rs, 3)];
|
||||
Tr = Rm[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
TY = Tq - Tr;
|
||||
T14 = Ip[WS(rs, 3)];
|
||||
T15 = Im[WS(rs, 4)];
|
||||
T16 = T14 + T15;
|
||||
T2T = T14 - T15;
|
||||
}
|
||||
T3G = T2N + T2O;
|
||||
T3H = T2S + T2T;
|
||||
T3N = T3H - T3G;
|
||||
T2M = Ti - Tl;
|
||||
T2P = T2N - T2O;
|
||||
T2Q = T2M - T2P;
|
||||
T36 = T2M + T2P;
|
||||
{
|
||||
E T2R, T2U, Tm, Tt;
|
||||
T2R = Tp - Ts;
|
||||
T2U = T2S - T2T;
|
||||
T2V = T2R + T2U;
|
||||
T37 = T2U - T2R;
|
||||
Tm = Ti + Tl;
|
||||
Tt = Tp + Ts;
|
||||
Tu = Tm + Tt;
|
||||
T3S = Tm - Tt;
|
||||
}
|
||||
{
|
||||
E T12, T17, T22, T23;
|
||||
T12 = TY - T11;
|
||||
T17 = T13 - T16;
|
||||
T18 = FNMS(KP414213562, T17, T12);
|
||||
T1z = FMA(KP414213562, T12, T17);
|
||||
T22 = T1c - T19;
|
||||
T23 = T1e + T1h;
|
||||
T24 = FNMS(KP414213562, T23, T22);
|
||||
T2g = FMA(KP414213562, T22, T23);
|
||||
}
|
||||
{
|
||||
E T25, T26, T1d, T1i;
|
||||
T25 = TY + T11;
|
||||
T26 = T13 + T16;
|
||||
T27 = FNMS(KP414213562, T26, T25);
|
||||
T2h = FMA(KP414213562, T25, T26);
|
||||
T1d = T19 + T1c;
|
||||
T1i = T1e - T1h;
|
||||
T1j = FMA(KP414213562, T1i, T1d);
|
||||
T1y = FNMS(KP414213562, T1d, T1i);
|
||||
}
|
||||
}
|
||||
Rp[0] = Tf + Tu;
|
||||
{
|
||||
E T3B, T3K, T3F, T3I, T3J, T3A;
|
||||
T3A = Tf - Tu;
|
||||
T3B = T3z * T3A;
|
||||
T3K = T3C * T3A;
|
||||
T3F = T3D + T3E;
|
||||
T3I = T3G + T3H;
|
||||
T3J = T3F - T3I;
|
||||
Rm[0] = T3F + T3I;
|
||||
Rm[WS(rs, 4)] = FMA(T3z, T3J, T3K);
|
||||
Rp[WS(rs, 4)] = FNMS(T3C, T3J, T3B);
|
||||
}
|
||||
{
|
||||
E T3O, T3P, T3T, T3U;
|
||||
T3O = T3M - T3N;
|
||||
T3P = T3L * T3O;
|
||||
T3T = T3R - T3S;
|
||||
T3U = T3L * T3T;
|
||||
Rp[WS(rs, 6)] = FNMS(T3Q, T3T, T3P);
|
||||
Rm[WS(rs, 6)] = FMA(T3Q, T3O, T3U);
|
||||
}
|
||||
{
|
||||
E T3V, T3W, T3X, T3Y;
|
||||
T3V = T3M + T3N;
|
||||
T3W = TA * T3V;
|
||||
T3X = T3S + T3R;
|
||||
T3Y = TA * T3X;
|
||||
Rp[WS(rs, 2)] = FNMS(TE, T3X, T3W);
|
||||
Rm[WS(rs, 2)] = FMA(TE, T3V, T3Y);
|
||||
}
|
||||
{
|
||||
E T3j, T3t, T3p, T3x, T3i, T3o;
|
||||
T3i = T37 - T36;
|
||||
T3j = FNMS(KP707106781, T3i, T3h);
|
||||
T3t = FMA(KP707106781, T3i, T3h);
|
||||
T3o = T2Q - T2V;
|
||||
T3p = FNMS(KP707106781, T3o, T3n);
|
||||
T3x = FMA(KP707106781, T3o, T3n);
|
||||
{
|
||||
E T3k, T3q, T3u, T3y;
|
||||
T3k = T3g * T3j;
|
||||
Rp[WS(rs, 7)] = FNMS(T3m, T3p, T3k);
|
||||
T3q = T3g * T3p;
|
||||
Rm[WS(rs, 7)] = FMA(T3m, T3j, T3q);
|
||||
T3u = T3s * T3t;
|
||||
Rp[WS(rs, 3)] = FNMS(T3w, T3x, T3u);
|
||||
T3y = T3s * T3x;
|
||||
Rm[WS(rs, 3)] = FMA(T3w, T3t, T3y);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2X, T3b, T39, T3d, T2W, T35, T38;
|
||||
T2W = T2Q + T2V;
|
||||
T2X = FNMS(KP707106781, T2W, T2L);
|
||||
T3b = FMA(KP707106781, T2W, T2L);
|
||||
T35 = T31 + T34;
|
||||
T38 = T36 + T37;
|
||||
T39 = FNMS(KP707106781, T38, T35);
|
||||
T3d = FMA(KP707106781, T38, T35);
|
||||
{
|
||||
E T2Y, T3a, T3c, T3e;
|
||||
T2Y = T2G * T2X;
|
||||
Rp[WS(rs, 5)] = FNMS(T30, T39, T2Y);
|
||||
T3a = T30 * T2X;
|
||||
Rm[WS(rs, 5)] = FMA(T2G, T39, T3a);
|
||||
T3c = T1V * T3b;
|
||||
Rp[WS(rs, 1)] = FNMS(T1X, T3d, T3c);
|
||||
T3e = T1X * T3b;
|
||||
Rm[WS(rs, 1)] = FMA(T1V, T3d, T3e);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T29, T2l, T2j, T2n;
|
||||
{
|
||||
E T21, T28, T2f, T2i;
|
||||
T21 = FNMS(KP707106781, T20, T1Z);
|
||||
T28 = T24 + T27;
|
||||
T29 = FMA(KP923879532, T28, T21);
|
||||
T2l = FNMS(KP923879532, T28, T21);
|
||||
T2f = FMA(KP707106781, T2e, T2d);
|
||||
T2i = T2g - T2h;
|
||||
T2j = FNMS(KP923879532, T2i, T2f);
|
||||
T2n = FMA(KP923879532, T2i, T2f);
|
||||
}
|
||||
{
|
||||
E T2a, T2k, T2m, T2o;
|
||||
T2a = T1Y * T29;
|
||||
Ip[WS(rs, 5)] = FNMS(T2c, T2j, T2a);
|
||||
T2k = T2c * T29;
|
||||
Im[WS(rs, 5)] = FMA(T1Y, T2j, T2k);
|
||||
T2m = Tw * T2l;
|
||||
Ip[WS(rs, 1)] = FNMS(Tz, T2n, T2m);
|
||||
T2o = Tz * T2l;
|
||||
Im[WS(rs, 1)] = FMA(Tw, T2n, T2o);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1l, T1E, T1B, T1H;
|
||||
{
|
||||
E TX, T1k, T1x, T1A;
|
||||
TX = FNMS(KP707106781, TW, TL);
|
||||
T1k = T18 - T1j;
|
||||
T1l = FNMS(KP923879532, T1k, TX);
|
||||
T1E = FMA(KP923879532, T1k, TX);
|
||||
T1x = FNMS(KP707106781, T1w, T1t);
|
||||
T1A = T1y - T1z;
|
||||
T1B = FNMS(KP923879532, T1A, T1x);
|
||||
T1H = FMA(KP923879532, T1A, T1x);
|
||||
}
|
||||
{
|
||||
E T1m, T1C, T1F, T1I;
|
||||
T1m = TG * T1l;
|
||||
Ip[WS(rs, 6)] = FNMS(T1o, T1B, T1m);
|
||||
T1C = T1o * T1l;
|
||||
Im[WS(rs, 6)] = FMA(TG, T1B, T1C);
|
||||
T1F = T1D * T1E;
|
||||
Ip[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
|
||||
T1I = T1G * T1E;
|
||||
Im[WS(rs, 2)] = FMA(T1D, T1H, T1I);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2s, T2A, T2x, T2D;
|
||||
{
|
||||
E T2q, T2r, T2v, T2w;
|
||||
T2q = FMA(KP707106781, T20, T1Z);
|
||||
T2r = T2g + T2h;
|
||||
T2s = FNMS(KP923879532, T2r, T2q);
|
||||
T2A = FMA(KP923879532, T2r, T2q);
|
||||
T2v = FNMS(KP707106781, T2e, T2d);
|
||||
T2w = T27 - T24;
|
||||
T2x = FMA(KP923879532, T2w, T2v);
|
||||
T2D = FNMS(KP923879532, T2w, T2v);
|
||||
}
|
||||
{
|
||||
E T2t, T2y, T2B, T2E;
|
||||
T2t = T2p * T2s;
|
||||
Ip[WS(rs, 3)] = FNMS(T2u, T2x, T2t);
|
||||
T2y = T2p * T2x;
|
||||
Im[WS(rs, 3)] = FMA(T2u, T2s, T2y);
|
||||
T2B = T2z * T2A;
|
||||
Ip[WS(rs, 7)] = FNMS(T2C, T2D, T2B);
|
||||
T2E = T2z * T2D;
|
||||
Im[WS(rs, 7)] = FMA(T2C, T2A, T2E);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1L, T1R, T1P, T1T;
|
||||
{
|
||||
E T1J, T1K, T1N, T1O;
|
||||
T1J = FMA(KP707106781, TW, TL);
|
||||
T1K = T1y + T1z;
|
||||
T1L = FNMS(KP923879532, T1K, T1J);
|
||||
T1R = FMA(KP923879532, T1K, T1J);
|
||||
T1N = FMA(KP707106781, T1w, T1t);
|
||||
T1O = T1j + T18;
|
||||
T1P = FNMS(KP923879532, T1O, T1N);
|
||||
T1T = FMA(KP923879532, T1O, T1N);
|
||||
}
|
||||
{
|
||||
E T1M, T1Q, T1S, T1U;
|
||||
T1M = TB * T1L;
|
||||
Ip[WS(rs, 4)] = FNMS(TF, T1P, T1M);
|
||||
T1Q = TB * T1P;
|
||||
Im[WS(rs, 4)] = FMA(TF, T1L, T1Q);
|
||||
T1S = Tv * T1R;
|
||||
Ip[0] = FNMS(Ty, T1T, T1S);
|
||||
T1U = Tv * T1T;
|
||||
Im[0] = FMA(Ty, T1R, T1U);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table for the hc2cb2_16 codelet; it is referenced
 * by the descriptor `desc` defined right after it.  The table holds four
 * TW_CEXP entries whose last arguments are 1, 3, 9 and 15, followed by a
 * TW_NEXT entry that closes the list.
 * NOTE(review): the TW_* opcodes are defined outside this file.  Each
 * TW_CEXP presumably stores one complex exponential as two values, which
 * would match the 8 values of W consumed per iteration; the indices
 * 1, 3, 9, 15 equal 2k+1 for the outputs Ip/Im[k], k = 0, 1, 4, 7, that
 * use the loaded pairs directly -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor registered together with the hc2cb2_16 kernel.  As
 * initialised here it carries: the number 16, the name string
 * "hc2cb2_16", the twiddle table `twinstr`, the address of GENUS
 * (declared outside this file) and the four numbers { 104, 42, 92, 0 }.
 * The first three equal the operation counts quoted in the generator
 * comment of this variant (104 additions, 42 multiplications, 92 fused
 * multiply/add); the role of the trailing 0 is not visible in this file.
 */
static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, { 104, 42, 92, 0 } };
|
||||
|
||||
/*
 * Registration entry point of this codelet: hands the planner `p`, the
 * hc2cb2_16 kernel, its descriptor `desc` and the constant HC2C_VIA_RDFT
 * to X(khc2c_register).
 * X() and HC2C_VIA_RDFT are defined outside this file (X() presumably
 * adds the library's symbol prefix -- confirm).
 */
void X(codelet_hc2cb2_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 108 FP multiplications,
|
||||
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
|
||||
* 80 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb2_16 -- size-16 twiddle codelet, non-FMA code path (generated with
 * "-n 16 -sign 1 -dif -twiddle-log3 -precompute-twiddles", see the
 * generator command line recorded above).
 *
 * The kernel works in place on four arrays.  For every m in [mb, me) it
 * loads eight values from each of Rp, Ip, Rm and Im (index 0 and
 * WS(rs, 1..7)) and stores eight results back into each of them.
 * Rp[0] and Rm[0] receive plain sums; every other output pair
 * (Rp[k], Rm[k]) or (Ip[k], Im[k]) is written as an FMA/FNMS combination
 * of two intermediate results with a twiddle pair.
 *
 * Only the eight values W[0]..W[7] are loaded per iteration.  They are
 * applied directly to the outputs Ip/Im[0] (W[0], W[1]), Ip/Im[WS(rs, 1)]
 * (W[2], W[3]), Ip/Im[WS(rs, 4)] (W[4], W[5]) and Ip/Im[WS(rs, 7)]
 * (W[6], W[7]); all remaining twiddle pairs are formed at the top of the
 * loop body as sums and differences of products of the loaded ones.
 *
 * Parameters:
 *   Rp, Ip  data arrays, read and overwritten; moved forward by ms after
 *           each iteration.
 *   Rm, Im  data arrays, read and overwritten; moved backward by ms after
 *           each iteration.
 *   W       twiddle table, read only; 8 values are used per iteration.
 *   rs      stride object passed to WS() to address element k.
 *   mb, me  half-open range of the iteration counter m.
 *   ms      pointer step applied to the data arrays between iterations.
 *
 * R, E, INT, stride, WS, DK, FMA and FNMS come from headers outside this
 * file.
 */
static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/*
 * Constants, named after their leading decimal digits:
 *   KP382683432 = sin(pi/8)
 *   KP923879532 = cos(pi/8)
 *   KP707106781 = sqrt(2)/2
 */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/*
 * Loop setup: W is first moved to W + (mb - 1) * 8, so the row used for
 * m == 1 sits at the start of the table.  Each iteration then advances m
 * by 1, Rp and Ip by +ms, Rm and Im by -ms, and W by 8.
 * MAKE_VOLATILE_STRIDE(64, rs) is a macro defined outside this file that
 * is evaluated once per iteration.
 * NOTE(review): presumably a compiler workaround with no numerical
 * effect -- confirm.
 */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
|
||||
E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
|
||||
{
|
||||
E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
|
||||
{
|
||||
E T1m, T1s, T1o, T1r;
|
||||
Tv = W[0];
|
||||
Ty = W[1];
|
||||
T1l = W[2];
|
||||
T1n = W[3];
|
||||
T1m = Tv * T1l;
|
||||
T1s = Ty * T1l;
|
||||
T1o = Ty * T1n;
|
||||
T1r = Tv * T1n;
|
||||
T1p = T1m + T1o;
|
||||
T1t = T1r - T1s;
|
||||
T27 = T1r + T1s;
|
||||
T25 = T1m - T1o;
|
||||
Tz = W[5];
|
||||
TA = Ty * Tz;
|
||||
T1J = T1l * Tz;
|
||||
T15 = Tv * Tz;
|
||||
T1G = T1n * Tz;
|
||||
Tw = W[4];
|
||||
Tx = Tv * Tw;
|
||||
T1K = T1n * Tw;
|
||||
T16 = Ty * Tw;
|
||||
T1F = T1l * Tw;
|
||||
}
|
||||
TB = Tx - TA;
|
||||
T21 = T1J + T1K;
|
||||
T1P = T15 - T16;
|
||||
T1H = T1F + T1G;
|
||||
T1X = T1F - T1G;
|
||||
T17 = T15 + T16;
|
||||
T1L = T1J - T1K;
|
||||
T1N = Tx + TA;
|
||||
T1v = W[6];
|
||||
T1w = W[7];
|
||||
T1x = FMA(Tv, T1v, Ty * T1w);
|
||||
T1B = FNMS(Ty, T1v, Tv * T1w);
|
||||
{
|
||||
E T2D, T2E, T29, T2a;
|
||||
T2D = T25 * Tz;
|
||||
T2E = T27 * Tw;
|
||||
T2F = T2D + T2E;
|
||||
T2T = T2D - T2E;
|
||||
T29 = T25 * Tw;
|
||||
T2a = T27 * Tz;
|
||||
T2b = T29 - T2a;
|
||||
T2R = T29 + T2a;
|
||||
}
|
||||
{
|
||||
E T3h, T3i, T33, T34;
|
||||
T3h = T1p * Tz;
|
||||
T3i = T1t * Tw;
|
||||
T3j = T3h + T3i;
|
||||
T3x = T3h - T3i;
|
||||
T33 = T1p * Tw;
|
||||
T34 = T1t * Tz;
|
||||
T35 = T33 - T34;
|
||||
T3t = T33 + T34;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
|
||||
E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
|
||||
E T3e, T3o;
|
||||
{
|
||||
E T3, T2c, T1b, T2H, T6, T2G, T1e, T2d;
|
||||
{
|
||||
E T1, T2, T19, T1a;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
T2c = T1 - T2;
|
||||
T19 = Ip[0];
|
||||
T1a = Im[WS(rs, 7)];
|
||||
T1b = T19 - T1a;
|
||||
T2H = T19 + T1a;
|
||||
}
|
||||
{
|
||||
E T4, T5, T1c, T1d;
|
||||
T4 = Rp[WS(rs, 4)];
|
||||
T5 = Rm[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
T2G = T4 - T5;
|
||||
T1c = Ip[WS(rs, 4)];
|
||||
T1d = Im[WS(rs, 3)];
|
||||
T1e = T1c - T1d;
|
||||
T2d = T1c + T1d;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T36 = T2c + T2d;
|
||||
T3k = T2H - T2G;
|
||||
TC = T3 - T6;
|
||||
T1f = T1b - T1e;
|
||||
T2e = T2c - T2d;
|
||||
T2I = T2G + T2H;
|
||||
T1Q = T1b + T1e;
|
||||
}
|
||||
{
|
||||
E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
|
||||
{
|
||||
E T8, T9, TG, TH;
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = Rm[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
T2f = T8 - T9;
|
||||
TG = Ip[WS(rs, 2)];
|
||||
TH = Im[WS(rs, 5)];
|
||||
TI = TG - TH;
|
||||
T2g = TG + TH;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TD, TE;
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
Tc = Rp[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
T2i = Tb - Tc;
|
||||
TD = Ip[WS(rs, 6)];
|
||||
TE = Im[WS(rs, 1)];
|
||||
TF = TD - TE;
|
||||
T2j = TD + TE;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
TJ = TF - TI;
|
||||
T1R = TI + TF;
|
||||
T18 = Ta - Td;
|
||||
{
|
||||
E T2J, T2K, T2h, T2k;
|
||||
T2J = T2f + T2g;
|
||||
T2K = T2i + T2j;
|
||||
T2L = KP707106781 * (T2J - T2K);
|
||||
T37 = KP707106781 * (T2J + T2K);
|
||||
T2h = T2f - T2g;
|
||||
T2k = T2i - T2j;
|
||||
T2l = KP707106781 * (T2h + T2k);
|
||||
T3l = KP707106781 * (T2h - T2k);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Ti, T2x, TO, T2v, Tl, T2u, TR, T2y, TL, TS;
|
||||
{
|
||||
E Tg, Th, TM, TN;
|
||||
Tg = Rp[WS(rs, 1)];
|
||||
Th = Rm[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
T2x = Tg - Th;
|
||||
TM = Ip[WS(rs, 1)];
|
||||
TN = Im[WS(rs, 6)];
|
||||
TO = TM - TN;
|
||||
T2v = TM + TN;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, TP, TQ;
|
||||
Tj = Rp[WS(rs, 5)];
|
||||
Tk = Rm[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
T2u = Tj - Tk;
|
||||
TP = Ip[WS(rs, 5)];
|
||||
TQ = Im[WS(rs, 2)];
|
||||
TR = TP - TQ;
|
||||
T2y = TP + TQ;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T1T = TO + TR;
|
||||
TL = Ti - Tl;
|
||||
TS = TO - TR;
|
||||
TT = TL - TS;
|
||||
T1h = TL + TS;
|
||||
{
|
||||
E T2w, T2z, T39, T3a;
|
||||
T2w = T2u + T2v;
|
||||
T2z = T2x - T2y;
|
||||
T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
|
||||
T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
|
||||
T39 = T2x + T2y;
|
||||
T3a = T2v - T2u;
|
||||
T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
|
||||
T3n = FMA(KP382683432, T3a, KP923879532 * T39);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tp, T2q, TX, T2o, Ts, T2n, T10, T2r, TU, T11;
|
||||
{
|
||||
E Tn, To, TV, TW;
|
||||
Tn = Rm[0];
|
||||
To = Rp[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T2q = Tn - To;
|
||||
TV = Ip[WS(rs, 7)];
|
||||
TW = Im[0];
|
||||
TX = TV - TW;
|
||||
T2o = TV + TW;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, TY, TZ;
|
||||
Tq = Rp[WS(rs, 3)];
|
||||
Tr = Rm[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
T2n = Tq - Tr;
|
||||
TY = Ip[WS(rs, 3)];
|
||||
TZ = Im[WS(rs, 4)];
|
||||
T10 = TY - TZ;
|
||||
T2r = TY + TZ;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T1U = TX + T10;
|
||||
TU = Tp - Ts;
|
||||
T11 = TX - T10;
|
||||
T12 = TU + T11;
|
||||
T1i = T11 - TU;
|
||||
{
|
||||
E T2p, T2s, T3c, T3d;
|
||||
T2p = T2n - T2o;
|
||||
T2s = T2q - T2r;
|
||||
T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
|
||||
T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
|
||||
T3c = T2q + T2r;
|
||||
T3d = T2n + T2o;
|
||||
T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
|
||||
T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tf, Tu, T1O, T1S, T1V, T1W;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
T1O = Tf - Tu;
|
||||
T1S = T1Q + T1R;
|
||||
T1V = T1T + T1U;
|
||||
T1W = T1S - T1V;
|
||||
Rp[0] = Tf + Tu;
|
||||
Rm[0] = T1S + T1V;
|
||||
Rp[WS(rs, 4)] = FNMS(T1P, T1W, T1N * T1O);
|
||||
Rm[WS(rs, 4)] = FMA(T1P, T1O, T1N * T1W);
|
||||
}
|
||||
{
|
||||
E T3g, T3r, T3q, T3s;
|
||||
{
|
||||
E T38, T3f, T3m, T3p;
|
||||
T38 = T36 - T37;
|
||||
T3f = T3b + T3e;
|
||||
T3g = T38 - T3f;
|
||||
T3r = T38 + T3f;
|
||||
T3m = T3k + T3l;
|
||||
T3p = T3n - T3o;
|
||||
T3q = T3m - T3p;
|
||||
T3s = T3m + T3p;
|
||||
}
|
||||
Ip[WS(rs, 5)] = FNMS(T3j, T3q, T35 * T3g);
|
||||
Im[WS(rs, 5)] = FMA(T3j, T3g, T35 * T3q);
|
||||
Ip[WS(rs, 1)] = FNMS(T1n, T3s, T1l * T3r);
|
||||
Im[WS(rs, 1)] = FMA(T1n, T3r, T1l * T3s);
|
||||
}
|
||||
{
|
||||
E T3w, T3B, T3A, T3C;
|
||||
{
|
||||
E T3u, T3v, T3y, T3z;
|
||||
T3u = T36 + T37;
|
||||
T3v = T3n + T3o;
|
||||
T3w = T3u - T3v;
|
||||
T3B = T3u + T3v;
|
||||
T3y = T3k - T3l;
|
||||
T3z = T3b - T3e;
|
||||
T3A = T3y + T3z;
|
||||
T3C = T3y - T3z;
|
||||
}
|
||||
Ip[WS(rs, 3)] = FNMS(T3x, T3A, T3t * T3w);
|
||||
Im[WS(rs, 3)] = FMA(T3t, T3A, T3x * T3w);
|
||||
Ip[WS(rs, 7)] = FNMS(T1w, T3C, T1v * T3B);
|
||||
Im[WS(rs, 7)] = FMA(T1v, T3C, T1w * T3B);
|
||||
}
|
||||
{
|
||||
E T14, T1q, T1k, T1u;
|
||||
{
|
||||
E TK, T13, T1g, T1j;
|
||||
TK = TC + TJ;
|
||||
T13 = KP707106781 * (TT + T12);
|
||||
T14 = TK - T13;
|
||||
T1q = TK + T13;
|
||||
T1g = T18 + T1f;
|
||||
T1j = KP707106781 * (T1h + T1i);
|
||||
T1k = T1g - T1j;
|
||||
T1u = T1g + T1j;
|
||||
}
|
||||
Rp[WS(rs, 5)] = FNMS(T17, T1k, TB * T14);
|
||||
Rm[WS(rs, 5)] = FMA(T17, T14, TB * T1k);
|
||||
Rp[WS(rs, 1)] = FNMS(T1t, T1u, T1p * T1q);
|
||||
Rm[WS(rs, 1)] = FMA(T1t, T1q, T1p * T1u);
|
||||
}
|
||||
{
|
||||
E T1A, T1I, T1E, T1M;
|
||||
{
|
||||
E T1y, T1z, T1C, T1D;
|
||||
T1y = TC - TJ;
|
||||
T1z = KP707106781 * (T1i - T1h);
|
||||
T1A = T1y - T1z;
|
||||
T1I = T1y + T1z;
|
||||
T1C = T1f - T18;
|
||||
T1D = KP707106781 * (TT - T12);
|
||||
T1E = T1C - T1D;
|
||||
T1M = T1C + T1D;
|
||||
}
|
||||
Rp[WS(rs, 7)] = FNMS(T1B, T1E, T1x * T1A);
|
||||
Rm[WS(rs, 7)] = FMA(T1x, T1E, T1B * T1A);
|
||||
Rp[WS(rs, 3)] = FNMS(T1L, T1M, T1H * T1I);
|
||||
Rm[WS(rs, 3)] = FMA(T1H, T1M, T1L * T1I);
|
||||
}
|
||||
{
|
||||
E T2C, T2S, T2Q, T2U;
|
||||
{
|
||||
E T2m, T2B, T2M, T2P;
|
||||
T2m = T2e - T2l;
|
||||
T2B = T2t - T2A;
|
||||
T2C = T2m - T2B;
|
||||
T2S = T2m + T2B;
|
||||
T2M = T2I - T2L;
|
||||
T2P = T2N - T2O;
|
||||
T2Q = T2M - T2P;
|
||||
T2U = T2M + T2P;
|
||||
}
|
||||
Ip[WS(rs, 6)] = FNMS(T2F, T2Q, T2b * T2C);
|
||||
Im[WS(rs, 6)] = FMA(T2F, T2C, T2b * T2Q);
|
||||
Ip[WS(rs, 2)] = FNMS(T2T, T2U, T2R * T2S);
|
||||
Im[WS(rs, 2)] = FMA(T2T, T2S, T2R * T2U);
|
||||
}
|
||||
{
|
||||
E T2X, T31, T30, T32;
|
||||
{
|
||||
E T2V, T2W, T2Y, T2Z;
|
||||
T2V = T2e + T2l;
|
||||
T2W = T2N + T2O;
|
||||
T2X = T2V - T2W;
|
||||
T31 = T2V + T2W;
|
||||
T2Y = T2I + T2L;
|
||||
T2Z = T2A + T2t;
|
||||
T30 = T2Y - T2Z;
|
||||
T32 = T2Y + T2Z;
|
||||
}
|
||||
Ip[WS(rs, 4)] = FNMS(Tz, T30, Tw * T2X);
|
||||
Im[WS(rs, 4)] = FMA(Tw, T30, Tz * T2X);
|
||||
Ip[0] = FNMS(Ty, T32, Tv * T31);
|
||||
Im[0] = FMA(Tv, T32, Ty * T31);
|
||||
}
|
||||
{
|
||||
E T20, T26, T24, T28;
|
||||
{
|
||||
E T1Y, T1Z, T22, T23;
|
||||
T1Y = T7 - Te;
|
||||
T1Z = T1U - T1T;
|
||||
T20 = T1Y - T1Z;
|
||||
T26 = T1Y + T1Z;
|
||||
T22 = T1Q - T1R;
|
||||
T23 = Tm - Tt;
|
||||
T24 = T22 - T23;
|
||||
T28 = T23 + T22;
|
||||
}
|
||||
Rp[WS(rs, 6)] = FNMS(T21, T24, T1X * T20);
|
||||
Rm[WS(rs, 6)] = FMA(T1X, T24, T21 * T20);
|
||||
Rp[WS(rs, 2)] = FNMS(T27, T28, T25 * T26);
|
||||
Rm[WS(rs, 2)] = FMA(T25, T28, T27 * T26);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table for the hc2cb2_16 codelet; it is referenced
 * by the descriptor `desc` defined right after it.  The table holds four
 * TW_CEXP entries whose last arguments are 1, 3, 9 and 15, followed by a
 * TW_NEXT entry that closes the list.
 * NOTE(review): the TW_* opcodes are defined outside this file.  Each
 * TW_CEXP presumably stores one complex exponential as two values, which
 * would match the 8 values of W consumed per iteration; the indices
 * 1, 3, 9, 15 equal 2k+1 for the outputs Ip/Im[k], k = 0, 1, 4, 7, that
 * use the loaded pairs directly -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor registered together with the hc2cb2_16 kernel.  As
 * initialised here it carries: the number 16, the name string
 * "hc2cb2_16", the twiddle table `twinstr`, the address of GENUS
 * (declared outside this file) and the four numbers { 156, 68, 40, 0 }.
 * The first three equal the operation counts quoted in the generator
 * comment of this variant (156 additions, 68 multiplications, 40 fused
 * multiply/add); the role of the trailing 0 is not visible in this file.
 */
static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, { 156, 68, 40, 0 } };
|
||||
|
||||
/*
 * Registration entry point of this codelet: hands the planner `p`, the
 * hc2cb2_16 kernel, its descriptor `desc` and the constant HC2C_VIA_RDFT
 * to X(khc2c_register).
 * X() and HC2C_VIA_RDFT are defined outside this file (X() presumably
 * adds the library's symbol prefix -- confirm).
 */
void X(codelet_hc2cb2_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
1087
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_20.c
Normal file
1087
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1882
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_32.c
Normal file
1882
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_32.c
Normal file
File diff suppressed because it is too large
Load Diff
194
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_4.c
Normal file
194
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_4.c
Normal file
@@ -0,0 +1,194 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:09 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hc2cb2_4 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 33 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb2_4, variant compiled when FMA is preferred.
 *
 * Runs once for every m in [mb, me).  Each iteration reads Rp, Ip, Rm and Im
 * at offsets 0 and WS(rs, 1), combines the eight values with the four reals
 * W[0..3], and stores the eight results back into the same locations.
 * Between iterations Rp and Ip advance by ms, Rm and Im move back by ms, and
 * W advances by 4; W is first offset by (mb - 1) * 4.
 */
static void hc2cb2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T7, Tb, T8, Ta, Tc, Tg, T9, Tf;
|
||||
/* T7/Ta hold W[0]/W[1] and T8/Tb hold W[2]/W[3].  Tc and Tg are derived
   from them once per iteration; assuming FMA(a,b,c) = a*b + c and
   FNMS(a,b,c) = c - a*b (confirm macro definitions), Tc = W0*W2 + W1*W3
   and Tg = W0*W3 - W1*W2. */
T7 = W[0];
|
||||
Tb = W[3];
|
||||
T8 = W[2];
|
||||
T9 = T7 * T8;
|
||||
Tf = T7 * Tb;
|
||||
Ta = W[1];
|
||||
Tc = FMA(Ta, Tb, T9);
|
||||
Tg = FNMS(Ta, T8, Tf);
|
||||
{
|
||||
E T3, T6, Td, Tj, Tz, Tx, Tr, Tm, Tv, Ts, Tw, TA;
|
||||
{
|
||||
E Th, Ti, Tu, Tp, Tk, Tl, Tq, Tt;
|
||||
{
|
||||
E T1, T2, T4, T5;
|
||||
/* Load all eight inputs and form pairwise sums and differences. */
Th = Ip[0];
|
||||
Ti = Im[WS(rs, 1)];
|
||||
Tu = Th + Ti;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 1)];
|
||||
T3 = T1 + T2;
|
||||
Tp = T1 - T2;
|
||||
Tk = Ip[WS(rs, 1)];
|
||||
Tl = Im[0];
|
||||
Tq = Tk + Tl;
|
||||
T4 = Rp[WS(rs, 1)];
|
||||
T5 = Rm[0];
|
||||
T6 = T4 + T5;
|
||||
Tt = T4 - T5;
|
||||
}
|
||||
Td = T3 - T6;
|
||||
Tj = Th - Ti;
|
||||
Tz = Tu - Tt;
|
||||
Tx = Tp + Tq;
|
||||
Tr = Tp - Tq;
|
||||
Tm = Tk - Tl;
|
||||
Tv = Tt + Tu;
|
||||
}
|
||||
/* Rp[0] and Rm[0] are plain sums; every other output is multiplied by a
   W-derived factor before being stored. */
Rp[0] = T3 + T6;
|
||||
Rm[0] = Tj + Tm;
|
||||
Ts = T7 * Tr;
|
||||
Ip[0] = FNMS(Ta, Tv, Ts);
|
||||
Tw = T7 * Tv;
|
||||
Im[0] = FMA(Ta, Tr, Tw);
|
||||
TA = T8 * Tz;
|
||||
Im[WS(rs, 1)] = FMA(Tb, Tx, TA);
|
||||
{
|
||||
E Ty, Te, To, Tn;
|
||||
Ty = T8 * Tx;
|
||||
Ip[WS(rs, 1)] = FNMS(Tb, Tz, Ty);
|
||||
Te = Tc * Td;
|
||||
To = Tg * Td;
|
||||
Tn = Tj - Tm;
|
||||
Rp[WS(rs, 1)] = FNMS(Tg, Tn, Te);
|
||||
Rm[WS(rs, 1)] = FMA(Tc, Tn, To);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for hc2cb2_4: two TW_CEXP entries (last field 1
   and 3) followed by a final TW_NEXT entry.  The kernel reads four reals of W
   per iteration -- presumably two reals per TW_CEXP entry; confirm against
   the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA variant of hc2cb2_4.  The leading 4 matches the
   generator's "-n 4" option, and { 16, 8, 8, 0 } matches the 16 additions,
   8 multiplications and 8 fused multiply/adds reported in the generated
   header comment (meaning of the final 0 not visible here). */
static const hc2c_desc desc = { 4, "hc2cb2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hc2cb2_4 kernel and its descriptor to
   planner p through khc2c_register, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb2_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cb2_4, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hc2cb2_4 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb2_4, variant compiled when FMA is not preferred.
 *
 * Runs once for every m in [mb, me).  Each iteration reads Rp, Ip, Rm and Im
 * at offsets 0 and WS(rs, 1), combines the eight values with the four reals
 * W[0..3], and stores the eight results back into the same locations.
 * Between iterations Rp and Ip advance by ms, Rm and Im move back by ms, and
 * W advances by 4; W is first offset by (mb - 1) * 4.
 */
static void hc2cb2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T7, T9, T8, Ta, Tb, Td;
|
||||
/* T7/T9 hold W[0]/W[1] and T8/Ta hold W[2]/W[3].  Tb and Td are derived
   from them once per iteration; assuming FMA(a,b,c) = a*b + c and
   FNMS(a,b,c) = c - a*b (confirm macro definitions), Tb = W0*W2 + W1*W3
   and Td = W0*W3 - W1*W2. */
T7 = W[0];
|
||||
T9 = W[1];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
Td = FNMS(T9, T8, T7 * Ta);
|
||||
{
|
||||
E T3, Tl, Tg, Tp, T6, To, Tj, Tm, Tc, Tk;
|
||||
{
|
||||
E T1, T2, Te, Tf;
|
||||
/* Inputs Rp[0], Rm[1], Ip[0], Im[1]: sums and differences. */
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 1)];
|
||||
T3 = T1 + T2;
|
||||
Tl = T1 - T2;
|
||||
Te = Ip[0];
|
||||
Tf = Im[WS(rs, 1)];
|
||||
Tg = Te - Tf;
|
||||
Tp = Te + Tf;
|
||||
}
|
||||
{
|
||||
E T4, T5, Th, Ti;
|
||||
/* Inputs Rp[1], Rm[0], Ip[1], Im[0]: sums and differences. */
T4 = Rp[WS(rs, 1)];
|
||||
T5 = Rm[0];
|
||||
T6 = T4 + T5;
|
||||
To = T4 - T5;
|
||||
Th = Ip[WS(rs, 1)];
|
||||
Ti = Im[0];
|
||||
Tj = Th - Ti;
|
||||
Tm = Th + Ti;
|
||||
}
|
||||
/* Rp[0] and Rm[0] are plain sums; every other output is multiplied by a
   W-derived factor before being stored. */
Rp[0] = T3 + T6;
|
||||
Rm[0] = Tg + Tj;
|
||||
Tc = T3 - T6;
|
||||
Tk = Tg - Tj;
|
||||
Rp[WS(rs, 1)] = FNMS(Td, Tk, Tb * Tc);
|
||||
Rm[WS(rs, 1)] = FMA(Td, Tc, Tb * Tk);
|
||||
{
|
||||
E Tn, Tq, Tr, Ts;
|
||||
Tn = Tl - Tm;
|
||||
Tq = To + Tp;
|
||||
Ip[0] = FNMS(T9, Tq, T7 * Tn);
|
||||
Im[0] = FMA(T7, Tq, T9 * Tn);
|
||||
Tr = Tl + Tm;
|
||||
Ts = Tp - To;
|
||||
Ip[WS(rs, 1)] = FNMS(Ta, Ts, T8 * Tr);
|
||||
Im[WS(rs, 1)] = FMA(T8, Ts, Ta * Tr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for hc2cb2_4: two TW_CEXP entries (last field 1
   and 3) followed by a final TW_NEXT entry.  The kernel reads four reals of W
   per iteration -- presumably two reals per TW_CEXP entry; confirm against
   the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA variant of hc2cb2_4.  The leading 4 matches the
   generator's "-n 4" option, and { 16, 8, 8, 0 } matches the 16 additions,
   8 multiplications and 8 fused multiply/adds reported in the generated
   header comment (meaning of the final 0 not visible here). */
static const hc2c_desc desc = { 4, "hc2cb2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hc2cb2_4 kernel and its descriptor to
   planner p through khc2c_register, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb2_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cb2_4, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
387
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_8.c
Normal file
387
fftw-3.3.10/rdft/scalar/r2cb/hc2cb2_8.c
Normal file
@@ -0,0 +1,387 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:09 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hc2cb2_8 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 50 FP multiplications,
|
||||
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
|
||||
* 47 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb2_8, variant compiled when FMA is preferred.
 *
 * Runs once for every m in [mb, me).  Each iteration reads Rp, Ip, Rm and Im
 * at offsets 0 and WS(rs, 1..3), combines the sixteen values with the six
 * reals W[0..5], and stores the sixteen results back into the same
 * locations.  Between iterations Rp and Ip advance by ms, Rm and Im move
 * back by ms, and W advances by 6; W is first offset by (mb - 1) * 6.
 */
static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E Tf, Tg, Tl, Tp, Ti, Tj, Tk, T1b, T1u, T1e, T1o, To, Tq, TK;
|
||||
{
|
||||
E Th, T1n, T1t, Tn, Tm, TJ;
|
||||
/* Tf/Ti = W[0]/W[1], Tg/Tj = W[2]/W[3], Tl/Tp = W[4]/W[5].  The other
   multipliers used at the stores (Tk, To, T1b, T1e, T1o, T1u, Tq, TK)
   are products of these, computed once per iteration. */
Tf = W[0];
|
||||
Tg = W[2];
|
||||
Th = Tf * Tg;
|
||||
Tl = W[4];
|
||||
T1n = Tf * Tl;
|
||||
Tp = W[5];
|
||||
T1t = Tf * Tp;
|
||||
Ti = W[1];
|
||||
Tj = W[3];
|
||||
Tn = Tf * Tj;
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
T1b = FNMS(Ti, Tj, Th);
|
||||
T1u = FNMS(Ti, Tl, T1t);
|
||||
T1e = FMA(Ti, Tg, Tn);
|
||||
T1o = FMA(Ti, Tp, T1n);
|
||||
Tm = Tk * Tl;
|
||||
TJ = Tk * Tp;
|
||||
To = FNMS(Ti, Tg, Tn);
|
||||
Tq = FMA(To, Tp, Tm);
|
||||
TK = FNMS(To, Tl, TJ);
|
||||
}
|
||||
{
|
||||
E T7, T1p, T1v, Tv, TP, T13, T1h, TZ, Te, T1k, T1w, T1q, TQ, TR, T10;
|
||||
E TG, T14;
|
||||
{
|
||||
E T3, Tr, TO, T1f, T6, TL, Tu, T1g;
|
||||
{
|
||||
E T1, T2, TM, TN;
|
||||
/* First half of the loads: Rp/Ip at offsets 0 and 2, Rm/Im at 3 and 1. */
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 3)];
|
||||
T3 = T1 + T2;
|
||||
Tr = T1 - T2;
|
||||
TM = Ip[0];
|
||||
TN = Im[WS(rs, 3)];
|
||||
TO = TM + TN;
|
||||
T1f = TM - TN;
|
||||
}
|
||||
{
|
||||
E T4, T5, Ts, Tt;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 1)];
|
||||
T6 = T4 + T5;
|
||||
TL = T4 - T5;
|
||||
Ts = Ip[WS(rs, 2)];
|
||||
Tt = Im[WS(rs, 1)];
|
||||
Tu = Ts + Tt;
|
||||
T1g = Ts - Tt;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1p = T3 - T6;
|
||||
T1v = T1f - T1g;
|
||||
Tv = Tr - Tu;
|
||||
TP = TL + TO;
|
||||
T13 = TO - TL;
|
||||
T1h = T1f + T1g;
|
||||
TZ = Tr + Tu;
|
||||
}
|
||||
{
|
||||
E Ta, Tw, Tz, T1i, Td, TB, TE, T1j, TA, TF;
|
||||
{
|
||||
E T8, T9, Tx, Ty;
|
||||
/* Second half of the loads: Rp/Ip at offsets 1 and 3, Rm/Im at 2 and 0. */
T8 = Rp[WS(rs, 1)];
|
||||
T9 = Rm[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
Tw = T8 - T9;
|
||||
Tx = Ip[WS(rs, 1)];
|
||||
Ty = Im[WS(rs, 2)];
|
||||
Tz = Tx + Ty;
|
||||
T1i = Tx - Ty;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TC, TD;
|
||||
Tb = Rm[0];
|
||||
Tc = Rp[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
TB = Tb - Tc;
|
||||
TC = Ip[WS(rs, 3)];
|
||||
TD = Im[0];
|
||||
TE = TC + TD;
|
||||
T1j = TC - TD;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T1k = T1i + T1j;
|
||||
T1w = Ta - Td;
|
||||
T1q = T1j - T1i;
|
||||
TQ = Tw + Tz;
|
||||
TR = TB + TE;
|
||||
T10 = TQ + TR;
|
||||
TA = Tw - Tz;
|
||||
TF = TB - TE;
|
||||
TG = TA + TF;
|
||||
T14 = TA - TF;
|
||||
}
|
||||
/* Rp[0] and Rm[0] are plain sums; each remaining output pair below is
   multiplied by one pair of W-derived factors. */
Rp[0] = T7 + Te;
|
||||
Rm[0] = T1h + T1k;
|
||||
{
|
||||
E T11, T12, T15, T16;
|
||||
T11 = FNMS(KP707106781, T10, TZ);
|
||||
T12 = Tg * T11;
|
||||
T15 = FMA(KP707106781, T14, T13);
|
||||
T16 = Tg * T15;
|
||||
Ip[WS(rs, 1)] = FNMS(Tj, T15, T12);
|
||||
Im[WS(rs, 1)] = FMA(Tj, T11, T16);
|
||||
}
|
||||
{
|
||||
E T1z, T1A, T1B, T1C;
|
||||
T1z = T1p + T1q;
|
||||
T1A = Tk * T1z;
|
||||
T1B = T1w + T1v;
|
||||
T1C = Tk * T1B;
|
||||
Rp[WS(rs, 1)] = FNMS(To, T1B, T1A);
|
||||
Rm[WS(rs, 1)] = FMA(To, T1z, T1C);
|
||||
}
|
||||
{
|
||||
E T17, T18, T19, T1a;
|
||||
T17 = FMA(KP707106781, T10, TZ);
|
||||
T18 = Tl * T17;
|
||||
T19 = FNMS(KP707106781, T14, T13);
|
||||
T1a = Tl * T19;
|
||||
Ip[WS(rs, 3)] = FNMS(Tp, T19, T18);
|
||||
Im[WS(rs, 3)] = FMA(Tp, T17, T1a);
|
||||
}
|
||||
{
|
||||
E T1l, T1d, T1m, T1c;
|
||||
T1l = T1h - T1k;
|
||||
T1c = T7 - Te;
|
||||
T1d = T1b * T1c;
|
||||
T1m = T1e * T1c;
|
||||
Rp[WS(rs, 2)] = FNMS(T1e, T1l, T1d);
|
||||
Rm[WS(rs, 2)] = FMA(T1b, T1l, T1m);
|
||||
}
|
||||
{
|
||||
E T1r, T1s, T1x, T1y;
|
||||
T1r = T1p - T1q;
|
||||
T1s = T1o * T1r;
|
||||
T1x = T1v - T1w;
|
||||
T1y = T1o * T1x;
|
||||
Rp[WS(rs, 3)] = FNMS(T1u, T1x, T1s);
|
||||
Rm[WS(rs, 3)] = FMA(T1u, T1r, T1y);
|
||||
}
|
||||
{
|
||||
E TT, TX, TW, TY, TI, TU, TS, TV, TH;
|
||||
TS = TQ - TR;
|
||||
TT = FNMS(KP707106781, TS, TP);
|
||||
TX = FMA(KP707106781, TS, TP);
|
||||
TV = FMA(KP707106781, TG, Tv);
|
||||
TW = Tf * TV;
|
||||
TY = Ti * TV;
|
||||
TH = FNMS(KP707106781, TG, Tv);
|
||||
TI = Tq * TH;
|
||||
TU = TK * TH;
|
||||
Ip[WS(rs, 2)] = FNMS(TK, TT, TI);
|
||||
Im[WS(rs, 2)] = FMA(Tq, TT, TU);
|
||||
Ip[0] = FNMS(Ti, TX, TW);
|
||||
Im[0] = FMA(Tf, TX, TY);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for hc2cb2_8: three TW_CEXP entries (last field
   1, 3 and 7) followed by a final TW_NEXT entry.  The kernel reads six reals
   of W per iteration -- presumably two reals per TW_CEXP entry; confirm
   against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA variant of hc2cb2_8.  The leading 8 matches the
   generator's "-n 8" option, and { 44, 20, 30, 0 } matches the 44 additions,
   20 multiplications and 30 fused multiply/adds reported in the generated
   header comment (meaning of the final 0 not visible here). */
static const hc2c_desc desc = { 8, "hc2cb2_8", twinstr, &GENUS, { 44, 20, 30, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hc2cb2_8 kernel and its descriptor to
   planner p through khc2c_register, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb2_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cb2_8, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hc2cb2_8 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 44 FP multiplications,
|
||||
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
|
||||
* 46 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb2_8, variant compiled when FMA is not preferred.
 *
 * Runs once for every m in [mb, me).  Each iteration reads Rp, Ip, Rm and Im
 * at offsets 0 and WS(rs, 1..3), combines the sixteen values with the six
 * reals W[0..5], and stores the sixteen results back into the same
 * locations.  Between iterations Rp and Ip advance by ms, Rm and Im move
 * back by ms, and W advances by 6; W is first offset by (mb - 1) * 6.
 */
static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E Tf, Ti, Tg, Tj, Tl, Tp, TP, TR, TF, TG, TH, T15, TL, TT;
|
||||
{
|
||||
E Th, To, Tk, Tn;
|
||||
/* Tf/Ti = W[0]/W[1], Tg/Tj = W[2]/W[3], TF/TG = W[4]/W[5].  The other
   multipliers used at the stores (Tl, Tp, TP, TR, TH, TL, TT, T15) are
   products of these, computed once per iteration. */
Tf = W[0];
|
||||
Ti = W[1];
|
||||
Tg = W[2];
|
||||
Tj = W[3];
|
||||
Th = Tf * Tg;
|
||||
To = Ti * Tg;
|
||||
Tk = Ti * Tj;
|
||||
Tn = Tf * Tj;
|
||||
Tl = Th - Tk;
|
||||
Tp = Tn + To;
|
||||
TP = Th + Tk;
|
||||
TR = Tn - To;
|
||||
TF = W[4];
|
||||
TG = W[5];
|
||||
TH = FMA(Tf, TF, Ti * TG);
|
||||
T15 = FNMS(TR, TF, TP * TG);
|
||||
TL = FNMS(Ti, TF, Tf * TG);
|
||||
TT = FMA(TP, TF, TR * TG);
|
||||
}
|
||||
{
|
||||
E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
|
||||
E TN, Tm, TE;
|
||||
{
|
||||
E T3, TU, Ts, T17, T6, T16, Tv, TV;
|
||||
{
|
||||
E T1, T2, Tq, Tr;
|
||||
/* First half of the loads: Rp/Ip at offsets 0 and 2, Rm/Im at 3 and 1. */
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 3)];
|
||||
T3 = T1 + T2;
|
||||
TU = T1 - T2;
|
||||
Tq = Ip[0];
|
||||
Tr = Im[WS(rs, 3)];
|
||||
Ts = Tq - Tr;
|
||||
T17 = Tq + Tr;
|
||||
}
|
||||
{
|
||||
E T4, T5, Tt, Tu;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 1)];
|
||||
T6 = T4 + T5;
|
||||
T16 = T4 - T5;
|
||||
Tt = Ip[WS(rs, 2)];
|
||||
Tu = Im[WS(rs, 1)];
|
||||
Tv = Tt - Tu;
|
||||
TV = Tt + Tu;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1f = TU + TV;
|
||||
T1i = T17 - T16;
|
||||
Tw = Ts + Tv;
|
||||
TI = T3 - T6;
|
||||
TW = TU - TV;
|
||||
T18 = T16 + T17;
|
||||
TM = Ts - Tv;
|
||||
}
|
||||
{
|
||||
E Ta, TX, Tz, TY, Td, T10, TC, T11;
|
||||
{
|
||||
E T8, T9, Tx, Ty;
|
||||
/* Second half of the loads: Rp/Ip at offsets 1 and 3, Rm/Im at 2 and 0. */
T8 = Rp[WS(rs, 1)];
|
||||
T9 = Rm[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
TX = T8 - T9;
|
||||
Tx = Ip[WS(rs, 1)];
|
||||
Ty = Im[WS(rs, 2)];
|
||||
Tz = Tx - Ty;
|
||||
TY = Tx + Ty;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TA, TB;
|
||||
Tb = Rm[0];
|
||||
Tc = Rp[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
T10 = Tb - Tc;
|
||||
TA = Ip[WS(rs, 3)];
|
||||
TB = Im[0];
|
||||
TC = TA - TB;
|
||||
T11 = TA + TB;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T19 = TX + TY;
|
||||
T1a = T10 + T11;
|
||||
TD = Tz + TC;
|
||||
TJ = TC - Tz;
|
||||
TZ = TX - TY;
|
||||
T12 = T10 - T11;
|
||||
TN = Ta - Td;
|
||||
}
|
||||
/* Rp[0] and Rm[0] are plain sums; each remaining output pair below is
   multiplied by one pair of W-derived factors. */
Rp[0] = T7 + Te;
|
||||
Rm[0] = Tw + TD;
|
||||
Tm = T7 - Te;
|
||||
TE = Tw - TD;
|
||||
Rp[WS(rs, 2)] = FNMS(Tp, TE, Tl * Tm);
|
||||
Rm[WS(rs, 2)] = FMA(Tp, Tm, Tl * TE);
|
||||
{
|
||||
E TQ, TS, TK, TO;
|
||||
TQ = TI + TJ;
|
||||
TS = TN + TM;
|
||||
Rp[WS(rs, 1)] = FNMS(TR, TS, TP * TQ);
|
||||
Rm[WS(rs, 1)] = FMA(TP, TS, TR * TQ);
|
||||
TK = TI - TJ;
|
||||
TO = TM - TN;
|
||||
Rp[WS(rs, 3)] = FNMS(TL, TO, TH * TK);
|
||||
Rm[WS(rs, 3)] = FMA(TH, TO, TL * TK);
|
||||
}
|
||||
{
|
||||
E T1h, T1l, T1k, T1m, T1g, T1j;
|
||||
T1g = KP707106781 * (T19 + T1a);
|
||||
T1h = T1f - T1g;
|
||||
T1l = T1f + T1g;
|
||||
T1j = KP707106781 * (TZ - T12);
|
||||
T1k = T1i + T1j;
|
||||
T1m = T1i - T1j;
|
||||
Ip[WS(rs, 1)] = FNMS(Tj, T1k, Tg * T1h);
|
||||
Im[WS(rs, 1)] = FMA(Tg, T1k, Tj * T1h);
|
||||
Ip[WS(rs, 3)] = FNMS(TG, T1m, TF * T1l);
|
||||
Im[WS(rs, 3)] = FMA(TF, T1m, TG * T1l);
|
||||
}
|
||||
{
|
||||
E T14, T1d, T1c, T1e, T13, T1b;
|
||||
T13 = KP707106781 * (TZ + T12);
|
||||
T14 = TW - T13;
|
||||
T1d = TW + T13;
|
||||
T1b = KP707106781 * (T19 - T1a);
|
||||
T1c = T18 - T1b;
|
||||
T1e = T18 + T1b;
|
||||
Ip[WS(rs, 2)] = FNMS(T15, T1c, TT * T14);
|
||||
Im[WS(rs, 2)] = FMA(T15, T14, TT * T1c);
|
||||
Ip[0] = FNMS(Ti, T1e, Tf * T1d);
|
||||
Im[0] = FMA(Ti, T1d, Tf * T1e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for hc2cb2_8: three TW_CEXP entries (last field
   1, 3 and 7) followed by a final TW_NEXT entry.  The kernel reads six reals
   of W per iteration -- presumably two reals per TW_CEXP entry; confirm
   against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA variant of hc2cb2_8.  The leading 8 matches the
   generator's "-n 8" option, and { 56, 26, 18, 0 } matches the 56 additions,
   26 multiplications and 18 fused multiply/adds reported in the generated
   header comment (meaning of the final 0 not visible here). */
static const hc2c_desc desc = { 8, "hc2cb2_8", twinstr, &GENUS, { 56, 26, 18, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hc2cb2_8 kernel and its descriptor to
   planner p through khc2c_register, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb2_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cb2_8, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
513
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_10.c
Normal file
513
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_10.c
Normal file
@@ -0,0 +1,513 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cb_10 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 72 FP multiplications,
|
||||
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
|
||||
* 47 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_10, variant compiled when FMA is preferred.
 *
 * Runs once for every m in [mb, me).  Each iteration reads Rp, Ip, Rm and Im
 * at offsets 0 and WS(rs, 1..4), and stores twenty results back into the
 * same locations.  Rp[0] and Rm[0] are stored as plain sums; every other
 * output pair is multiplied by one consecutive pair W[2j], W[2j+1] taken
 * from W[0..17], read at the point of use.  Between iterations Rp and Ip
 * advance by ms, Rm and Im move back by ms, and W advances by 18; W is
 * first offset by (mb - 1) * 18.
 */
static void hc2cb_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
E TH, T1B, TB, T11, T1E, T1G, TK, TM, T1x, T1V, T3, T1g, Tl, T1I, T1J;
|
||||
E TO, TP, T1p, Ti, Tk, T1n, T1o, TF, TG;
|
||||
/* Loads from Ip/Im and their sums and differences. */
TF = Ip[0];
|
||||
TG = Im[WS(rs, 4)];
|
||||
TH = TF - TG;
|
||||
T1B = TF + TG;
|
||||
{
|
||||
E Tp, T1u, Tz, T1s, Ts, T1v, Tw, T1r;
|
||||
{
|
||||
E Tn, To, Tx, Ty;
|
||||
Tn = Ip[WS(rs, 4)];
|
||||
To = Im[0];
|
||||
Tp = Tn - To;
|
||||
T1u = Tn + To;
|
||||
Tx = Ip[WS(rs, 3)];
|
||||
Ty = Im[WS(rs, 1)];
|
||||
Tz = Tx - Ty;
|
||||
T1s = Tx + Ty;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, Tu, Tv;
|
||||
Tq = Ip[WS(rs, 1)];
|
||||
Tr = Im[WS(rs, 3)];
|
||||
Ts = Tq - Tr;
|
||||
T1v = Tq + Tr;
|
||||
Tu = Ip[WS(rs, 2)];
|
||||
Tv = Im[WS(rs, 2)];
|
||||
Tw = Tu - Tv;
|
||||
T1r = Tu + Tv;
|
||||
}
|
||||
{
|
||||
E Tt, TA, T1C, T1D;
|
||||
Tt = Tp - Ts;
|
||||
TA = Tw - Tz;
|
||||
TB = FNMS(KP618033988, TA, Tt);
|
||||
T11 = FMA(KP618033988, Tt, TA);
|
||||
T1C = T1r - T1s;
|
||||
T1D = T1u - T1v;
|
||||
T1E = T1C + T1D;
|
||||
T1G = T1C - T1D;
|
||||
}
|
||||
{
|
||||
E TI, TJ, T1t, T1w;
|
||||
TI = Tw + Tz;
|
||||
TJ = Tp + Ts;
|
||||
TK = TI + TJ;
|
||||
TM = TI - TJ;
|
||||
T1t = T1r + T1s;
|
||||
T1w = T1u + T1v;
|
||||
T1x = FMA(KP618033988, T1w, T1t);
|
||||
T1V = FNMS(KP618033988, T1t, T1w);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Td, T1k, Tg, T1l, Th, T1m, T6, T1h, T9, T1i, Ta, T1j, T1, T2;
|
||||
/* Loads from Rp/Rm and their sums and differences. */
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 4)];
|
||||
T3 = T1 + T2;
|
||||
T1g = T1 - T2;
|
||||
{
|
||||
E Tb, Tc, Te, Tf;
|
||||
Tb = Rp[WS(rs, 4)];
|
||||
Tc = Rm[0];
|
||||
Td = Tb + Tc;
|
||||
T1k = Tb - Tc;
|
||||
Te = Rm[WS(rs, 3)];
|
||||
Tf = Rp[WS(rs, 1)];
|
||||
Tg = Te + Tf;
|
||||
T1l = Te - Tf;
|
||||
}
|
||||
Th = Td + Tg;
|
||||
T1m = T1k + T1l;
|
||||
{
|
||||
E T4, T5, T7, T8;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 2)];
|
||||
T6 = T4 + T5;
|
||||
T1h = T4 - T5;
|
||||
T7 = Rm[WS(rs, 1)];
|
||||
T8 = Rp[WS(rs, 3)];
|
||||
T9 = T7 + T8;
|
||||
T1i = T7 - T8;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
T1j = T1h + T1i;
|
||||
Tl = Ta - Th;
|
||||
T1I = T1h - T1i;
|
||||
T1J = T1k - T1l;
|
||||
TO = Td - Tg;
|
||||
TP = T6 - T9;
|
||||
T1p = T1j - T1m;
|
||||
Ti = Ta + Th;
|
||||
Tk = FNMS(KP250000000, Ti, T3);
|
||||
T1n = T1j + T1m;
|
||||
T1o = FNMS(KP250000000, T1n, T1g);
|
||||
}
|
||||
/* Untwiddled outputs. */
Rp[0] = T3 + Ti;
|
||||
Rm[0] = TH + TK;
|
||||
{
|
||||
E T2d, T29, T2b, T2c, T2e, T2a;
|
||||
T2d = T1B + T1E;
|
||||
T2a = T1g + T1n;
|
||||
T29 = W[8];
|
||||
T2b = T29 * T2a;
|
||||
T2c = W[9];
|
||||
T2e = T2c * T2a;
|
||||
Ip[WS(rs, 2)] = FNMS(T2c, T2d, T2b);
|
||||
Im[WS(rs, 2)] = FMA(T29, T2d, T2e);
|
||||
}
|
||||
{
|
||||
E TQ, T16, TC, TU, TN, T15, T12, T1a, Tm, TL, T10;
|
||||
/* Remaining Rp/Rm outputs (offsets 1..4). */
TQ = FNMS(KP618033988, TP, TO);
|
||||
T16 = FMA(KP618033988, TO, TP);
|
||||
Tm = FNMS(KP559016994, Tl, Tk);
|
||||
TC = FMA(KP951056516, TB, Tm);
|
||||
TU = FNMS(KP951056516, TB, Tm);
|
||||
TL = FNMS(KP250000000, TK, TH);
|
||||
TN = FNMS(KP559016994, TM, TL);
|
||||
T15 = FMA(KP559016994, TM, TL);
|
||||
T10 = FMA(KP559016994, Tl, Tk);
|
||||
T12 = FMA(KP951056516, T11, T10);
|
||||
T1a = FNMS(KP951056516, T11, T10);
|
||||
{
|
||||
E TR, TE, TS, Tj, TD;
|
||||
TR = FNMS(KP951056516, TQ, TN);
|
||||
TE = W[3];
|
||||
TS = TE * TC;
|
||||
Tj = W[2];
|
||||
TD = Tj * TC;
|
||||
Rp[WS(rs, 1)] = FNMS(TE, TR, TD);
|
||||
Rm[WS(rs, 1)] = FMA(Tj, TR, TS);
|
||||
}
|
||||
{
|
||||
E T1d, T1c, T1e, T19, T1b;
|
||||
T1d = FMA(KP951056516, T16, T15);
|
||||
T1c = W[11];
|
||||
T1e = T1c * T1a;
|
||||
T19 = W[10];
|
||||
T1b = T19 * T1a;
|
||||
Rp[WS(rs, 3)] = FNMS(T1c, T1d, T1b);
|
||||
Rm[WS(rs, 3)] = FMA(T19, T1d, T1e);
|
||||
}
|
||||
{
|
||||
E TX, TW, TY, TT, TV;
|
||||
TX = FMA(KP951056516, TQ, TN);
|
||||
TW = W[15];
|
||||
TY = TW * TU;
|
||||
TT = W[14];
|
||||
TV = TT * TU;
|
||||
Rp[WS(rs, 4)] = FNMS(TW, TX, TV);
|
||||
Rm[WS(rs, 4)] = FMA(TT, TX, TY);
|
||||
}
|
||||
{
|
||||
E T17, T14, T18, TZ, T13;
|
||||
T17 = FNMS(KP951056516, T16, T15);
|
||||
T14 = W[7];
|
||||
T18 = T14 * T12;
|
||||
TZ = W[6];
|
||||
T13 = TZ * T12;
|
||||
Rp[WS(rs, 2)] = FNMS(T14, T17, T13);
|
||||
Rm[WS(rs, 2)] = FMA(TZ, T17, T18);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1K, T20, T1y, T1O, T1H, T1Z, T1W, T24, T1q, T1F, T1U;
|
||||
/* Remaining Ip/Im outputs (offsets 0, 1, 3, 4). */
T1K = FMA(KP618033988, T1J, T1I);
|
||||
T20 = FNMS(KP618033988, T1I, T1J);
|
||||
T1q = FMA(KP559016994, T1p, T1o);
|
||||
T1y = FNMS(KP951056516, T1x, T1q);
|
||||
T1O = FMA(KP951056516, T1x, T1q);
|
||||
T1F = FNMS(KP250000000, T1E, T1B);
|
||||
T1H = FMA(KP559016994, T1G, T1F);
|
||||
T1Z = FNMS(KP559016994, T1G, T1F);
|
||||
T1U = FNMS(KP559016994, T1p, T1o);
|
||||
T1W = FNMS(KP951056516, T1V, T1U);
|
||||
T24 = FMA(KP951056516, T1V, T1U);
|
||||
{
|
||||
E T1L, T1A, T1M, T1f, T1z;
|
||||
T1L = FMA(KP951056516, T1K, T1H);
|
||||
T1A = W[1];
|
||||
T1M = T1A * T1y;
|
||||
T1f = W[0];
|
||||
T1z = T1f * T1y;
|
||||
Ip[0] = FNMS(T1A, T1L, T1z);
|
||||
Im[0] = FMA(T1f, T1L, T1M);
|
||||
}
|
||||
{
|
||||
E T27, T26, T28, T23, T25;
|
||||
T27 = FNMS(KP951056516, T20, T1Z);
|
||||
T26 = W[13];
|
||||
T28 = T26 * T24;
|
||||
T23 = W[12];
|
||||
T25 = T23 * T24;
|
||||
Ip[WS(rs, 3)] = FNMS(T26, T27, T25);
|
||||
Im[WS(rs, 3)] = FMA(T23, T27, T28);
|
||||
}
|
||||
{
|
||||
E T1R, T1Q, T1S, T1N, T1P;
|
||||
T1R = FNMS(KP951056516, T1K, T1H);
|
||||
T1Q = W[17];
|
||||
T1S = T1Q * T1O;
|
||||
T1N = W[16];
|
||||
T1P = T1N * T1O;
|
||||
Ip[WS(rs, 4)] = FNMS(T1Q, T1R, T1P);
|
||||
Im[WS(rs, 4)] = FMA(T1N, T1R, T1S);
|
||||
}
|
||||
{
|
||||
E T21, T1Y, T22, T1T, T1X;
|
||||
T21 = FMA(KP951056516, T20, T1Z);
|
||||
T1Y = W[5];
|
||||
T22 = T1Y * T1W;
|
||||
T1T = W[4];
|
||||
T1X = T1T * T1W;
|
||||
Ip[WS(rs, 1)] = FNMS(T1Y, T21, T1X);
|
||||
Im[WS(rs, 1)] = FMA(T1T, T21, T22);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for hc2cb_10: a single TW_FULL entry with last
   field 10, followed by a final TW_NEXT entry.  The kernel reads eighteen
   reals of W per iteration -- presumably nine complex twiddles; confirm
   against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA variant of hc2cb_10.  The leading 10 matches the
   generator's "-n 10" option, and { 48, 18, 54, 0 } matches the 48 additions,
   18 multiplications and 54 fused multiply/adds reported in the generated
   header comment (meaning of the final 0 not visible here). */
static const hc2c_desc desc = { 10, "hc2cb_10", twinstr, &GENUS, { 48, 18, 54, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hc2cb_10 kernel and its descriptor to
   planner p through khc2c_register, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cb_10, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cb_10 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 60 FP multiplications,
|
||||
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 39 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_10, variant compiled when FMA is not preferred.
 *
 * Runs once for every m in [mb, me).  Each iteration reads Rp, Ip, Rm and Im
 * at offsets 0 and WS(rs, 1..4), and stores twenty results back into the
 * same locations.  Rp[0] and Rm[0] are stored as plain sums; every other
 * output pair is multiplied by one consecutive pair W[2j], W[2j+1] taken
 * from W[0..17], read at the point of use.  Between iterations Rp and Ip
 * advance by ms, Rm and Im move back by ms, and W advances by 18; W is
 * first offset by (mb - 1) * 18.
 */
static void hc2cb_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
E T3, T18, TJ, T1i, TE, TF, T1B, T1A, T1f, T1t, Ti, Tl, Tt, TA, T1w;
|
||||
E T1v, T1p, T1E, TM, TO;
|
||||
{
|
||||
E T1, T2, TH, TI;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 4)];
|
||||
T3 = T1 + T2;
|
||||
T18 = T1 - T2;
|
||||
TH = Ip[0];
|
||||
TI = Im[WS(rs, 4)];
|
||||
TJ = TH - TI;
|
||||
T1i = TH + TI;
|
||||
}
|
||||
{
|
||||
E T6, T19, Tg, T1d, T9, T1a, Td, T1c;
|
||||
/* Remaining loads from Rp/Rm and their sums and differences. */
{
|
||||
E T4, T5, Te, Tf;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 2)];
|
||||
T6 = T4 + T5;
|
||||
T19 = T4 - T5;
|
||||
Te = Rm[WS(rs, 3)];
|
||||
Tf = Rp[WS(rs, 1)];
|
||||
Tg = Te + Tf;
|
||||
T1d = Te - Tf;
|
||||
}
|
||||
{
|
||||
E T7, T8, Tb, Tc;
|
||||
T7 = Rm[WS(rs, 1)];
|
||||
T8 = Rp[WS(rs, 3)];
|
||||
T9 = T7 + T8;
|
||||
T1a = T7 - T8;
|
||||
Tb = Rp[WS(rs, 4)];
|
||||
Tc = Rm[0];
|
||||
Td = Tb + Tc;
|
||||
T1c = Tb - Tc;
|
||||
}
|
||||
TE = T6 - T9;
|
||||
TF = Td - Tg;
|
||||
T1B = T1c - T1d;
|
||||
T1A = T19 - T1a;
|
||||
{
|
||||
E T1b, T1e, Ta, Th;
|
||||
T1b = T19 + T1a;
|
||||
T1e = T1c + T1d;
|
||||
T1f = T1b + T1e;
|
||||
T1t = KP559016994 * (T1b - T1e);
|
||||
Ta = T6 + T9;
|
||||
Th = Td + Tg;
|
||||
Ti = Ta + Th;
|
||||
Tl = KP559016994 * (Ta - Th);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tp, T1j, Tz, T1n, Ts, T1k, Tw, T1m;
|
||||
/* Remaining loads from Ip/Im and their sums and differences. */
{
|
||||
E Tn, To, Tx, Ty;
|
||||
Tn = Ip[WS(rs, 2)];
|
||||
To = Im[WS(rs, 2)];
|
||||
Tp = Tn - To;
|
||||
T1j = Tn + To;
|
||||
Tx = Ip[WS(rs, 1)];
|
||||
Ty = Im[WS(rs, 3)];
|
||||
Tz = Tx - Ty;
|
||||
T1n = Tx + Ty;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, Tu, Tv;
|
||||
Tq = Ip[WS(rs, 3)];
|
||||
Tr = Im[WS(rs, 1)];
|
||||
Ts = Tq - Tr;
|
||||
T1k = Tq + Tr;
|
||||
Tu = Ip[WS(rs, 4)];
|
||||
Tv = Im[0];
|
||||
Tw = Tu - Tv;
|
||||
T1m = Tu + Tv;
|
||||
}
|
||||
Tt = Tp - Ts;
|
||||
TA = Tw - Tz;
|
||||
T1w = T1m + T1n;
|
||||
T1v = T1j + T1k;
|
||||
{
|
||||
E T1l, T1o, TK, TL;
|
||||
T1l = T1j - T1k;
|
||||
T1o = T1m - T1n;
|
||||
T1p = T1l + T1o;
|
||||
T1E = KP559016994 * (T1l - T1o);
|
||||
TK = Tp + Ts;
|
||||
TL = Tw + Tz;
|
||||
TM = TK + TL;
|
||||
TO = KP559016994 * (TK - TL);
|
||||
}
|
||||
}
|
||||
/* Untwiddled outputs. */
Rp[0] = T3 + Ti;
|
||||
Rm[0] = TJ + TM;
|
||||
{
|
||||
E T1g, T1q, T17, T1h;
|
||||
T1g = T18 + T1f;
|
||||
T1q = T1i + T1p;
|
||||
T17 = W[8];
|
||||
T1h = W[9];
|
||||
Ip[WS(rs, 2)] = FNMS(T1h, T1q, T17 * T1g);
|
||||
Im[WS(rs, 2)] = FMA(T1h, T1g, T17 * T1q);
|
||||
}
|
||||
{
|
||||
E TB, TG, T11, TX, TP, T10, Tm, TW, TN, Tk;
|
||||
/* Remaining Rp/Rm outputs (offsets 1..4). */
TB = FNMS(KP951056516, TA, KP587785252 * Tt);
|
||||
TG = FNMS(KP951056516, TF, KP587785252 * TE);
|
||||
T11 = FMA(KP951056516, TE, KP587785252 * TF);
|
||||
TX = FMA(KP951056516, Tt, KP587785252 * TA);
|
||||
TN = FNMS(KP250000000, TM, TJ);
|
||||
TP = TN - TO;
|
||||
T10 = TO + TN;
|
||||
Tk = FNMS(KP250000000, Ti, T3);
|
||||
Tm = Tk - Tl;
|
||||
TW = Tl + Tk;
|
||||
{
|
||||
E TC, TQ, Tj, TD;
|
||||
TC = Tm - TB;
|
||||
TQ = TG + TP;
|
||||
Tj = W[2];
|
||||
TD = W[3];
|
||||
Rp[WS(rs, 1)] = FNMS(TD, TQ, Tj * TC);
|
||||
Rm[WS(rs, 1)] = FMA(TD, TC, Tj * TQ);
|
||||
}
|
||||
{
|
||||
E T14, T16, T13, T15;
|
||||
T14 = TW - TX;
|
||||
T16 = T11 + T10;
|
||||
T13 = W[10];
|
||||
T15 = W[11];
|
||||
Rp[WS(rs, 3)] = FNMS(T15, T16, T13 * T14);
|
||||
Rm[WS(rs, 3)] = FMA(T15, T14, T13 * T16);
|
||||
}
|
||||
{
|
||||
E TS, TU, TR, TT;
|
||||
TS = Tm + TB;
|
||||
TU = TP - TG;
|
||||
TR = W[14];
|
||||
TT = W[15];
|
||||
Rp[WS(rs, 4)] = FNMS(TT, TU, TR * TS);
|
||||
Rm[WS(rs, 4)] = FMA(TT, TS, TR * TU);
|
||||
}
|
||||
{
|
||||
E TY, T12, TV, TZ;
|
||||
TY = TW + TX;
|
||||
T12 = T10 - T11;
|
||||
TV = W[6];
|
||||
TZ = W[7];
|
||||
Rp[WS(rs, 2)] = FNMS(TZ, T12, TV * TY);
|
||||
Rm[WS(rs, 2)] = FMA(TZ, TY, TV * T12);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1x, T1C, T1Q, T1N, T1F, T1R, T1u, T1M, T1D, T1s;
|
||||
/* Remaining Ip/Im outputs (offsets 0, 1, 3, 4). */
T1x = FNMS(KP951056516, T1w, KP587785252 * T1v);
|
||||
T1C = FNMS(KP951056516, T1B, KP587785252 * T1A);
|
||||
T1Q = FMA(KP951056516, T1A, KP587785252 * T1B);
|
||||
T1N = FMA(KP951056516, T1v, KP587785252 * T1w);
|
||||
T1D = FNMS(KP250000000, T1p, T1i);
|
||||
T1F = T1D - T1E;
|
||||
T1R = T1E + T1D;
|
||||
T1s = FNMS(KP250000000, T1f, T18);
|
||||
T1u = T1s - T1t;
|
||||
T1M = T1t + T1s;
|
||||
{
|
||||
E T1y, T1G, T1r, T1z;
|
||||
T1y = T1u - T1x;
|
||||
T1G = T1C + T1F;
|
||||
T1r = W[12];
|
||||
T1z = W[13];
|
||||
Ip[WS(rs, 3)] = FNMS(T1z, T1G, T1r * T1y);
|
||||
Im[WS(rs, 3)] = FMA(T1r, T1G, T1z * T1y);
|
||||
}
|
||||
{
|
||||
E T1U, T1W, T1T, T1V;
|
||||
T1U = T1M + T1N;
|
||||
T1W = T1R - T1Q;
|
||||
T1T = W[16];
|
||||
T1V = W[17];
|
||||
Ip[WS(rs, 4)] = FNMS(T1V, T1W, T1T * T1U);
|
||||
Im[WS(rs, 4)] = FMA(T1T, T1W, T1V * T1U);
|
||||
}
|
||||
{
|
||||
E T1I, T1K, T1H, T1J;
|
||||
T1I = T1u + T1x;
|
||||
T1K = T1F - T1C;
|
||||
T1H = W[4];
|
||||
T1J = W[5];
|
||||
Ip[WS(rs, 1)] = FNMS(T1J, T1K, T1H * T1I);
|
||||
Im[WS(rs, 1)] = FMA(T1H, T1K, T1J * T1I);
|
||||
}
|
||||
{
|
||||
E T1O, T1S, T1L, T1P;
|
||||
T1O = T1M - T1N;
|
||||
T1S = T1Q + T1R;
|
||||
T1L = W[0];
|
||||
T1P = W[1];
|
||||
Ip[0] = FNMS(T1P, T1S, T1L * T1O);
|
||||
Im[0] = FMA(T1L, T1S, T1P * T1O);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for hc2cb_10: a single TW_FULL entry with last
   field 10, followed by a final TW_NEXT entry.  The kernel reads eighteen
   reals of W per iteration -- presumably nine complex twiddles; confirm
   against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA variant of hc2cb_10.  The leading 10 matches the
   generator's "-n 10" option, and { 72, 30, 30, 0 } matches the 72 additions,
   30 multiplications and 30 fused multiply/adds reported in the generated
   header comment (meaning of the final 0 not visible here). */
static const hc2c_desc desc = { 10, "hc2cb_10", twinstr, &GENUS, { 72, 30, 30, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hc2cb_10 kernel and its descriptor to
   planner p through khc2c_register, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cb_10, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
597
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_12.c
Normal file
597
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_12.c
Normal file
@@ -0,0 +1,597 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cb_12 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 68 FP multiplications,
|
||||
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_12 -- machine-generated kernel, FMA-oriented variant (this copy is
 * compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the body loads 24 values -- element 0 and
 * elements WS(rs, 1) .. WS(rs, 5) of each of Rp, Ip, Rm and Im -- combines
 * them with the constants KP500000000 and KP866025403, and stores 24
 * results back into the same locations.  Rp[0] and Rm[0] receive plain
 * sums; every other output is a two-term combination weighted by a pair
 * of coefficients (W[2k], W[2k + 1]), k = 0 .. 10.
 *
 * All loads of an iteration are issued before its first store and the
 * outputs overwrite the inputs, so the statement order must be preserved.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written; stepped by +ms after each iteration
 *   Rm, Im  arrays read and written; stepped by -ms after each iteration
 *   W       coefficient table; first offset by (mb - 1) * 22, then advanced
 *           by 22 per iteration
 *   rs      stride handed to WS() to form element offsets
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step
 *
 * NOTE(review): the name and the generator options recorded in this file
 * (-sign 1 -n 12 -dif) suggest a size-12 backward half-complex to complex
 * DIF butterfly; confirm against rdft/scalar/hc2cb.h.
 */
static void hc2cb_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    {
        INT m;
        for (m = mb, W = W + ((mb - 1) * 22); m < me;
             m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms,
             W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
            E T18, T20, T1b, T21, T1s, T2a, T1p, T29, TI, TN, TO, Tb, To, T1f, T23;
            E T1i, T24, T1z, T2d, T1w, T2c, Tt, Ty, Tz, Tm, TD;
            /* First half of the loads and their partial combinations. */
            {
                E T1, TE, T6, TM, T4, T1o, TH, T17, T9, T1r, TL, T1a;
                T1 = Rp[0];
                TE = Ip[0];
                T6 = Rm[WS(rs, 5)];
                TM = Im[WS(rs, 5)];
                {
                    E T2, T3, TF, TG;
                    T2 = Rp[WS(rs, 4)];
                    T3 = Rm[WS(rs, 3)];
                    T4 = T2 + T3;
                    T1o = T2 - T3;
                    TF = Ip[WS(rs, 4)];
                    TG = Im[WS(rs, 3)];
                    TH = TF - TG;
                    T17 = TF + TG;
                }
                {
                    E T7, T8, TJ, TK;
                    T7 = Rm[WS(rs, 1)];
                    T8 = Rp[WS(rs, 2)];
                    T9 = T7 + T8;
                    T1r = T7 - T8;
                    TJ = Ip[WS(rs, 2)];
                    TK = Im[WS(rs, 1)];
                    TL = TJ - TK;
                    T1a = TJ + TK;
                }
                {
                    E T16, T19, T1q, T1n, T5, Ta;
                    T16 = FNMS(KP500000000, T4, T1);
                    T18 = FNMS(KP866025403, T17, T16);
                    T20 = FMA(KP866025403, T17, T16);
                    T19 = FNMS(KP500000000, T9, T6);
                    T1b = FMA(KP866025403, T1a, T19);
                    T21 = FNMS(KP866025403, T1a, T19);
                    T1q = FMA(KP500000000, TL, TM);
                    T1s = FNMS(KP866025403, T1r, T1q);
                    T2a = FMA(KP866025403, T1r, T1q);
                    T1n = FNMS(KP500000000, TH, TE);
                    T1p = FMA(KP866025403, T1o, T1n);
                    T29 = FNMS(KP866025403, T1o, T1n);
                    TI = TE + TH;
                    TN = TL - TM;
                    TO = TI - TN;
                    T5 = T1 + T4;
                    Ta = T6 + T9;
                    Tb = T5 + Ta;
                    To = T5 - Ta;
                }
            }
            /* Second half of the loads and their partial combinations. */
            {
                E Tc, Tp, Th, Tx, Tf, T1v, Ts, T1e, Tk, T1y, Tw, T1h;
                Tc = Rp[WS(rs, 3)];
                Tp = Ip[WS(rs, 3)];
                Th = Rm[WS(rs, 2)];
                Tx = Im[WS(rs, 2)];
                {
                    E Td, Te, Tq, Tr;
                    Td = Rm[WS(rs, 4)];
                    Te = Rm[0];
                    Tf = Td + Te;
                    T1v = Td - Te;
                    Tq = Im[WS(rs, 4)];
                    Tr = Im[0];
                    Ts = Tq + Tr;
                    T1e = Tq - Tr;
                }
                {
                    E Ti, Tj, Tu, Tv;
                    Ti = Rp[WS(rs, 1)];
                    Tj = Rp[WS(rs, 5)];
                    Tk = Ti + Tj;
                    T1y = Ti - Tj;
                    Tu = Ip[WS(rs, 1)];
                    Tv = Ip[WS(rs, 5)];
                    Tw = Tu + Tv;
                    T1h = Tv - Tu;
                }
                {
                    E T1d, T1g, T1x, T1u, Tg, Tl;
                    T1d = FNMS(KP500000000, Tf, Tc);
                    T1f = FMA(KP866025403, T1e, T1d);
                    T23 = FNMS(KP866025403, T1e, T1d);
                    T1g = FNMS(KP500000000, Tk, Th);
                    T1i = FMA(KP866025403, T1h, T1g);
                    T24 = FNMS(KP866025403, T1h, T1g);
                    T1x = FMA(KP500000000, Tw, Tx);
                    T1z = FNMS(KP866025403, T1y, T1x);
                    T2d = FMA(KP866025403, T1y, T1x);
                    T1u = FMA(KP500000000, Ts, Tp);
                    T1w = FMA(KP866025403, T1v, T1u);
                    T2c = FNMS(KP866025403, T1v, T1u);
                    Tt = Tp - Ts;
                    Ty = Tw - Tx;
                    Tz = Tt - Ty;
                    Tg = Tc + Tf;
                    Tl = Th + Tk;
                    Tm = Tg + Tl;
                    TD = Tg - Tl;
                }
            }
            /* From here on only stores: every input has been read. */
            Rp[0] = Tb + Tm;
            /* Outputs weighted by W[16], W[17]. */
            {
                E TA, TP, TB, TQ, Tn, TC;
                TA = To - Tz;
                TP = TD + TO;
                Tn = W[16];
                TB = Tn * TA;
                TQ = Tn * TP;
                TC = W[17];
                Ip[WS(rs, 4)] = FNMS(TC, TP, TB);
                Im[WS(rs, 4)] = FMA(TC, TA, TQ);
            }
            /* Outputs weighted by W[4], W[5]. */
            {
                E TS, TV, TT, TW, TR, TU;
                TS = To + Tz;
                TV = TO - TD;
                TR = W[4];
                TT = TR * TS;
                TW = TR * TV;
                TU = W[5];
                Ip[WS(rs, 1)] = FNMS(TU, TV, TT);
                Im[WS(rs, 1)] = FMA(TU, TS, TW);
            }
            /* Rm[0] (unweighted) and the outputs weighted by W[10], W[11]. */
            {
                E T11, T12, T13, TX, TZ, T10, T14, TY;
                T11 = TI + TN;
                T12 = Tt + Ty;
                T13 = T11 - T12;
                TY = Tb - Tm;
                TX = W[10];
                TZ = TX * TY;
                T10 = W[11];
                T14 = T10 * TY;
                Rm[0] = T11 + T12;
                Rm[WS(rs, 3)] = FMA(TX, T13, T14);
                Rp[WS(rs, 3)] = FNMS(T10, T13, TZ);
            }
            /* Outputs weighted by W[18], W[19] and W[6], W[7]. */
            {
                E T1k, T1E, T1B, T1H;
                {
                    E T1c, T1j, T1t, T1A;
                    T1c = T18 + T1b;
                    T1j = T1f + T1i;
                    T1k = T1c - T1j;
                    T1E = T1c + T1j;
                    T1t = T1p - T1s;
                    T1A = T1w - T1z;
                    T1B = T1t - T1A;
                    T1H = T1t + T1A;
                }
                {
                    E T15, T1l, T1m, T1C;
                    T15 = W[18];
                    T1l = T15 * T1k;
                    T1m = W[19];
                    T1C = T1m * T1k;
                    Rp[WS(rs, 5)] = FNMS(T1m, T1B, T1l);
                    Rm[WS(rs, 5)] = FMA(T15, T1B, T1C);
                }
                {
                    E T1D, T1F, T1G, T1I;
                    T1D = W[6];
                    T1F = T1D * T1E;
                    T1G = W[7];
                    T1I = T1G * T1E;
                    Rp[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
                    Rm[WS(rs, 2)] = FMA(T1D, T1H, T1I);
                }
            }
            /* Outputs weighted by W[2], W[3] and W[14], W[15]. */
            {
                E T26, T2i, T2f, T2l;
                {
                    E T22, T25, T2b, T2e;
                    T22 = T20 + T21;
                    T25 = T23 + T24;
                    T26 = T22 - T25;
                    T2i = T22 + T25;
                    T2b = T29 - T2a;
                    T2e = T2c - T2d;
                    T2f = T2b - T2e;
                    T2l = T2b + T2e;
                }
                {
                    E T1Z, T27, T28, T2g;
                    T1Z = W[2];
                    T27 = T1Z * T26;
                    T28 = W[3];
                    T2g = T28 * T26;
                    Rp[WS(rs, 1)] = FNMS(T28, T2f, T27);
                    Rm[WS(rs, 1)] = FMA(T1Z, T2f, T2g);
                }
                {
                    E T2h, T2j, T2k, T2m;
                    T2h = W[14];
                    T2j = T2h * T2i;
                    T2k = W[15];
                    T2m = T2k * T2i;
                    Rp[WS(rs, 4)] = FNMS(T2k, T2l, T2j);
                    Rm[WS(rs, 4)] = FMA(T2h, T2l, T2m);
                }
            }
            /* Outputs weighted by W[8], W[9] and W[20], W[21]. */
            {
                E T2q, T2y, T2v, T2B;
                {
                    E T2o, T2p, T2t, T2u;
                    T2o = T20 - T21;
                    T2p = T2c + T2d;
                    T2q = T2o - T2p;
                    T2y = T2o + T2p;
                    T2t = T29 + T2a;
                    T2u = T23 - T24;
                    T2v = T2t + T2u;
                    T2B = T2t - T2u;
                }
                {
                    E T2r, T2w, T2n, T2s;
                    T2n = W[8];
                    T2r = T2n * T2q;
                    T2w = T2n * T2v;
                    T2s = W[9];
                    Ip[WS(rs, 2)] = FNMS(T2s, T2v, T2r);
                    Im[WS(rs, 2)] = FMA(T2s, T2q, T2w);
                }
                {
                    E T2z, T2C, T2x, T2A;
                    T2x = W[20];
                    T2z = T2x * T2y;
                    T2C = T2x * T2B;
                    T2A = W[21];
                    Ip[WS(rs, 5)] = FNMS(T2A, T2B, T2z);
                    Im[WS(rs, 5)] = FMA(T2A, T2y, T2C);
                }
            }
            /* Outputs weighted by W[0], W[1] and W[12], W[13]. */
            {
                E T1M, T1U, T1R, T1X;
                {
                    E T1K, T1L, T1P, T1Q;
                    T1K = T18 - T1b;
                    T1L = T1w + T1z;
                    T1M = T1K - T1L;
                    T1U = T1K + T1L;
                    T1P = T1p + T1s;
                    T1Q = T1f - T1i;
                    T1R = T1P + T1Q;
                    T1X = T1P - T1Q;
                }
                {
                    E T1N, T1S, T1J, T1O;
                    T1J = W[0];
                    T1N = T1J * T1M;
                    T1S = T1J * T1R;
                    T1O = W[1];
                    Ip[0] = FNMS(T1O, T1R, T1N);
                    Im[0] = FMA(T1O, T1M, T1S);
                }
                {
                    E T1V, T1Y, T1T, T1W;
                    T1T = W[12];
                    T1V = T1T * T1U;
                    T1Y = T1T * T1X;
                    T1W = W[13];
                    Ip[WS(rs, 3)] = FNMS(T1W, T1X, T1V);
                    Im[WS(rs, 3)] = FMA(T1W, T1U, T1Y);
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the FMA variant of hc2cb_12.
 * The counts 72 / 22 / 46 equal the additions, multiplications and fused
 * multiply/adds reported in the generator's summary comment for this
 * variant.
 */
static const hc2c_desc desc = {
    12,                  /* transform size */
    "hc2cb_12",          /* codelet name */
    twinstr,             /* twiddle instruction table defined above */
    &GENUS,
    { 72, 22, 46, 0 }
};
|
||||
|
||||
/*
 * Registration entry point for the hc2cb_12 kernel: passes the kernel
 * function and its descriptor to X(khc2c_register) on planner `p`, tagged
 * with HC2C_VIA_RDFT.
 */
void X(codelet_hc2cb_12) (planner *p) {
    X(khc2c_register) (p, hc2cb_12, &desc, HC2C_VIA_RDFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cb_12 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 60 FP multiplications,
|
||||
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 39 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_12 -- machine-generated kernel, variant emitted without the
 * generator's -fma option (this copy sits in the #else branch of the
 * ARCH_PREFERS_FMA / ISA_EXTENSION_PREFERS_FMA test).
 *
 * For every m in [mb, me) the body loads 24 values -- element 0 and
 * elements WS(rs, 1) .. WS(rs, 5) of each of Rp, Ip, Rm and Im -- combines
 * them with the constants KP500000000 and KP866025403, and stores 24
 * results back into the same locations.  Rp[0] and Rm[0] receive plain
 * sums; every other output is a two-term combination weighted by a pair
 * of coefficients (W[2k], W[2k + 1]), k = 0 .. 10.
 *
 * All loads of an iteration are issued before its first store and the
 * outputs overwrite the inputs, so the statement order must be preserved.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written; stepped by +ms after each iteration
 *   Rm, Im  arrays read and written; stepped by -ms after each iteration
 *   W       coefficient table; first offset by (mb - 1) * 22, then advanced
 *           by 22 per iteration
 *   rs      stride handed to WS() to form element offsets
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step
 *
 * NOTE(review): the name and the generator options recorded in this file
 * (-sign 1 -n 12 -dif) suggest a size-12 backward half-complex to complex
 * DIF butterfly; confirm against rdft/scalar/hc2cb.h.
 */
static void hc2cb_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    {
        INT m;
        for (m = mb, W = W + ((mb - 1) * 22); m < me;
             m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms,
             W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
            E T5, TH, T12, T1M, T1i, T1U, Tl, Ty, T1c, T1Y, T1s, T1Q, Ta, TM, T15;
            E T1N, T1l, T1V, Tg, Tt, T19, T1X, T1p, T1P;
            /* Four load groups, six inputs each, with their partial sums. */
            {
                E T1, TD, T4, T1g, TG, T11, T10, T1h;
                T1 = Rp[0];
                TD = Ip[0];
                {
                    E T2, T3, TE, TF;
                    T2 = Rp[WS(rs, 4)];
                    T3 = Rm[WS(rs, 3)];
                    T4 = T2 + T3;
                    T1g = KP866025403 * (T2 - T3);
                    TE = Ip[WS(rs, 4)];
                    TF = Im[WS(rs, 3)];
                    TG = TE - TF;
                    T11 = KP866025403 * (TE + TF);
                }
                T5 = T1 + T4;
                TH = TD + TG;
                T10 = FNMS(KP500000000, T4, T1);
                T12 = T10 - T11;
                T1M = T10 + T11;
                T1h = FNMS(KP500000000, TG, TD);
                T1i = T1g + T1h;
                T1U = T1h - T1g;
            }
            {
                E Th, Tx, Tk, T1a, Tw, T1r, T1b, T1q;
                Th = Rm[WS(rs, 2)];
                Tx = Im[WS(rs, 2)];
                {
                    E Ti, Tj, Tu, Tv;
                    Ti = Rp[WS(rs, 1)];
                    Tj = Rp[WS(rs, 5)];
                    Tk = Ti + Tj;
                    T1a = KP866025403 * (Ti - Tj);
                    Tu = Ip[WS(rs, 1)];
                    Tv = Ip[WS(rs, 5)];
                    Tw = Tu + Tv;
                    T1r = KP866025403 * (Tv - Tu);
                }
                Tl = Th + Tk;
                Ty = Tw - Tx;
                T1b = FMA(KP500000000, Tw, Tx);
                T1c = T1a - T1b;
                T1Y = T1a + T1b;
                T1q = FNMS(KP500000000, Tk, Th);
                T1s = T1q + T1r;
                T1Q = T1q - T1r;
            }
            {
                E T6, TL, T9, T1j, TK, T14, T13, T1k;
                T6 = Rm[WS(rs, 5)];
                TL = Im[WS(rs, 5)];
                {
                    E T7, T8, TI, TJ;
                    T7 = Rm[WS(rs, 1)];
                    T8 = Rp[WS(rs, 2)];
                    T9 = T7 + T8;
                    T1j = KP866025403 * (T7 - T8);
                    TI = Ip[WS(rs, 2)];
                    TJ = Im[WS(rs, 1)];
                    TK = TI - TJ;
                    T14 = KP866025403 * (TI + TJ);
                }
                Ta = T6 + T9;
                TM = TK - TL;
                T13 = FNMS(KP500000000, T9, T6);
                T15 = T13 + T14;
                T1N = T13 - T14;
                T1k = FMA(KP500000000, TK, TL);
                T1l = T1j - T1k;
                T1V = T1j + T1k;
            }
            {
                E Tc, Tp, Tf, T17, Ts, T1o, T18, T1n;
                Tc = Rp[WS(rs, 3)];
                Tp = Ip[WS(rs, 3)];
                {
                    E Td, Te, Tq, Tr;
                    Td = Rm[WS(rs, 4)];
                    Te = Rm[0];
                    Tf = Td + Te;
                    T17 = KP866025403 * (Td - Te);
                    Tq = Im[WS(rs, 4)];
                    Tr = Im[0];
                    Ts = Tq + Tr;
                    T1o = KP866025403 * (Tq - Tr);
                }
                Tg = Tc + Tf;
                Tt = Tp - Ts;
                T18 = FMA(KP500000000, Ts, Tp);
                T19 = T17 + T18;
                T1X = T18 - T17;
                T1n = FNMS(KP500000000, Tf, Tc);
                T1p = T1n + T1o;
                T1P = T1n - T1o;
            }
            /*
             * From here on only stores: every input has been read.
             * Rp[0], Rm[0] (unweighted) and the outputs weighted by
             * W[10], W[11].
             */
            {
                E Tb, Tm, TU, TW, TX, TY, TT, TV;
                Tb = T5 + Ta;
                Tm = Tg + Tl;
                TU = Tb - Tm;
                TW = TH + TM;
                TX = Tt + Ty;
                TY = TW - TX;
                Rp[0] = Tb + Tm;
                Rm[0] = TW + TX;
                TT = W[10];
                TV = W[11];
                Rp[WS(rs, 3)] = FNMS(TV, TY, TT * TU);
                Rm[WS(rs, 3)] = FMA(TV, TU, TT * TY);
            }
            /* Outputs weighted by W[16], W[17] and W[4], W[5]. */
            {
                E TA, TQ, TO, TS;
                {
                    E To, Tz, TC, TN;
                    To = T5 - Ta;
                    Tz = Tt - Ty;
                    TA = To - Tz;
                    TQ = To + Tz;
                    TC = Tg - Tl;
                    TN = TH - TM;
                    TO = TC + TN;
                    TS = TN - TC;
                }
                {
                    E Tn, TB, TP, TR;
                    Tn = W[16];
                    TB = W[17];
                    Ip[WS(rs, 4)] = FNMS(TB, TO, Tn * TA);
                    Im[WS(rs, 4)] = FMA(Tn, TO, TB * TA);
                    TP = W[4];
                    TR = W[5];
                    Ip[WS(rs, 1)] = FNMS(TR, TS, TP * TQ);
                    Im[WS(rs, 1)] = FMA(TP, TS, TR * TQ);
                }
            }
            /* Outputs weighted by W[8], W[9] and W[20], W[21]. */
            {
                E T28, T2e, T2c, T2g;
                {
                    E T26, T27, T2a, T2b;
                    T26 = T1M - T1N;
                    T27 = T1X + T1Y;
                    T28 = T26 - T27;
                    T2e = T26 + T27;
                    T2a = T1U + T1V;
                    T2b = T1P - T1Q;
                    T2c = T2a + T2b;
                    T2g = T2a - T2b;
                }
                {
                    E T25, T29, T2d, T2f;
                    T25 = W[8];
                    T29 = W[9];
                    Ip[WS(rs, 2)] = FNMS(T29, T2c, T25 * T28);
                    Im[WS(rs, 2)] = FMA(T25, T2c, T29 * T28);
                    T2d = W[20];
                    T2f = W[21];
                    Ip[WS(rs, 5)] = FNMS(T2f, T2g, T2d * T2e);
                    Im[WS(rs, 5)] = FMA(T2d, T2g, T2f * T2e);
                }
            }
            /* Outputs weighted by W[2], W[3] and W[14], W[15]. */
            {
                E T1S, T22, T20, T24;
                {
                    E T1O, T1R, T1W, T1Z;
                    T1O = T1M + T1N;
                    T1R = T1P + T1Q;
                    T1S = T1O - T1R;
                    T22 = T1O + T1R;
                    T1W = T1U - T1V;
                    T1Z = T1X - T1Y;
                    T20 = T1W - T1Z;
                    T24 = T1W + T1Z;
                }
                {
                    E T1L, T1T, T21, T23;
                    T1L = W[2];
                    T1T = W[3];
                    Rp[WS(rs, 1)] = FNMS(T1T, T20, T1L * T1S);
                    Rm[WS(rs, 1)] = FMA(T1T, T1S, T1L * T20);
                    T21 = W[14];
                    T23 = W[15];
                    Rp[WS(rs, 4)] = FNMS(T23, T24, T21 * T22);
                    Rm[WS(rs, 4)] = FMA(T23, T22, T21 * T24);
                }
            }
            /* Outputs weighted by W[18], W[19] and W[6], W[7]. */
            {
                E T1C, T1I, T1G, T1K;
                {
                    E T1A, T1B, T1E, T1F;
                    T1A = T12 + T15;
                    T1B = T1p + T1s;
                    T1C = T1A - T1B;
                    T1I = T1A + T1B;
                    T1E = T1i + T1l;
                    T1F = T19 + T1c;
                    T1G = T1E - T1F;
                    T1K = T1E + T1F;
                }
                {
                    E T1z, T1D, T1H, T1J;
                    T1z = W[18];
                    T1D = W[19];
                    Rp[WS(rs, 5)] = FNMS(T1D, T1G, T1z * T1C);
                    Rm[WS(rs, 5)] = FMA(T1D, T1C, T1z * T1G);
                    T1H = W[6];
                    T1J = W[7];
                    Rp[WS(rs, 2)] = FNMS(T1J, T1K, T1H * T1I);
                    Rm[WS(rs, 2)] = FMA(T1J, T1I, T1H * T1K);
                }
            }
            /* Outputs weighted by W[0], W[1] and W[12], W[13]. */
            {
                E T1e, T1w, T1u, T1y;
                {
                    E T16, T1d, T1m, T1t;
                    T16 = T12 - T15;
                    T1d = T19 - T1c;
                    T1e = T16 - T1d;
                    T1w = T16 + T1d;
                    T1m = T1i - T1l;
                    T1t = T1p - T1s;
                    T1u = T1m + T1t;
                    T1y = T1m - T1t;
                }
                {
                    E TZ, T1f, T1v, T1x;
                    TZ = W[0];
                    T1f = W[1];
                    Ip[0] = FNMS(T1f, T1u, TZ * T1e);
                    Im[0] = FMA(TZ, T1u, T1f * T1e);
                    T1v = W[12];
                    T1x = W[13];
                    Ip[WS(rs, 3)] = FNMS(T1x, T1y, T1v * T1w);
                    Im[WS(rs, 3)] = FMA(T1v, T1y, T1x * T1w);
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the non-FMA variant of hc2cb_12.
 * The counts 88 / 30 / 30 equal the additions, multiplications and fused
 * multiply/adds reported in the generator's summary comment for this
 * variant.
 */
static const hc2c_desc desc = {
    12,                  /* transform size */
    "hc2cb_12",          /* codelet name */
    twinstr,             /* twiddle instruction table defined above */
    &GENUS,
    { 88, 30, 30, 0 }
};
|
||||
|
||||
/*
 * Registration entry point for the hc2cb_12 kernel: passes the kernel
 * function and its descriptor to X(khc2c_register) on planner `p`, tagged
 * with HC2C_VIA_RDFT.
 */
void X(codelet_hc2cb_12) (planner *p) {
    X(khc2c_register) (p, hc2cb_12, &desc, HC2C_VIA_RDFT);
}
|
||||
#endif
|
||||
833
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_16.c
Normal file
833
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_16.c
Normal file
@@ -0,0 +1,833 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cb_16 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 100 FP multiplications,
|
||||
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
|
||||
* 63 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_16 -- machine-generated kernel, FMA-oriented variant (this copy is
 * compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the body loads 32 values -- element 0 and
 * elements WS(rs, 1) .. WS(rs, 7) of each of Rp, Ip, Rm and Im -- combines
 * them with the constants KP414213562, KP707106781 and KP923879532, and
 * stores 32 results back into the same locations.  Rp[0] and Rm[0] receive
 * plain sums; every other output is a two-term combination weighted by a
 * pair of coefficients (W[2k], W[2k + 1]), k = 0 .. 14.
 *
 * All loads of an iteration are issued before its first store and the
 * outputs overwrite the inputs, so the statement order must be preserved.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written; stepped by +ms after each iteration
 *   Rm, Im  arrays read and written; stepped by -ms after each iteration
 *   W       coefficient table; first offset by (mb - 1) * 30, then advanced
 *           by 30 per iteration
 *   rs      stride handed to WS() to form element offsets
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step
 *
 * NOTE(review): the name and the generator options recorded in this file
 * (-sign 1 -n 16 -dif) suggest a size-16 backward half-complex to complex
 * DIF butterfly; confirm against rdft/scalar/hc2cb.h.
 */
static void hc2cb_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    DK(KP414213562, +0.414213562373095048801688724209698078569671875);
    {
        INT m;
        for (m = mb, W = W + ((mb - 1) * 30); m < me;
             m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms,
             W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
            E TA, T1O, T21, T1h, T2P, T2S, T3b, T3p, T3q, T3D, T1k, T1P, Tf, T3y, T2A;
            E T36, TL, T22, T3s, T3t, T3z, T2F, T2U, T2K, T2V, Tu, T3E, TX, T1n, T1T;
            E T24, T1W, T25, T18, T1m;
            /* First half of the loads and their partial combinations. */
            {
                E T3, Tw, T1g, T2Q, T6, T1d, Tz, T2R, Ta, TB, TE, T2y, Td, TG, TJ;
                E T2x;
                {
                    E T1, T2, T1e, T1f;
                    T1 = Rp[0];
                    T2 = Rm[WS(rs, 7)];
                    T3 = T1 + T2;
                    Tw = T1 - T2;
                    T1e = Ip[0];
                    T1f = Im[WS(rs, 7)];
                    T1g = T1e + T1f;
                    T2Q = T1e - T1f;
                }
                {
                    E T4, T5, Tx, Ty;
                    T4 = Rp[WS(rs, 4)];
                    T5 = Rm[WS(rs, 3)];
                    T6 = T4 + T5;
                    T1d = T4 - T5;
                    Tx = Ip[WS(rs, 4)];
                    Ty = Im[WS(rs, 3)];
                    Tz = Tx + Ty;
                    T2R = Tx - Ty;
                }
                {
                    E T8, T9, TC, TD;
                    T8 = Rp[WS(rs, 2)];
                    T9 = Rm[WS(rs, 5)];
                    Ta = T8 + T9;
                    TB = T8 - T9;
                    TC = Ip[WS(rs, 2)];
                    TD = Im[WS(rs, 5)];
                    TE = TC + TD;
                    T2y = TC - TD;
                }
                {
                    E Tb, Tc, TH, TI;
                    Tb = Rm[WS(rs, 1)];
                    Tc = Rp[WS(rs, 6)];
                    Td = Tb + Tc;
                    TG = Tb - Tc;
                    TH = Ip[WS(rs, 6)];
                    TI = Im[WS(rs, 1)];
                    TJ = TH + TI;
                    T2x = TH - TI;
                }
                TA = Tw - Tz;
                T1O = Tw + Tz;
                T21 = T1g - T1d;
                T1h = T1d + T1g;
                T2P = Ta - Td;
                T2S = T2Q - T2R;
                T3b = T2S - T2P;
                {
                    E T1i, T1j, T7, Te;
                    T3p = T2Q + T2R;
                    T3q = T2y + T2x;
                    T3D = T3p - T3q;
                    T1i = TB + TE;
                    T1j = TG + TJ;
                    T1k = T1i - T1j;
                    T1P = T1i + T1j;
                    T7 = T3 + T6;
                    Te = Ta + Td;
                    Tf = T7 + Te;
                    T3y = T7 - Te;
                    {
                        E T2w, T2z, TF, TK;
                        T2w = T3 - T6;
                        T2z = T2x - T2y;
                        T2A = T2w + T2z;
                        T36 = T2w - T2z;
                        TF = TB - TE;
                        TK = TG - TJ;
                        TL = TF + TK;
                        T22 = TF - TK;
                    }
                }
            }
            /* Second half of the loads and their partial combinations. */
            {
                E Ti, T13, T11, T2C, Tl, TY, T16, T2D, Tp, TS, TQ, T2H, Ts, TN, TV;
                E T2I, T2B, T2E;
                {
                    E Tg, Th, TZ, T10;
                    Tg = Rp[WS(rs, 1)];
                    Th = Rm[WS(rs, 6)];
                    Ti = Tg + Th;
                    T13 = Tg - Th;
                    TZ = Ip[WS(rs, 1)];
                    T10 = Im[WS(rs, 6)];
                    T11 = TZ + T10;
                    T2C = TZ - T10;
                }
                {
                    E Tj, Tk, T14, T15;
                    Tj = Rp[WS(rs, 5)];
                    Tk = Rm[WS(rs, 2)];
                    Tl = Tj + Tk;
                    TY = Tj - Tk;
                    T14 = Ip[WS(rs, 5)];
                    T15 = Im[WS(rs, 2)];
                    T16 = T14 + T15;
                    T2D = T14 - T15;
                }
                {
                    E Tn, To, TO, TP;
                    Tn = Rm[0];
                    To = Rp[WS(rs, 7)];
                    Tp = Tn + To;
                    TS = Tn - To;
                    TO = Ip[WS(rs, 7)];
                    TP = Im[0];
                    TQ = TO + TP;
                    T2H = TO - TP;
                }
                {
                    E Tq, Tr, TT, TU;
                    Tq = Rp[WS(rs, 3)];
                    Tr = Rm[WS(rs, 4)];
                    Ts = Tq + Tr;
                    TN = Tq - Tr;
                    TT = Ip[WS(rs, 3)];
                    TU = Im[WS(rs, 4)];
                    TV = TT + TU;
                    T2I = TT - TU;
                }
                T3s = T2C + T2D;
                T3t = T2H + T2I;
                T3z = T3t - T3s;
                T2B = Ti - Tl;
                T2E = T2C - T2D;
                T2F = T2B - T2E;
                T2U = T2B + T2E;
                {
                    E T2G, T2J, Tm, Tt;
                    T2G = Tp - Ts;
                    T2J = T2H - T2I;
                    T2K = T2G + T2J;
                    T2V = T2J - T2G;
                    Tm = Ti + Tl;
                    Tt = Tp + Ts;
                    Tu = Tm + Tt;
                    T3E = Tm - Tt;
                }
                {
                    E TR, TW, T1R, T1S;
                    TR = TN - TQ;
                    TW = TS - TV;
                    TX = FNMS(KP414213562, TW, TR);
                    T1n = FMA(KP414213562, TR, TW);
                    T1R = T11 - TY;
                    T1S = T13 + T16;
                    T1T = FNMS(KP414213562, T1S, T1R);
                    T24 = FMA(KP414213562, T1R, T1S);
                }
                {
                    E T1U, T1V, T12, T17;
                    T1U = TN + TQ;
                    T1V = TS + TV;
                    T1W = FNMS(KP414213562, T1V, T1U);
                    T25 = FMA(KP414213562, T1U, T1V);
                    T12 = TY + T11;
                    T17 = T13 - T16;
                    T18 = FMA(KP414213562, T17, T12);
                    T1m = FNMS(KP414213562, T12, T17);
                }
            }
            /* From here on only stores: every input has been read. */
            Rp[0] = Tf + Tu;
            /* Rm[0] (unweighted) and the outputs weighted by W[14], W[15]. */
            {
                E T3r, T3u, T3v, T3l, T3n, T3o, T3w, T3m;
                T3r = T3p + T3q;
                T3u = T3s + T3t;
                T3v = T3r - T3u;
                T3m = Tf - Tu;
                T3l = W[14];
                T3n = T3l * T3m;
                T3o = W[15];
                T3w = T3o * T3m;
                Rm[0] = T3r + T3u;
                Rm[WS(rs, 4)] = FMA(T3l, T3v, T3w);
                Rp[WS(rs, 4)] = FNMS(T3o, T3v, T3n);
            }
            /* Outputs weighted by W[22], W[23]. */
            {
                E T3A, T3F, T3B, T3G, T3x, T3C;
                T3A = T3y - T3z;
                T3F = T3D - T3E;
                T3x = W[22];
                T3B = T3x * T3A;
                T3G = T3x * T3F;
                T3C = W[23];
                Rp[WS(rs, 6)] = FNMS(T3C, T3F, T3B);
                Rm[WS(rs, 6)] = FMA(T3C, T3A, T3G);
            }
            /* Outputs weighted by W[6], W[7]. */
            {
                E T3I, T3L, T3J, T3M, T3H, T3K;
                T3I = T3y + T3z;
                T3L = T3E + T3D;
                T3H = W[6];
                T3J = T3H * T3I;
                T3M = T3H * T3L;
                T3K = W[7];
                Rp[WS(rs, 2)] = FNMS(T3K, T3L, T3J);
                Rm[WS(rs, 2)] = FMA(T3K, T3I, T3M);
            }
            /* Outputs weighted by W[26], W[27] and W[10], W[11]. */
            {
                E T38, T3g, T3d, T3j, T37, T3c;
                T37 = T2V - T2U;
                T38 = FNMS(KP707106781, T37, T36);
                T3g = FMA(KP707106781, T37, T36);
                T3c = T2F - T2K;
                T3d = FNMS(KP707106781, T3c, T3b);
                T3j = FMA(KP707106781, T3c, T3b);
                {
                    E T39, T3e, T35, T3a;
                    T35 = W[26];
                    T39 = T35 * T38;
                    T3e = T35 * T3d;
                    T3a = W[27];
                    Rp[WS(rs, 7)] = FNMS(T3a, T3d, T39);
                    Rm[WS(rs, 7)] = FMA(T3a, T38, T3e);
                }
                {
                    E T3h, T3k, T3f, T3i;
                    T3f = W[10];
                    T3h = T3f * T3g;
                    T3k = T3f * T3j;
                    T3i = W[11];
                    Rp[WS(rs, 3)] = FNMS(T3i, T3j, T3h);
                    Rm[WS(rs, 3)] = FMA(T3i, T3g, T3k);
                }
            }
            /* Outputs weighted by W[18], W[19] and W[2], W[3]. */
            {
                E T2M, T30, T2X, T33, T2L, T2T, T2W;
                T2L = T2F + T2K;
                T2M = FNMS(KP707106781, T2L, T2A);
                T30 = FMA(KP707106781, T2L, T2A);
                T2T = T2P + T2S;
                T2W = T2U + T2V;
                T2X = FNMS(KP707106781, T2W, T2T);
                T33 = FMA(KP707106781, T2W, T2T);
                {
                    E T2v, T2N, T2O, T2Y;
                    T2v = W[18];
                    T2N = T2v * T2M;
                    T2O = W[19];
                    T2Y = T2O * T2M;
                    Rp[WS(rs, 5)] = FNMS(T2O, T2X, T2N);
                    Rm[WS(rs, 5)] = FMA(T2v, T2X, T2Y);
                }
                {
                    E T2Z, T31, T32, T34;
                    T2Z = W[2];
                    T31 = T2Z * T30;
                    T32 = W[3];
                    T34 = T32 * T30;
                    Rp[WS(rs, 1)] = FNMS(T32, T33, T31);
                    Rm[WS(rs, 1)] = FMA(T2Z, T33, T34);
                }
            }
            /* Outputs weighted by W[20], W[21] and W[4], W[5]. */
            {
                E T1Y, T2a, T27, T2d;
                {
                    E T1Q, T1X, T23, T26;
                    T1Q = FNMS(KP707106781, T1P, T1O);
                    T1X = T1T + T1W;
                    T1Y = FMA(KP923879532, T1X, T1Q);
                    T2a = FNMS(KP923879532, T1X, T1Q);
                    T23 = FMA(KP707106781, T22, T21);
                    T26 = T24 - T25;
                    T27 = FNMS(KP923879532, T26, T23);
                    T2d = FMA(KP923879532, T26, T23);
                }
                {
                    E T1N, T1Z, T20, T28;
                    T1N = W[20];
                    T1Z = T1N * T1Y;
                    T20 = W[21];
                    T28 = T20 * T1Y;
                    Ip[WS(rs, 5)] = FNMS(T20, T27, T1Z);
                    Im[WS(rs, 5)] = FMA(T1N, T27, T28);
                }
                {
                    E T29, T2b, T2c, T2e;
                    T29 = W[4];
                    T2b = T29 * T2a;
                    T2c = W[5];
                    T2e = T2c * T2a;
                    Ip[WS(rs, 1)] = FNMS(T2c, T2d, T2b);
                    Im[WS(rs, 1)] = FMA(T29, T2d, T2e);
                }
            }
            /* Outputs weighted by W[24], W[25] and W[8], W[9]. */
            {
                E T1a, T1s, T1p, T1v;
                {
                    E TM, T19, T1l, T1o;
                    TM = FNMS(KP707106781, TL, TA);
                    T19 = TX - T18;
                    T1a = FNMS(KP923879532, T19, TM);
                    T1s = FMA(KP923879532, T19, TM);
                    T1l = FNMS(KP707106781, T1k, T1h);
                    T1o = T1m - T1n;
                    T1p = FNMS(KP923879532, T1o, T1l);
                    T1v = FMA(KP923879532, T1o, T1l);
                }
                {
                    E Tv, T1b, T1c, T1q;
                    Tv = W[24];
                    T1b = Tv * T1a;
                    T1c = W[25];
                    T1q = T1c * T1a;
                    Ip[WS(rs, 6)] = FNMS(T1c, T1p, T1b);
                    Im[WS(rs, 6)] = FMA(Tv, T1p, T1q);
                }
                {
                    E T1r, T1t, T1u, T1w;
                    T1r = W[8];
                    T1t = T1r * T1s;
                    T1u = W[9];
                    T1w = T1u * T1s;
                    Ip[WS(rs, 2)] = FNMS(T1u, T1v, T1t);
                    Im[WS(rs, 2)] = FMA(T1r, T1v, T1w);
                }
            }
            /* Outputs weighted by W[12], W[13] and W[28], W[29]. */
            {
                E T2i, T2q, T2n, T2t;
                {
                    E T2g, T2h, T2l, T2m;
                    T2g = FMA(KP707106781, T1P, T1O);
                    T2h = T24 + T25;
                    T2i = FNMS(KP923879532, T2h, T2g);
                    T2q = FMA(KP923879532, T2h, T2g);
                    T2l = FNMS(KP707106781, T22, T21);
                    T2m = T1W - T1T;
                    T2n = FMA(KP923879532, T2m, T2l);
                    T2t = FNMS(KP923879532, T2m, T2l);
                }
                {
                    E T2j, T2o, T2f, T2k;
                    T2f = W[12];
                    T2j = T2f * T2i;
                    T2o = T2f * T2n;
                    T2k = W[13];
                    Ip[WS(rs, 3)] = FNMS(T2k, T2n, T2j);
                    Im[WS(rs, 3)] = FMA(T2k, T2i, T2o);
                }
                {
                    E T2r, T2u, T2p, T2s;
                    T2p = W[28];
                    T2r = T2p * T2q;
                    T2u = T2p * T2t;
                    T2s = W[29];
                    Ip[WS(rs, 7)] = FNMS(T2s, T2t, T2r);
                    Im[WS(rs, 7)] = FMA(T2s, T2q, T2u);
                }
            }
            /* Outputs weighted by W[16], W[17] and W[0], W[1]. */
            {
                E T1A, T1I, T1F, T1L;
                {
                    E T1y, T1z, T1D, T1E;
                    T1y = FMA(KP707106781, TL, TA);
                    T1z = T1m + T1n;
                    T1A = FNMS(KP923879532, T1z, T1y);
                    T1I = FMA(KP923879532, T1z, T1y);
                    T1D = FMA(KP707106781, T1k, T1h);
                    T1E = T18 + TX;
                    T1F = FNMS(KP923879532, T1E, T1D);
                    T1L = FMA(KP923879532, T1E, T1D);
                }
                {
                    E T1B, T1G, T1x, T1C;
                    T1x = W[16];
                    T1B = T1x * T1A;
                    T1G = T1x * T1F;
                    T1C = W[17];
                    Ip[WS(rs, 4)] = FNMS(T1C, T1F, T1B);
                    Im[WS(rs, 4)] = FMA(T1C, T1A, T1G);
                }
                {
                    E T1J, T1M, T1H, T1K;
                    T1H = W[0];
                    T1J = T1H * T1I;
                    T1M = T1H * T1L;
                    T1K = W[1];
                    Ip[0] = FNMS(T1K, T1L, T1J);
                    Im[0] = FMA(T1K, T1I, T1M);
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the FMA variant of hc2cb_16.
 * The counts 104 / 30 / 70 equal the additions, multiplications and fused
 * multiply/adds reported in the generator's summary comment for this
 * variant.
 */
static const hc2c_desc desc = {
    16,                  /* transform size */
    "hc2cb_16",          /* codelet name */
    twinstr,             /* twiddle instruction table defined above */
    &GENUS,
    { 104, 30, 70, 0 }
};
|
||||
|
||||
/*
 * Registration entry point for the hc2cb_16 kernel: passes the kernel
 * function and its descriptor to X(khc2c_register) on planner `p`, tagged
 * with HC2C_VIA_RDFT.
 */
void X(codelet_hc2cb_16) (planner *p) {
    X(khc2c_register) (p, hc2cb_16, &desc, HC2C_VIA_RDFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cb_16 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 84 FP multiplications,
|
||||
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
|
||||
* 50 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_16, the variant generated without -fma (see the "Generated by" note).
 *
 * Each pass of the loop (m = mb .. me-1) loads eight values from each of Rp,
 * Ip, Rm and Im (offset 0 and WS(rs, 1) .. WS(rs, 7)), runs them through a
 * fixed network of additions, subtractions and multiplications by the three
 * DK constants, and stores sixteen results back into the same four arrays.
 * Apart from Rp[0] and Rm[0], every stored pair is first combined with two
 * consecutive entries of W (W[0] .. W[29]).  All loads precede the first
 * store.  Between passes Rp and Ip advance by ms, Rm and Im retreat by ms,
 * and W advances by 30.
 *
 * The T* temporaries keep the names emitted by the generator; statements
 * appear in the generator's order.
 *
 * NOTE(review): FMA/FNMS/WS/DK/MAKE_VOLATILE_STRIDE come from headers outside
 * this chunk; presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b
 * -- confirm.
 */
static void hc2cb_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;

          /* Offset W by (mb - 1) groups of 30 entries before the first pass. */
          W += (mb - 1) * 30;
          for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 30, MAKE_VOLATILE_STRIDE(64, rs)) {
               /* Values handed from the load stages to the store stages. */
               E T7, T2K, T2W, Tw, T17, T1S, T2k, T1w, Te, TD, T1x, T10, T2n, T2L, T1Z;
               E T2X, Tm, T1z, TN, T19, T2e, T2p, T2P, T2Z, Tt, T1A, TW, T1a, T27, T2q;
               E T2S, T30;

               /* Load stage: offsets 0 and 4 of Rp/Ip, 7 and 3 of Rm/Im. */
               {
                    E T3, T1Q, T13, T2j, T6, T2i, T16, T1R;
                    E T1, T2, T11, T12;
                    E T4, T5, T14, T15;

                    T1 = Rp[0];
                    T2 = Rm[WS(rs, 7)];
                    T3 = T1 + T2;
                    T1Q = T1 - T2;
                    T11 = Ip[0];
                    T12 = Im[WS(rs, 7)];
                    T13 = T11 - T12;
                    T2j = T11 + T12;

                    T4 = Rp[WS(rs, 4)];
                    T5 = Rm[WS(rs, 3)];
                    T6 = T4 + T5;
                    T2i = T4 - T5;
                    T14 = Ip[WS(rs, 4)];
                    T15 = Im[WS(rs, 3)];
                    T16 = T14 - T15;
                    T1R = T14 + T15;

                    T7 = T3 + T6;
                    T2K = T1Q + T1R;
                    T2W = T2j - T2i;
                    Tw = T3 - T6;
                    T17 = T13 - T16;
                    T1S = T1Q - T1R;
                    T2k = T2i + T2j;
                    T1w = T13 + T16;
               }

               /* Load stage: offsets 2 and 6 of Rp/Ip, 5 and 1 of Rm/Im. */
               {
                    E Ta, T1T, TC, T1U, Td, T1W, Tz, T1X;
                    E T8, T9, TA, TB;
                    E Tb, Tc, Tx, Ty;
                    E T2l, T2m, T1V, T1Y;

                    T8 = Rp[WS(rs, 2)];
                    T9 = Rm[WS(rs, 5)];
                    Ta = T8 + T9;
                    T1T = T8 - T9;
                    TA = Ip[WS(rs, 2)];
                    TB = Im[WS(rs, 5)];
                    TC = TA - TB;
                    T1U = TA + TB;

                    Tb = Rm[WS(rs, 1)];
                    Tc = Rp[WS(rs, 6)];
                    Td = Tb + Tc;
                    T1W = Tb - Tc;
                    Tx = Ip[WS(rs, 6)];
                    Ty = Im[WS(rs, 1)];
                    Tz = Tx - Ty;
                    T1X = Tx + Ty;

                    Te = Ta + Td;
                    TD = Tz - TC;
                    T1x = TC + Tz;
                    T10 = Ta - Td;

                    T2l = T1T + T1U;
                    T2m = T1W + T1X;
                    T2n = KP707106781 * (T2l - T2m);
                    T2L = KP707106781 * (T2l + T2m);
                    T1V = T1T - T1U;
                    T1Y = T1W - T1X;
                    T1Z = KP707106781 * (T1V + T1Y);
                    T2X = KP707106781 * (T1V - T1Y);
               }

               /* Load stage: offsets 1 and 5 of Rp/Ip, 6 and 2 of Rm/Im. */
               {
                    E Ti, T2b, TI, T29, Tl, T28, TL, T2c, TF, TM;
                    E Tg, Th, TG, TH;
                    E Tj, Tk, TJ, TK;
                    E T2a, T2d, T2N, T2O;

                    Tg = Rp[WS(rs, 1)];
                    Th = Rm[WS(rs, 6)];
                    Ti = Tg + Th;
                    T2b = Tg - Th;
                    TG = Ip[WS(rs, 1)];
                    TH = Im[WS(rs, 6)];
                    TI = TG - TH;
                    T29 = TG + TH;

                    Tj = Rp[WS(rs, 5)];
                    Tk = Rm[WS(rs, 2)];
                    Tl = Tj + Tk;
                    T28 = Tj - Tk;
                    TJ = Ip[WS(rs, 5)];
                    TK = Im[WS(rs, 2)];
                    TL = TJ - TK;
                    T2c = TJ + TK;

                    Tm = Ti + Tl;
                    T1z = TI + TL;
                    TF = Ti - Tl;
                    TM = TI - TL;
                    TN = TF - TM;
                    T19 = TF + TM;

                    T2a = T28 + T29;
                    T2d = T2b - T2c;
                    T2e = FMA(KP923879532, T2a, KP382683432 * T2d);
                    T2p = FNMS(KP382683432, T2a, KP923879532 * T2d);
                    T2N = T2b + T2c;
                    T2O = T29 - T28;
                    T2P = FNMS(KP923879532, T2O, KP382683432 * T2N);
                    T2Z = FMA(KP382683432, T2O, KP923879532 * T2N);
               }

               /* Load stage: offsets 7 and 3 of Rp/Ip, 0 and 4 of Rm/Im. */
               {
                    E Tp, T24, TR, T22, Ts, T21, TU, T25, TO, TV;
                    E Tn, To, TP, TQ;
                    E Tq, Tr, TS, TT;
                    E T23, T26, T2Q, T2R;

                    Tn = Rm[0];
                    To = Rp[WS(rs, 7)];
                    Tp = Tn + To;
                    T24 = Tn - To;
                    TP = Ip[WS(rs, 7)];
                    TQ = Im[0];
                    TR = TP - TQ;
                    T22 = TP + TQ;

                    Tq = Rp[WS(rs, 3)];
                    Tr = Rm[WS(rs, 4)];
                    Ts = Tq + Tr;
                    T21 = Tq - Tr;
                    TS = Ip[WS(rs, 3)];
                    TT = Im[WS(rs, 4)];
                    TU = TS - TT;
                    T25 = TS + TT;

                    Tt = Tp + Ts;
                    T1A = TR + TU;
                    TO = Tp - Ts;
                    TV = TR - TU;
                    TW = TO + TV;
                    T1a = TV - TO;

                    T23 = T21 - T22;
                    T26 = T24 - T25;
                    T27 = FNMS(KP382683432, T26, KP923879532 * T23);
                    T2q = FMA(KP382683432, T23, KP923879532 * T26);
                    T2Q = T24 + T25;
                    T2R = T21 + T22;
                    T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
                    T30 = FMA(KP382683432, T2R, KP923879532 * T2Q);
               }

               /* Stores Rp[0]/Rm[0] (no W entry) and Rp/Rm at offset 4 (W[14], W[15]). */
               {
                    E Tf, Tu, T1u, T1y, T1B, T1C, T1t, T1v;

                    Tf = T7 + Te;
                    Tu = Tm + Tt;
                    T1u = Tf - Tu;
                    T1y = T1w + T1x;
                    T1B = T1z + T1A;
                    T1C = T1y - T1B;
                    Rp[0] = Tf + Tu;
                    Rm[0] = T1y + T1B;
                    T1t = W[14];
                    T1v = W[15];
                    Rp[WS(rs, 4)] = FNMS(T1v, T1C, T1t * T1u);
                    Rm[WS(rs, 4)] = FMA(T1v, T1u, T1t * T1C);
               }

               /* Stores Ip/Im at offset 5 (W[20], W[21]) and offset 1 (W[4], W[5]). */
               {
                    E T2U, T34, T32, T36;
                    E T2M, T2T, T2Y, T31;
                    E T2J, T2V, T33, T35;

                    T2M = T2K - T2L;
                    T2T = T2P + T2S;
                    T2U = T2M - T2T;
                    T34 = T2M + T2T;
                    T2Y = T2W + T2X;
                    T31 = T2Z - T30;
                    T32 = T2Y - T31;
                    T36 = T2Y + T31;

                    T2J = W[20];
                    T2V = W[21];
                    Ip[WS(rs, 5)] = FNMS(T2V, T32, T2J * T2U);
                    Im[WS(rs, 5)] = FMA(T2V, T2U, T2J * T32);
                    T33 = W[4];
                    T35 = W[5];
                    Ip[WS(rs, 1)] = FNMS(T35, T36, T33 * T34);
                    Im[WS(rs, 1)] = FMA(T35, T34, T33 * T36);
               }

               /* Stores Ip/Im at offset 3 (W[12], W[13]) and offset 7 (W[28], W[29]). */
               {
                    E T3a, T3g, T3e, T3i;
                    E T38, T39, T3c, T3d;
                    E T37, T3b, T3f, T3h;

                    T38 = T2K + T2L;
                    T39 = T2Z + T30;
                    T3a = T38 - T39;
                    T3g = T38 + T39;
                    T3c = T2W - T2X;
                    T3d = T2P - T2S;
                    T3e = T3c + T3d;
                    T3i = T3c - T3d;

                    T37 = W[12];
                    T3b = W[13];
                    Ip[WS(rs, 3)] = FNMS(T3b, T3e, T37 * T3a);
                    Im[WS(rs, 3)] = FMA(T37, T3e, T3b * T3a);
                    T3f = W[28];
                    T3h = W[29];
                    Ip[WS(rs, 7)] = FNMS(T3h, T3i, T3f * T3g);
                    Im[WS(rs, 7)] = FMA(T3f, T3i, T3h * T3g);
               }

               /* Stores Rp/Rm at offset 5 (W[18], W[19]) and offset 1 (W[2], W[3]). */
               {
                    E TY, T1e, T1c, T1g;
                    E TE, TX, T18, T1b;
                    E Tv, TZ, T1d, T1f;

                    TE = Tw + TD;
                    TX = KP707106781 * (TN + TW);
                    TY = TE - TX;
                    T1e = TE + TX;
                    T18 = T10 + T17;
                    T1b = KP707106781 * (T19 + T1a);
                    T1c = T18 - T1b;
                    T1g = T18 + T1b;

                    Tv = W[18];
                    TZ = W[19];
                    Rp[WS(rs, 5)] = FNMS(TZ, T1c, Tv * TY);
                    Rm[WS(rs, 5)] = FMA(TZ, TY, Tv * T1c);
                    T1d = W[2];
                    T1f = W[3];
                    Rp[WS(rs, 1)] = FNMS(T1f, T1g, T1d * T1e);
                    Rm[WS(rs, 1)] = FMA(T1f, T1e, T1d * T1g);
               }

               /* Stores Rp/Rm at offset 7 (W[26], W[27]) and offset 3 (W[10], W[11]). */
               {
                    E T1k, T1q, T1o, T1s;
                    E T1i, T1j, T1m, T1n;
                    E T1h, T1l, T1p, T1r;

                    T1i = Tw - TD;
                    T1j = KP707106781 * (T1a - T19);
                    T1k = T1i - T1j;
                    T1q = T1i + T1j;
                    T1m = T17 - T10;
                    T1n = KP707106781 * (TN - TW);
                    T1o = T1m - T1n;
                    T1s = T1m + T1n;

                    T1h = W[26];
                    T1l = W[27];
                    Rp[WS(rs, 7)] = FNMS(T1l, T1o, T1h * T1k);
                    Rm[WS(rs, 7)] = FMA(T1h, T1o, T1l * T1k);
                    T1p = W[10];
                    T1r = W[11];
                    Rp[WS(rs, 3)] = FNMS(T1r, T1s, T1p * T1q);
                    Rm[WS(rs, 3)] = FMA(T1p, T1s, T1r * T1q);
               }

               /* Stores Ip/Im at offset 6 (W[24], W[25]) and offset 2 (W[8], W[9]). */
               {
                    E T2g, T2u, T2s, T2w;
                    E T20, T2f, T2o, T2r;
                    E T1P, T2h, T2t, T2v;

                    T20 = T1S - T1Z;
                    T2f = T27 - T2e;
                    T2g = T20 - T2f;
                    T2u = T20 + T2f;
                    T2o = T2k - T2n;
                    T2r = T2p - T2q;
                    T2s = T2o - T2r;
                    T2w = T2o + T2r;

                    T1P = W[24];
                    T2h = W[25];
                    Ip[WS(rs, 6)] = FNMS(T2h, T2s, T1P * T2g);
                    Im[WS(rs, 6)] = FMA(T2h, T2g, T1P * T2s);
                    T2t = W[8];
                    T2v = W[9];
                    Ip[WS(rs, 2)] = FNMS(T2v, T2w, T2t * T2u);
                    Im[WS(rs, 2)] = FMA(T2v, T2u, T2t * T2w);
               }

               /* Stores Ip/Im at offset 4 (W[16], W[17]) and offset 0 (W[0], W[1]). */
               {
                    E T2A, T2G, T2E, T2I;
                    E T2y, T2z, T2C, T2D;
                    E T2x, T2B, T2F, T2H;

                    T2y = T1S + T1Z;
                    T2z = T2p + T2q;
                    T2A = T2y - T2z;
                    T2G = T2y + T2z;
                    T2C = T2k + T2n;
                    T2D = T2e + T27;
                    T2E = T2C - T2D;
                    T2I = T2C + T2D;

                    T2x = W[16];
                    T2B = W[17];
                    Ip[WS(rs, 4)] = FNMS(T2B, T2E, T2x * T2A);
                    Im[WS(rs, 4)] = FMA(T2x, T2E, T2B * T2A);
                    T2F = W[0];
                    T2H = W[1];
                    Ip[0] = FNMS(T2H, T2I, T2F * T2G);
                    Im[0] = FMA(T2F, T2I, T2H * T2G);
               }

               /* Stores Rp/Rm at offset 6 (W[22], W[23]) and offset 2 (W[6], W[7]). */
               {
                    E T1G, T1M, T1K, T1O;
                    E T1E, T1F, T1I, T1J;
                    E T1D, T1H, T1L, T1N;

                    T1E = T7 - Te;
                    T1F = T1A - T1z;
                    T1G = T1E - T1F;
                    T1M = T1E + T1F;
                    T1I = T1w - T1x;
                    T1J = Tm - Tt;
                    T1K = T1I - T1J;
                    T1O = T1J + T1I;

                    T1D = W[22];
                    T1H = W[23];
                    Rp[WS(rs, 6)] = FNMS(T1H, T1K, T1D * T1G);
                    Rm[WS(rs, 6)] = FMA(T1D, T1K, T1H * T1G);
                    T1L = W[6];
                    T1N = W[7];
                    Rp[WS(rs, 2)] = FNMS(T1N, T1O, T1L * T1M);
                    Rm[WS(rs, 2)] = FMA(T1L, T1O, T1N * T1M);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 16, "hc2cb_16", twinstr, &GENUS, { 136, 46, 38, 0 } };
|
||||
|
||||
/* Registers the hc2cb_16 kernel with planner p by handing the kernel
 * function, its descriptor and the HC2C_VIA_RDFT constant to khc2c_register. */
void X(codelet_hc2cb_16) (planner *p)
{
     X(khc2c_register) (p, hc2cb_16, &desc, HC2C_VIA_RDFT);
}
|
||||
#endif
|
||||
117
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_2.c
Normal file
117
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_2.c
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cb_2 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 11 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_2, the variant generated with -fma.
 *
 * For every m in [mb, me) the kernel loads Rp[0], Rm[0], Ip[0] and Im[0],
 * writes Rp[0] + Rm[0] and Ip[0] - Im[0] back to Rp[0] and Rm[0], and
 * combines the remaining difference/sum pair with W[0] and W[1] to produce
 * the new Ip[0] and Im[0].  All four inputs are read before anything is
 * written.  Between iterations Rp and Ip advance by ms, Rm and Im retreat
 * by ms, and W advances by 2.
 *
 * NOTE(review): FMA and FNMS are macros defined outside this chunk;
 * presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void hc2cb_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Offset W by (mb - 1) pairs before the first iteration. */
     W += (mb - 1) * 2;
     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
          E rp0, rm0, rdiff, ip0, im0, isum;
          E w0, w0_rdiff, w1, w1_rdiff;

          rp0 = Rp[0];
          rm0 = Rm[0];
          rdiff = rp0 - rm0;
          ip0 = Ip[0];
          im0 = Im[0];
          isum = ip0 + im0;
          Rp[0] = rp0 + rm0;
          Rm[0] = ip0 - im0;

          /* Products with rdiff are formed first so each output needs a
           * single FMA/FNMS call. */
          w0 = W[0];
          w0_rdiff = w0 * rdiff;
          w1 = W[1];
          w1_rdiff = w1 * rdiff;
          Ip[0] = FNMS(w1, isum, w0_rdiff);
          Im[0] = FMA(w0, isum, w1_rdiff);
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 2, "hc2cb_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
|
||||
|
||||
/* Registers the hc2cb_2 kernel with planner p by handing the kernel
 * function, its descriptor and the HC2C_VIA_RDFT constant to khc2c_register. */
void X(codelet_hc2cb_2) (planner *p)
{
     X(khc2c_register) (p, hc2cb_2, &desc, HC2C_VIA_RDFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cb_2 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 9 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_2, the variant generated without -fma.
 *
 * For every m in [mb, me) the kernel loads Rp[0], Rm[0], Ip[0] and Im[0],
 * writes Rp[0] + Rm[0] and Ip[0] - Im[0] back to Rp[0] and Rm[0], and
 * combines the remaining difference/sum pair with W[0] and W[1] to produce
 * the new Ip[0] and Im[0].  All four inputs are read before anything is
 * written.  Between iterations Rp and Ip advance by ms, Rm and Im retreat
 * by ms, and W advances by 2.
 *
 * NOTE(review): FMA and FNMS are macros defined outside this chunk;
 * presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void hc2cb_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Offset W by (mb - 1) pairs before the first iteration. */
     W += (mb - 1) * 2;
     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
          E rp0, rm0, rdiff, ip0, im0, isum, w0, w1;

          rp0 = Rp[0];
          rm0 = Rm[0];
          rdiff = rp0 - rm0;
          ip0 = Ip[0];
          im0 = Im[0];
          isum = ip0 + im0;
          Rp[0] = rp0 + rm0;
          Rm[0] = ip0 - im0;

          w0 = W[0];
          w1 = W[1];
          Ip[0] = FNMS(w1, isum, w0 * rdiff);
          Im[0] = FMA(w1, rdiff, w0 * isum);
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 2, "hc2cb_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
|
||||
|
||||
/* Registers the hc2cb_2 kernel with planner p by handing the kernel
 * function, its descriptor and the HC2C_VIA_RDFT constant to khc2c_register. */
void X(codelet_hc2cb_2) (planner *p)
{
     X(khc2c_register) (p, hc2cb_2, &desc, HC2C_VIA_RDFT);
}
|
||||
#endif
|
||||
1064
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_20.c
Normal file
1064
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1843
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_32.c
Normal file
1843
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_32.c
Normal file
File diff suppressed because it is too large
Load Diff
196
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_4.c
Normal file
196
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_4.c
Normal file
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cb_4 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 22 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_4, the variant generated with -fma.
 *
 * Each pass (m = mb .. me-1) loads offsets 0 and WS(rs, 1) of Rp, Ip, Rm and
 * Im, forms their sums and differences, and writes eight results back into
 * the same arrays.  Rp[0] and Rm[0] receive plain sums; the other three
 * output pairs are each combined with two consecutive entries of W
 * (W[0..1], W[2..3], W[4..5]).  All loads precede the first store.  Between
 * passes Rp and Ip advance by ms, Rm and Im retreat by ms, and W advances
 * by 6.
 *
 * The T* temporaries keep the names emitted by the generator; statements
 * appear in the generator's order.
 *
 * NOTE(review): FMA and FNMS are macros defined outside this chunk;
 * presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void hc2cb_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Offset W by (mb - 1) groups of 6 entries before the first pass. */
     W += (mb - 1) * 6;
     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
          E T3, T6, T8, Td, Tx, Tu, Tm, Tg, Tr;
          E Tb, Tc, Tq, Tk, Te, Tf, Tl, Tp;
          E T1, T2, T4, T5;
          E Tn, Ts, Tj, To;
          E Tv, Ty, Tt, Tw;
          E Th, Ta, Ti, T7, T9;

          /* Loads plus first-level sums and differences. */
          Tb = Ip[0];
          Tc = Im[WS(rs, 1)];
          Tq = Tb + Tc;
          T1 = Rp[0];
          T2 = Rm[WS(rs, 1)];
          T3 = T1 + T2;
          Tk = T1 - T2;
          Te = Ip[WS(rs, 1)];
          Tf = Im[0];
          Tl = Te + Tf;
          T4 = Rp[WS(rs, 1)];
          T5 = Rm[0];
          T6 = T4 + T5;
          Tp = T4 - T5;

          /* Second-level combinations shared by the stores below. */
          T8 = T3 - T6;
          Td = Tb - Tc;
          Tx = Tq - Tp;
          Tu = Tk + Tl;
          Tm = Tk - Tl;
          Tg = Te - Tf;
          Tr = Tp + Tq;

          /* Offset-0 outputs of Rp/Rm need no W entry. */
          Rp[0] = T3 + T6;
          Rm[0] = Td + Tg;

          /* Ip[0]/Im[0] use W[0], W[1]. */
          Tj = W[0];
          Tn = Tj * Tm;
          Ts = Tj * Tr;
          To = W[1];
          Ip[0] = FNMS(To, Tr, Tn);
          Im[0] = FMA(To, Tm, Ts);

          /* Ip/Im at offset 1 use W[4], W[5]. */
          Tt = W[4];
          Tv = Tt * Tu;
          Ty = Tt * Tx;
          Tw = W[5];
          Ip[WS(rs, 1)] = FNMS(Tw, Tx, Tv);
          Im[WS(rs, 1)] = FMA(Tw, Tu, Ty);

          /* Rp/Rm at offset 1 use W[2], W[3]. */
          Th = Td - Tg;
          Ta = W[3];
          Ti = Ta * T8;
          T7 = W[2];
          T9 = T7 * T8;
          Rp[WS(rs, 1)] = FNMS(Ta, Th, T9);
          Rm[WS(rs, 1)] = FMA(T7, Th, Ti);
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 4, "hc2cb_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
|
||||
|
||||
/* Registers the hc2cb_4 kernel with planner p by handing the kernel
 * function, its descriptor and the HC2C_VIA_RDFT constant to khc2c_register. */
void X(codelet_hc2cb_4) (planner *p)
{
     X(khc2c_register) (p, hc2cb_4, &desc, HC2C_VIA_RDFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cb_4 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_4, the variant generated without -fma.
 *
 * Each pass (m = mb .. me-1) loads offsets 0 and WS(rs, 1) of Rp, Ip, Rm and
 * Im, forms their sums and differences, and writes eight results back into
 * the same arrays.  Rp[0] and Rm[0] receive plain sums; the other three
 * output pairs are each combined with two consecutive entries of W
 * (W[2..3], W[0..1], W[4..5] in store order).  All loads precede the first
 * store.  Between passes Rp and Ip advance by ms, Rm and Im retreat by ms,
 * and W advances by 6.
 *
 * The T* temporaries keep the names emitted by the generator; statements
 * appear in the generator's order.
 *
 * NOTE(review): FMA and FNMS are macros defined outside this chunk;
 * presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void hc2cb_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Offset W by (mb - 1) groups of 6 entries before the first pass. */
     W += (mb - 1) * 6;
     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
          E T3, Ti, Tc, Tn, T6, Tm, Tf, Tj;
          E T1, T2, Ta, Tb;
          E T4, T5, Td, Te;
          E T8, Tg, T7, T9;
          E Tk, To, Th, Tl;
          E Tq, Ts, Tp, Tr;

          /* Loads pairing offset 0 of Rp/Ip with offset 1 of Rm/Im. */
          T1 = Rp[0];
          T2 = Rm[WS(rs, 1)];
          T3 = T1 + T2;
          Ti = T1 - T2;
          Ta = Ip[0];
          Tb = Im[WS(rs, 1)];
          Tc = Ta - Tb;
          Tn = Ta + Tb;

          /* Loads pairing offset 1 of Rp/Ip with offset 0 of Rm/Im. */
          T4 = Rp[WS(rs, 1)];
          T5 = Rm[0];
          T6 = T4 + T5;
          Tm = T4 - T5;
          Td = Ip[WS(rs, 1)];
          Te = Im[0];
          Tf = Td - Te;
          Tj = Td + Te;

          /* Offset-0 outputs of Rp/Rm need no W entry. */
          Rp[0] = T3 + T6;
          Rm[0] = Tc + Tf;

          /* Rp/Rm at offset 1 use W[2], W[3]. */
          T8 = T3 - T6;
          Tg = Tc - Tf;
          T7 = W[2];
          T9 = W[3];
          Rp[WS(rs, 1)] = FNMS(T9, Tg, T7 * T8);
          Rm[WS(rs, 1)] = FMA(T9, T8, T7 * Tg);

          /* Ip[0]/Im[0] use W[0], W[1]. */
          Tk = Ti - Tj;
          To = Tm + Tn;
          Th = W[0];
          Tl = W[1];
          Ip[0] = FNMS(Tl, To, Th * Tk);
          Im[0] = FMA(Th, To, Tl * Tk);

          /* Ip/Im at offset 1 use W[4], W[5]. */
          Tq = Ti + Tj;
          Ts = Tn - Tm;
          Tp = W[4];
          Tr = W[5];
          Ip[WS(rs, 1)] = FNMS(Tr, Ts, Tp * Tq);
          Im[WS(rs, 1)] = FMA(Tp, Ts, Tr * Tq);
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 4, "hc2cb_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
|
||||
|
||||
/* Registers the hc2cb_4 kernel with planner p by handing the kernel
 * function, its descriptor and the HC2C_VIA_RDFT constant to khc2c_register. */
void X(codelet_hc2cb_4) (planner *p)
{
     X(khc2c_register) (p, hc2cb_4, &desc, HC2C_VIA_RDFT);
}
|
||||
#endif
|
||||
292
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_6.c
Normal file
292
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_6.c
Normal file
@@ -0,0 +1,292 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cb_6 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 32 FP multiplications,
|
||||
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
|
||||
* 31 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_6, the variant generated with -fma.
 *
 * Each pass (m = mb .. me-1) loads offsets 0, WS(rs, 1) and WS(rs, 2) of Rp,
 * Ip, Rm and Im, combines them using the constants KP500000000 and
 * KP866025403, and writes twelve results back into the same arrays.  Rp[0]
 * and Rm[0] receive plain sums; the other five output pairs are each
 * combined with two consecutive entries of W (W[0] .. W[9]).  All loads
 * precede the first store.  Between passes Rp and Ip advance by ms, Rm and
 * Im retreat by ms, and W advances by 10.
 *
 * The T* temporaries keep the names emitted by the generator; statements
 * appear in the generator's order.
 *
 * NOTE(review): FMA/FNMS/WS/DK/MAKE_VOLATILE_STRIDE come from headers outside
 * this chunk; presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b
 * -- confirm.
 */
static void hc2cb_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Offset W by (mb - 1) groups of 10 entries before the first pass. */
          W += (mb - 1) * 10;
          for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 10, MAKE_VOLATILE_STRIDE(24, rs)) {
               /* Values handed from the load stages to the store stages. */
               E Td, Tn, TO, TJ, TN, Tk, Tr, T3, TC, Ts, TQ, Ta, Tm, TF, TG;

               /* Load stage for Ip/Im. */
               {
                    E Tb, Tc, Tj, TI, Tg, TH;
                    E Th, Ti, Te, Tf;

                    Tb = Ip[0];
                    Tc = Im[WS(rs, 2)];
                    Td = Tb - Tc;

                    Th = Ip[WS(rs, 1)];
                    Ti = Im[WS(rs, 1)];
                    Tj = Th - Ti;
                    TI = Th + Ti;
                    Te = Ip[WS(rs, 2)];
                    Tf = Im[0];
                    Tg = Te - Tf;
                    TH = Te + Tf;

                    Tn = Tj - Tg;
                    TO = TH - TI;
                    TJ = TH + TI;
                    TN = Tb + Tc;
                    Tk = Tg + Tj;
                    Tr = FNMS(KP500000000, Tk, Td);
               }

               /* Load stage for Rp/Rm. */
               {
                    E T9, TE, T6, TD, T1, T2;
                    E T7, T8, T4, T5;

                    T1 = Rp[0];
                    T2 = Rm[WS(rs, 2)];
                    T3 = T1 + T2;
                    TC = T1 - T2;

                    T7 = Rm[WS(rs, 1)];
                    T8 = Rp[WS(rs, 1)];
                    T9 = T7 + T8;
                    TE = T7 - T8;
                    T4 = Rp[WS(rs, 2)];
                    T5 = Rm[0];
                    T6 = T4 + T5;
                    TD = T4 - T5;

                    Ts = T6 - T9;
                    TQ = TD - TE;
                    Ta = T6 + T9;
                    Tm = FNMS(KP500000000, Ta, T3);
                    TF = TD + TE;
                    TG = FNMS(KP500000000, TF, TC);
               }

               /* Offset-0 outputs of Rp/Rm need no W entry. */
               Rp[0] = T3 + Ta;
               Rm[0] = Td + Tk;

               /* Rp/Rm at offset 1 use W[2], W[3]. */
               {
                    E To, Tt, Tp, Tu, Tl, Tq;

                    To = FNMS(KP866025403, Tn, Tm);
                    Tt = FNMS(KP866025403, Ts, Tr);
                    Tl = W[2];
                    Tp = Tl * To;
                    Tu = Tl * Tt;
                    Tq = W[3];
                    Rp[WS(rs, 1)] = FNMS(Tq, Tt, Tp);
                    Rm[WS(rs, 1)] = FMA(Tq, To, Tu);
               }

               /* Ip/Im at offset 1 use W[4], W[5]. */
               {
                    E T13, TZ, T11, T12, T14, T10;

                    T13 = TN + TO;
                    T10 = TC + TF;
                    TZ = W[4];
                    T11 = TZ * T10;
                    T12 = W[5];
                    T14 = T12 * T10;
                    Ip[WS(rs, 1)] = FNMS(T12, T13, T11);
                    Im[WS(rs, 1)] = FMA(TZ, T13, T14);
               }

               /* Rp/Rm at offset 2 use W[6], W[7]. */
               {
                    E Tw, Tz, Tx, TA, Tv, Ty;

                    Tw = FMA(KP866025403, Tn, Tm);
                    Tz = FMA(KP866025403, Ts, Tr);
                    Tv = W[6];
                    Tx = Tv * Tw;
                    TA = Tv * Tz;
                    Ty = W[7];
                    Rp[WS(rs, 2)] = FNMS(Ty, Tz, Tx);
                    Rm[WS(rs, 2)] = FMA(Ty, Tw, TA);
               }

               /* Ip[0]/Im[0] use W[0], W[1]; Ip/Im at offset 2 use W[8], W[9]. */
               {
                    E TR, TX, TT, TV, TW, TY, TB, TL, TM, TS, TP, TU, TK;

                    TP = FNMS(KP500000000, TO, TN);
                    TR = FMA(KP866025403, TQ, TP);
                    TX = FNMS(KP866025403, TQ, TP);
                    TU = FMA(KP866025403, TJ, TG);
                    TT = W[8];
                    TV = TT * TU;
                    TW = W[9];
                    TY = TW * TU;
                    TK = FNMS(KP866025403, TJ, TG);
                    TB = W[0];
                    TL = TB * TK;
                    TM = W[1];
                    TS = TM * TK;
                    Ip[0] = FNMS(TM, TR, TL);
                    Im[0] = FMA(TB, TR, TS);
                    Ip[WS(rs, 2)] = FNMS(TW, TX, TV);
                    Im[WS(rs, 2)] = FMA(TT, TX, TY);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 6, "hc2cb_6", twinstr, &GENUS, { 24, 10, 22, 0 } };
|
||||
|
||||
/* Registers the hc2cb_6 kernel with planner p by handing the kernel
 * function, its descriptor and the HC2C_VIA_RDFT constant to khc2c_register. */
void X(codelet_hc2cb_6) (planner *p)
{
     X(khc2c_register) (p, hc2cb_6, &desc, HC2C_VIA_RDFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cb_6 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 28 FP multiplications,
|
||||
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 25 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_6, the variant generated without -fma.
 *
 * Each pass (m = mb .. me-1) loads offsets 0, WS(rs, 1) and WS(rs, 2) of Rp,
 * Ip, Rm and Im, combines them using the constants KP500000000 and
 * KP866025403, and writes twelve results back into the same arrays.  Rp[0]
 * and Rm[0] receive plain sums; the other five output pairs are each
 * combined with two consecutive entries of W (W[0] .. W[9]).  All loads
 * precede the first store.  Between passes Rp and Ip advance by ms, Rm and
 * Im retreat by ms, and W advances by 10.
 *
 * The T* temporaries keep the names emitted by the generator; statements
 * appear in the generator's order.
 *
 * NOTE(review): FMA/FNMS/WS/DK/MAKE_VOLATILE_STRIDE come from headers outside
 * this chunk; presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b
 * -- confirm.
 */
static void hc2cb_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT m;

          /* Offset W by (mb - 1) groups of 10 entries before the first pass. */
          W += (mb - 1) * 10;
          for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 10, MAKE_VOLATILE_STRIDE(24, rs)) {
               /* Values handed from the load stages to the store stages. */
               E T3, Ty, Td, TE, Ta, TO, Tr, TB, Tk, TL, Tn, TH;

               /* Load stage: offset 0 of Rp/Ip with offset 2 of Rm/Im. */
               {
                    E T1, T2, Tb, Tc;

                    T1 = Rp[0];
                    T2 = Rm[WS(rs, 2)];
                    T3 = T1 + T2;
                    Ty = T1 - T2;
                    Tb = Ip[0];
                    Tc = Im[WS(rs, 2)];
                    Td = Tb - Tc;
                    TE = Tb + Tc;
               }

               /* Load stage: remaining Rp/Rm entries. */
               {
                    E T6, Tz, T9, TA;
                    E T4, T5, T7, T8;

                    T4 = Rp[WS(rs, 2)];
                    T5 = Rm[0];
                    T6 = T4 + T5;
                    Tz = T4 - T5;
                    T7 = Rm[WS(rs, 1)];
                    T8 = Rp[WS(rs, 1)];
                    T9 = T7 + T8;
                    TA = T7 - T8;

                    Ta = T6 + T9;
                    TO = KP866025403 * (Tz - TA);
                    Tr = KP866025403 * (T6 - T9);
                    TB = Tz + TA;
               }

               /* Load stage: remaining Ip/Im entries. */
               {
                    E Tg, TG, Tj, TF;
                    E Te, Tf, Th, Ti;

                    Te = Ip[WS(rs, 2)];
                    Tf = Im[0];
                    Tg = Te - Tf;
                    TG = Te + Tf;
                    Th = Ip[WS(rs, 1)];
                    Ti = Im[WS(rs, 1)];
                    Tj = Th - Ti;
                    TF = Th + Ti;

                    Tk = Tg + Tj;
                    TL = KP866025403 * (TG + TF);
                    Tn = KP866025403 * (Tj - Tg);
                    TH = TF - TG;
               }

               /* Offset-0 outputs of Rp/Rm need no W entry. */
               Rp[0] = T3 + Ta;
               Rm[0] = Td + Tk;

               /* Ip/Im at offset 1 use W[4], W[5]. */
               {
                    E TC, TI, Tx, TD;

                    TC = Ty + TB;
                    TI = TE - TH;
                    Tx = W[4];
                    TD = W[5];
                    Ip[WS(rs, 1)] = FNMS(TD, TI, Tx * TC);
                    Im[WS(rs, 1)] = FMA(TD, TC, Tx * TI);
               }

               /* Rp/Rm at offset 1 use W[2], W[3]; at offset 2, W[6], W[7]. */
               {
                    E To, Tu, Ts, Tw, Tm, Tq;
                    E Tl, Tp, Tt, Tv;

                    Tm = FNMS(KP500000000, Ta, T3);
                    To = Tm - Tn;
                    Tu = Tm + Tn;
                    Tq = FNMS(KP500000000, Tk, Td);
                    Ts = Tq - Tr;
                    Tw = Tr + Tq;

                    Tl = W[2];
                    Tp = W[3];
                    Rp[WS(rs, 1)] = FNMS(Tp, Ts, Tl * To);
                    Rm[WS(rs, 1)] = FMA(Tl, Ts, Tp * To);
                    Tt = W[6];
                    Tv = W[7];
                    Rp[WS(rs, 2)] = FNMS(Tv, Tw, Tt * Tu);
                    Rm[WS(rs, 2)] = FMA(Tt, Tw, Tv * Tu);
               }

               /* Ip[0]/Im[0] use W[0], W[1]; Ip/Im at offset 2 use W[8], W[9]. */
               {
                    E TM, TS, TQ, TU, TK, TP;
                    E TJ, TN, TR, TT;

                    TK = FNMS(KP500000000, TB, Ty);
                    TM = TK - TL;
                    TS = TK + TL;
                    TP = FMA(KP500000000, TH, TE);
                    TQ = TO + TP;
                    TU = TP - TO;

                    TJ = W[0];
                    TN = W[1];
                    Ip[0] = FNMS(TN, TQ, TJ * TM);
                    Im[0] = FMA(TN, TM, TJ * TQ);
                    TR = W[8];
                    TT = W[9];
                    Ip[WS(rs, 2)] = FNMS(TT, TU, TR * TS);
                    Im[WS(rs, 2)] = FMA(TT, TS, TR * TU);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 6, "hc2cb_6", twinstr, &GENUS, { 32, 14, 14, 0 } };
|
||||
|
||||
/* Registers the hc2cb_6 kernel with planner p by handing the kernel
 * function, its descriptor and the HC2C_VIA_RDFT constant to khc2c_register. */
void X(codelet_hc2cb_6) (planner *p)
{
     X(khc2c_register) (p, hc2cb_6, &desc, HC2C_VIA_RDFT);
}
|
||||
#endif
|
||||
373
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_8.c
Normal file
373
fftw-3.3.10/rdft/scalar/r2cb/hc2cb_8.c
Normal file
@@ -0,0 +1,373 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 36 FP multiplications,
|
||||
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
|
||||
* 33 stack variables, 1 constant, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_8, FMA-preferring variant (machine generated; see the generator
 * command line in the comment above: -n 8 -dif -sign 1).
 *
 * For every m in [mb, me) the kernel loads the 16 values
 * Rp[WS(rs,k)], Ip[WS(rs,k)], Rm[WS(rs,k)], Im[WS(rs,k)] (k = 0..3),
 * combines them with add/subtract butterflies and the constant
 * KP707106781, and stores 16 results back into the same locations.
 * All loads are done before the first store of an iteration.
 *
 * Twiddle usage per iteration (14 entries of W):
 *   - Rp[0] and Rm[0] are stored as plain sums, without any W factor;
 *   - Rp[k]/Rm[k], k = 1..3, use the pair W[4k-2], W[4k-1];
 *   - Ip[k]/Im[k], k = 0..3, use the pair W[4k], W[4k+1].
 * NOTE(review): the pairwise pattern reads as a complex multiply assuming
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm against the
 * macro definitions, which are not visible here.
 *
 * After each iteration Rp and Ip advance by +ms, Rm and Im by -ms, and
 * W by 14; W is pre-offset by (mb - 1) * 14 before the first iteration.
 */
static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the pointer and twiddle stepping lives in the loop header. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T7, T1i, T1n, Tk, TD, TV, T1b, TQ, Te, T1e, T1o, T1j, TE, TF, TR;
|
||||
E Tv, TW;
|
||||
{
|
||||
E T3, Tg, TC, T19, T6, Tz, Tj, T1a;
|
||||
{
|
||||
E T1, T2, TA, TB;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 3)];
|
||||
T3 = T1 + T2;
|
||||
Tg = T1 - T2;
|
||||
TA = Ip[0];
|
||||
TB = Im[WS(rs, 3)];
|
||||
TC = TA + TB;
|
||||
T19 = TA - TB;
|
||||
}
|
||||
{
|
||||
E T4, T5, Th, Ti;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 1)];
|
||||
T6 = T4 + T5;
|
||||
Tz = T4 - T5;
|
||||
Th = Ip[WS(rs, 2)];
|
||||
Ti = Im[WS(rs, 1)];
|
||||
Tj = Th + Ti;
|
||||
T1a = Th - Ti;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1i = T3 - T6;
|
||||
T1n = T19 - T1a;
|
||||
Tk = Tg - Tj;
|
||||
TD = Tz + TC;
|
||||
TV = TC - Tz;
|
||||
T1b = T19 + T1a;
|
||||
TQ = Tg + Tj;
|
||||
}
|
||||
{
|
||||
E Ta, Tl, To, T1c, Td, Tq, Tt, T1d, Tp, Tu;
|
||||
{
|
||||
E T8, T9, Tm, Tn;
|
||||
T8 = Rp[WS(rs, 1)];
|
||||
T9 = Rm[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
Tl = T8 - T9;
|
||||
Tm = Ip[WS(rs, 1)];
|
||||
Tn = Im[WS(rs, 2)];
|
||||
To = Tm + Tn;
|
||||
T1c = Tm - Tn;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Tr, Ts;
|
||||
Tb = Rm[0];
|
||||
Tc = Rp[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
Tq = Tb - Tc;
|
||||
Tr = Ip[WS(rs, 3)];
|
||||
Ts = Im[0];
|
||||
Tt = Tr + Ts;
|
||||
T1d = Tr - Ts;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T1e = T1c + T1d;
|
||||
T1o = Ta - Td;
|
||||
T1j = T1d - T1c;
|
||||
TE = Tl + To;
|
||||
TF = Tq + Tt;
|
||||
TR = TE + TF;
|
||||
Tp = Tl - To;
|
||||
Tu = Tq - Tt;
|
||||
Tv = Tp + Tu;
|
||||
TW = Tp - Tu;
|
||||
}
|
||||
/* The two outputs that carry no twiddle factor. */
Rp[0] = T7 + Te;
|
||||
Rm[0] = T1b + T1e;
|
||||
{
|
||||
E TS, TX, TT, TY, TP, TU;
|
||||
TS = FNMS(KP707106781, TR, TQ);
|
||||
TX = FMA(KP707106781, TW, TV);
|
||||
TP = W[4];
|
||||
TT = TP * TS;
|
||||
TY = TP * TX;
|
||||
TU = W[5];
|
||||
Ip[WS(rs, 1)] = FNMS(TU, TX, TT);
|
||||
Im[WS(rs, 1)] = FMA(TU, TS, TY);
|
||||
}
|
||||
{
|
||||
E T1s, T1v, T1t, T1w, T1r, T1u;
|
||||
T1s = T1i + T1j;
|
||||
T1v = T1o + T1n;
|
||||
T1r = W[2];
|
||||
T1t = T1r * T1s;
|
||||
T1w = T1r * T1v;
|
||||
T1u = W[3];
|
||||
Rp[WS(rs, 1)] = FNMS(T1u, T1v, T1t);
|
||||
Rm[WS(rs, 1)] = FMA(T1u, T1s, T1w);
|
||||
}
|
||||
{
|
||||
E T10, T13, T11, T14, TZ, T12;
|
||||
T10 = FMA(KP707106781, TR, TQ);
|
||||
T13 = FNMS(KP707106781, TW, TV);
|
||||
TZ = W[12];
|
||||
T11 = TZ * T10;
|
||||
T14 = TZ * T13;
|
||||
T12 = W[13];
|
||||
Ip[WS(rs, 3)] = FNMS(T12, T13, T11);
|
||||
Im[WS(rs, 3)] = FMA(T12, T10, T14);
|
||||
}
|
||||
{
|
||||
E T1f, T15, T17, T18, T1g, T16;
|
||||
T1f = T1b - T1e;
|
||||
T16 = T7 - Te;
|
||||
T15 = W[6];
|
||||
T17 = T15 * T16;
|
||||
T18 = W[7];
|
||||
T1g = T18 * T16;
|
||||
Rp[WS(rs, 2)] = FNMS(T18, T1f, T17);
|
||||
Rm[WS(rs, 2)] = FMA(T15, T1f, T1g);
|
||||
}
|
||||
{
|
||||
E T1k, T1p, T1l, T1q, T1h, T1m;
|
||||
T1k = T1i - T1j;
|
||||
T1p = T1n - T1o;
|
||||
T1h = W[10];
|
||||
T1l = T1h * T1k;
|
||||
T1q = T1h * T1p;
|
||||
T1m = W[11];
|
||||
Rp[WS(rs, 3)] = FNMS(T1m, T1p, T1l);
|
||||
Rm[WS(rs, 3)] = FMA(T1m, T1k, T1q);
|
||||
}
|
||||
{
|
||||
E TH, TN, TJ, TL, TM, TO, Tf, Tx, Ty, TI, TG, TK, Tw;
|
||||
TG = TE - TF;
|
||||
TH = FNMS(KP707106781, TG, TD);
|
||||
TN = FMA(KP707106781, TG, TD);
|
||||
TK = FMA(KP707106781, Tv, Tk);
|
||||
TJ = W[0];
|
||||
TL = TJ * TK;
|
||||
TM = W[1];
|
||||
TO = TM * TK;
|
||||
Tw = FNMS(KP707106781, Tv, Tk);
|
||||
Tf = W[8];
|
||||
Tx = Tf * Tw;
|
||||
Ty = W[9];
|
||||
TI = Ty * Tw;
|
||||
Ip[WS(rs, 2)] = FNMS(Ty, TH, Tx);
|
||||
Im[WS(rs, 2)] = FMA(Tf, TH, TI);
|
||||
Ip[0] = FNMS(TM, TN, TL);
|
||||
Im[0] = FMA(TJ, TN, TO);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, { 44, 14, 22, 0 } };
|
||||
|
||||
/*
 * Registration hook: hands the hc2cb_8 kernel and its descriptor to the
 * planner `p` through khc2c_register, tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cb_8) (planner *p)
{
     X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 32 FP multiplications,
|
||||
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 30 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cb_8, non-FMA variant (machine generated; see the generator command
 * line in the comment above: -n 8 -dif -sign 1, without -fma).
 *
 * For every m in [mb, me) the kernel loads the 16 values
 * Rp[WS(rs,k)], Ip[WS(rs,k)], Rm[WS(rs,k)], Im[WS(rs,k)] (k = 0..3),
 * combines them with add/subtract butterflies and the constant
 * KP707106781, and stores 16 results back into the same locations.
 * All loads are done before the first store of an iteration.
 *
 * Twiddle usage per iteration (14 entries of W):
 *   - Rp[0] and Rm[0] are stored as plain sums, without any W factor;
 *   - Rp[k]/Rm[k], k = 1..3, use the pair W[4k-2], W[4k-1];
 *   - Ip[k]/Im[k], k = 0..3, use the pair W[4k], W[4k+1].
 * NOTE(review): the pairwise pattern reads as a complex multiply assuming
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm against the
 * macro definitions, which are not visible here.
 *
 * After each iteration Rp and Ip advance by +ms, Rm and Im by -ms, and
 * W by 14; W is pre-offset by (mb - 1) * 14 before the first iteration.
 */
static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the pointer and twiddle stepping lives in the loop header. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
|
||||
E TD;
|
||||
{
|
||||
E T3, TK, Tk, TX, T6, TW, Tn, TL;
|
||||
{
|
||||
E T1, T2, Ti, Tj;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 3)];
|
||||
T3 = T1 + T2;
|
||||
TK = T1 - T2;
|
||||
Ti = Ip[0];
|
||||
Tj = Im[WS(rs, 3)];
|
||||
Tk = Ti - Tj;
|
||||
TX = Ti + Tj;
|
||||
}
|
||||
{
|
||||
E T4, T5, Tl, Tm;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 1)];
|
||||
T6 = T4 + T5;
|
||||
TW = T4 - T5;
|
||||
Tl = Ip[WS(rs, 2)];
|
||||
Tm = Im[WS(rs, 1)];
|
||||
Tn = Tl - Tm;
|
||||
TL = Tl + Tm;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T18 = TK + TL;
|
||||
T1c = TX - TW;
|
||||
To = Tk + Tn;
|
||||
Ty = T3 - T6;
|
||||
TM = TK - TL;
|
||||
TY = TW + TX;
|
||||
TC = Tk - Tn;
|
||||
}
|
||||
{
|
||||
E Ta, TN, Tr, TO, Td, TQ, Tu, TR;
|
||||
{
|
||||
E T8, T9, Tp, Tq;
|
||||
T8 = Rp[WS(rs, 1)];
|
||||
T9 = Rm[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
TN = T8 - T9;
|
||||
Tp = Ip[WS(rs, 1)];
|
||||
Tq = Im[WS(rs, 2)];
|
||||
Tr = Tp - Tq;
|
||||
TO = Tp + Tq;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Ts, Tt;
|
||||
Tb = Rm[0];
|
||||
Tc = Rp[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
TQ = Tb - Tc;
|
||||
Ts = Ip[WS(rs, 3)];
|
||||
Tt = Im[0];
|
||||
Tu = Ts - Tt;
|
||||
TR = Ts + Tt;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
TZ = TN + TO;
|
||||
T10 = TQ + TR;
|
||||
Tv = Tr + Tu;
|
||||
Tz = Tu - Tr;
|
||||
TP = TN - TO;
|
||||
TS = TQ - TR;
|
||||
TD = Ta - Td;
|
||||
}
|
||||
/* The two outputs that carry no twiddle factor. */
Rp[0] = T7 + Te;
|
||||
Rm[0] = To + Tv;
|
||||
{
|
||||
E Tg, Tw, Tf, Th;
|
||||
Tg = T7 - Te;
|
||||
Tw = To - Tv;
|
||||
Tf = W[6];
|
||||
Th = W[7];
|
||||
Rp[WS(rs, 2)] = FNMS(Th, Tw, Tf * Tg);
|
||||
Rm[WS(rs, 2)] = FMA(Th, Tg, Tf * Tw);
|
||||
}
|
||||
{
|
||||
E TG, TI, TF, TH;
|
||||
TG = Ty + Tz;
|
||||
TI = TD + TC;
|
||||
TF = W[2];
|
||||
TH = W[3];
|
||||
Rp[WS(rs, 1)] = FNMS(TH, TI, TF * TG);
|
||||
Rm[WS(rs, 1)] = FMA(TF, TI, TH * TG);
|
||||
}
|
||||
{
|
||||
E TA, TE, Tx, TB;
|
||||
TA = Ty - Tz;
|
||||
TE = TC - TD;
|
||||
Tx = W[10];
|
||||
TB = W[11];
|
||||
Rp[WS(rs, 3)] = FNMS(TB, TE, Tx * TA);
|
||||
Rm[WS(rs, 3)] = FMA(Tx, TE, TB * TA);
|
||||
}
|
||||
{
|
||||
E T1a, T1g, T1e, T1i, T19, T1d;
|
||||
T19 = KP707106781 * (TZ + T10);
|
||||
T1a = T18 - T19;
|
||||
T1g = T18 + T19;
|
||||
T1d = KP707106781 * (TP - TS);
|
||||
T1e = T1c + T1d;
|
||||
T1i = T1c - T1d;
|
||||
{
|
||||
E T17, T1b, T1f, T1h;
|
||||
T17 = W[4];
|
||||
T1b = W[5];
|
||||
Ip[WS(rs, 1)] = FNMS(T1b, T1e, T17 * T1a);
|
||||
Im[WS(rs, 1)] = FMA(T17, T1e, T1b * T1a);
|
||||
T1f = W[12];
|
||||
T1h = W[13];
|
||||
Ip[WS(rs, 3)] = FNMS(T1h, T1i, T1f * T1g);
|
||||
Im[WS(rs, 3)] = FMA(T1f, T1i, T1h * T1g);
|
||||
}
|
||||
}
|
||||
{
|
||||
E TU, T14, T12, T16, TT, T11;
|
||||
TT = KP707106781 * (TP + TS);
|
||||
TU = TM - TT;
|
||||
T14 = TM + TT;
|
||||
T11 = KP707106781 * (TZ - T10);
|
||||
T12 = TY - T11;
|
||||
T16 = TY + T11;
|
||||
{
|
||||
E TJ, TV, T13, T15;
|
||||
TJ = W[8];
|
||||
TV = W[9];
|
||||
Ip[WS(rs, 2)] = FNMS(TV, T12, TJ * TU);
|
||||
Im[WS(rs, 2)] = FMA(TV, TU, TJ * T12);
|
||||
T13 = W[0];
|
||||
T15 = W[1];
|
||||
Ip[0] = FNMS(T15, T16, T13 * T14);
|
||||
Im[0] = FMA(T15, T14, T13 * T16);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, { 52, 18, 14, 0 } };
|
||||
|
||||
/*
 * Registration hook: hands the hc2cb_8 kernel and its descriptor to the
 * planner `p` through khc2c_register, tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cb_8) (planner *p)
{
     X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
}
|
||||
#endif
|
||||
892
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_16.c
Normal file
892
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_16.c
Normal file
@@ -0,0 +1,892 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:14 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft2_16 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 206 FP additions, 100 FP multiplications,
|
||||
* (or, 136 additions, 30 multiplications, 70 fused multiply/add),
|
||||
* 66 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft2_16, FMA-preferring variant (machine generated; see the
 * generator command line in the comment above: -n 16 -dif -sign 1).
 *
 * For every m in [mb, me) the kernel loads the 32 values
 * Rp[WS(rs,k)], Ip[WS(rs,k)], Rm[WS(rs,k)], Im[WS(rs,k)] (k = 0..7),
 * combines them using the constants KP707106781, KP414213562 and
 * KP923879532, and stores 32 results back into the same locations.
 * All loads are done before the first store of an iteration.
 *
 * Every output group k is written with the same shape:
 *   Rp[k] = A - B;  Ip[k] = C + D;  Rm[k] = A + B;  Im[k] = D - C;
 * For k = 0, A and C are plain sums and B, D are built from W[0], W[1].
 * For k = 1..7, A, C are built from W[4k-2], W[4k-1] and B, D from
 * W[4k], W[4k+1], so one iteration consumes 30 entries of W.
 * NOTE(review): reading these as complex multiplies assumes
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm against the
 * macro definitions, which are not visible here.
 *
 * After each iteration Rp and Ip advance by +ms, Rm and Im by -ms, and
 * W by 30; W is pre-offset by (mb - 1) * 30 before the first iteration.
 */
static void hc2cbdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the pointer and twiddle stepping lives in the loop header. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E Tf, T20, T32, T3Q, T3f, T3V, TN, T2a, T1m, T2f, T2G, T3G, T2T, T3L, T1F;
|
||||
E T26, T2J, T2M, T2N, T2U, T2V, T3H, Tu, T25, T3i, T3R, T1a, T2g, T1y, T21;
|
||||
E T39, T3W, T1p, T2b;
|
||||
{
|
||||
E T3, T1e, TA, T1C, T6, Tx, T1h, T1D, Td, T1A, TL, T1k, Ta, T1z, TG;
|
||||
E T1j;
|
||||
{
|
||||
E T1, T2, T1f, T1g;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
T1e = T1 - T2;
|
||||
{
|
||||
E Ty, Tz, T4, T5;
|
||||
Ty = Ip[0];
|
||||
Tz = Im[WS(rs, 7)];
|
||||
TA = Ty + Tz;
|
||||
T1C = Ty - Tz;
|
||||
T4 = Rp[WS(rs, 4)];
|
||||
T5 = Rm[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
Tx = T4 - T5;
|
||||
}
|
||||
T1f = Ip[WS(rs, 4)];
|
||||
T1g = Im[WS(rs, 3)];
|
||||
T1h = T1f + T1g;
|
||||
T1D = T1f - T1g;
|
||||
{
|
||||
E Tb, Tc, TH, TI, TJ, TK;
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
Tc = Rp[WS(rs, 6)];
|
||||
TH = Tb - Tc;
|
||||
TI = Im[WS(rs, 1)];
|
||||
TJ = Ip[WS(rs, 6)];
|
||||
TK = TI + TJ;
|
||||
Td = Tb + Tc;
|
||||
T1A = TJ - TI;
|
||||
TL = TH + TK;
|
||||
T1k = TH - TK;
|
||||
}
|
||||
{
|
||||
E T8, T9, TC, TD, TE, TF;
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = Rm[WS(rs, 5)];
|
||||
TC = T8 - T9;
|
||||
TD = Ip[WS(rs, 2)];
|
||||
TE = Im[WS(rs, 5)];
|
||||
TF = TD + TE;
|
||||
Ta = T8 + T9;
|
||||
T1z = TD - TE;
|
||||
TG = TC + TF;
|
||||
T1j = TC - TF;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T7, Te, T30, T31;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 + Te;
|
||||
T20 = T7 - Te;
|
||||
T30 = TA - Tx;
|
||||
T31 = T1j - T1k;
|
||||
T32 = FMA(KP707106781, T31, T30);
|
||||
T3Q = FNMS(KP707106781, T31, T30);
|
||||
}
|
||||
{
|
||||
E T3d, T3e, TB, TM;
|
||||
T3d = T1e + T1h;
|
||||
T3e = TG + TL;
|
||||
T3f = FNMS(KP707106781, T3e, T3d);
|
||||
T3V = FMA(KP707106781, T3e, T3d);
|
||||
TB = Tx + TA;
|
||||
TM = TG - TL;
|
||||
TN = FMA(KP707106781, TM, TB);
|
||||
T2a = FNMS(KP707106781, TM, TB);
|
||||
}
|
||||
{
|
||||
E T1i, T1l, T2E, T2F;
|
||||
T1i = T1e - T1h;
|
||||
T1l = T1j + T1k;
|
||||
T1m = FMA(KP707106781, T1l, T1i);
|
||||
T2f = FNMS(KP707106781, T1l, T1i);
|
||||
T2E = T3 - T6;
|
||||
T2F = T1A - T1z;
|
||||
T2G = T2E + T2F;
|
||||
T3G = T2E - T2F;
|
||||
}
|
||||
{
|
||||
E T2R, T2S, T1B, T1E;
|
||||
T2R = Ta - Td;
|
||||
T2S = T1C - T1D;
|
||||
T2T = T2R + T2S;
|
||||
T3L = T2S - T2R;
|
||||
T1B = T1z + T1A;
|
||||
T1E = T1C + T1D;
|
||||
T1F = T1B + T1E;
|
||||
T26 = T1E - T1B;
|
||||
}
|
||||
}
|
||||
{
|
||||
E Ti, T1s, Tl, T1t, TS, TX, T34, T33, T2I, T2H, Tp, T1v, Ts, T1w, T13;
|
||||
E T18, T37, T36, T2L, T2K;
|
||||
{
|
||||
E TT, TR, TO, TW;
|
||||
{
|
||||
E Tg, Th, TP, TQ;
|
||||
Tg = Rp[WS(rs, 1)];
|
||||
Th = Rm[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
TT = Tg - Th;
|
||||
TP = Ip[WS(rs, 1)];
|
||||
TQ = Im[WS(rs, 6)];
|
||||
TR = TP + TQ;
|
||||
T1s = TP - TQ;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, TU, TV;
|
||||
Tj = Rp[WS(rs, 5)];
|
||||
Tk = Rm[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
TO = Tj - Tk;
|
||||
TU = Ip[WS(rs, 5)];
|
||||
TV = Im[WS(rs, 2)];
|
||||
TW = TU + TV;
|
||||
T1t = TU - TV;
|
||||
}
|
||||
TS = TO + TR;
|
||||
TX = TT - TW;
|
||||
T34 = TR - TO;
|
||||
T33 = TT + TW;
|
||||
T2I = T1s - T1t;
|
||||
T2H = Ti - Tl;
|
||||
}
|
||||
{
|
||||
E T14, T12, TZ, T17;
|
||||
{
|
||||
E Tn, To, T10, T11;
|
||||
Tn = Rm[0];
|
||||
To = Rp[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T14 = Tn - To;
|
||||
T10 = Im[0];
|
||||
T11 = Ip[WS(rs, 7)];
|
||||
T12 = T10 + T11;
|
||||
T1v = T11 - T10;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T15, T16;
|
||||
Tq = Rp[WS(rs, 3)];
|
||||
Tr = Rm[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
TZ = Tq - Tr;
|
||||
T15 = Ip[WS(rs, 3)];
|
||||
T16 = Im[WS(rs, 4)];
|
||||
T17 = T15 + T16;
|
||||
T1w = T15 - T16;
|
||||
}
|
||||
T13 = TZ - T12;
|
||||
T18 = T14 - T17;
|
||||
T37 = TZ + T12;
|
||||
T36 = T14 + T17;
|
||||
T2L = T1v - T1w;
|
||||
T2K = Tp - Ts;
|
||||
}
|
||||
T2J = T2H - T2I;
|
||||
T2M = T2K + T2L;
|
||||
T2N = T2J + T2M;
|
||||
T2U = T2H + T2I;
|
||||
T2V = T2L - T2K;
|
||||
T3H = T2V - T2U;
|
||||
{
|
||||
E Tm, Tt, T3g, T3h;
|
||||
Tm = Ti + Tl;
|
||||
Tt = Tp + Ts;
|
||||
Tu = Tm + Tt;
|
||||
T25 = Tm - Tt;
|
||||
T3g = FNMS(KP414213562, T33, T34);
|
||||
T3h = FNMS(KP414213562, T36, T37);
|
||||
T3i = T3g + T3h;
|
||||
T3R = T3h - T3g;
|
||||
}
|
||||
{
|
||||
E TY, T19, T1u, T1x;
|
||||
TY = FMA(KP414213562, TX, TS);
|
||||
T19 = FNMS(KP414213562, T18, T13);
|
||||
T1a = TY + T19;
|
||||
T2g = T19 - TY;
|
||||
T1u = T1s + T1t;
|
||||
T1x = T1v + T1w;
|
||||
T1y = T1u + T1x;
|
||||
T21 = T1x - T1u;
|
||||
}
|
||||
{
|
||||
E T35, T38, T1n, T1o;
|
||||
T35 = FMA(KP414213562, T34, T33);
|
||||
T38 = FMA(KP414213562, T37, T36);
|
||||
T39 = T35 - T38;
|
||||
T3W = T35 + T38;
|
||||
T1n = FNMS(KP414213562, TS, TX);
|
||||
T1o = FMA(KP414213562, T13, T18);
|
||||
T1p = T1n + T1o;
|
||||
T2b = T1n - T1o;
|
||||
}
|
||||
}
|
||||
/* Output group k = 0: only W[0] and W[1] are used. */
{
|
||||
E Tv, T1G, T1b, T1q, T1c, T1H, Tw, T1r, T1I, T1d;
|
||||
Tv = Tf + Tu;
|
||||
T1G = T1y + T1F;
|
||||
T1b = FMA(KP923879532, T1a, TN);
|
||||
T1q = FMA(KP923879532, T1p, T1m);
|
||||
Tw = W[0];
|
||||
T1c = Tw * T1b;
|
||||
T1H = Tw * T1q;
|
||||
T1d = W[1];
|
||||
T1r = FMA(T1d, T1q, T1c);
|
||||
T1I = FNMS(T1d, T1b, T1H);
|
||||
Rp[0] = Tv - T1r;
|
||||
Ip[0] = T1G + T1I;
|
||||
Rm[0] = Tv + T1r;
|
||||
Im[0] = T1I - T1G;
|
||||
}
|
||||
{
|
||||
E T1N, T1J, T1L, T1M, T1V, T1Q, T1T, T1R, T1X, T1K, T1P;
|
||||
T1N = T1F - T1y;
|
||||
T1K = Tf - Tu;
|
||||
T1J = W[14];
|
||||
T1L = T1J * T1K;
|
||||
T1M = W[15];
|
||||
T1V = T1M * T1K;
|
||||
T1Q = FNMS(KP923879532, T1a, TN);
|
||||
T1T = FNMS(KP923879532, T1p, T1m);
|
||||
T1P = W[16];
|
||||
T1R = T1P * T1Q;
|
||||
T1X = T1P * T1T;
|
||||
{
|
||||
E T1O, T1W, T1U, T1Y, T1S;
|
||||
T1O = FNMS(T1M, T1N, T1L);
|
||||
T1W = FMA(T1J, T1N, T1V);
|
||||
T1S = W[17];
|
||||
T1U = FMA(T1S, T1T, T1R);
|
||||
T1Y = FNMS(T1S, T1Q, T1X);
|
||||
Rp[WS(rs, 4)] = T1O - T1U;
|
||||
Ip[WS(rs, 4)] = T1W + T1Y;
|
||||
Rm[WS(rs, 4)] = T1O + T1U;
|
||||
Im[WS(rs, 4)] = T1Y - T1W;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2r, T2n, T2p, T2q, T2z, T2u, T2x, T2v, T2B, T2o, T2t;
|
||||
T2r = T26 - T25;
|
||||
T2o = T20 - T21;
|
||||
T2n = W[22];
|
||||
T2p = T2n * T2o;
|
||||
T2q = W[23];
|
||||
T2z = T2q * T2o;
|
||||
T2u = FNMS(KP923879532, T2b, T2a);
|
||||
T2x = FNMS(KP923879532, T2g, T2f);
|
||||
T2t = W[24];
|
||||
T2v = T2t * T2u;
|
||||
T2B = T2t * T2x;
|
||||
{
|
||||
E T2s, T2A, T2y, T2C, T2w;
|
||||
T2s = FNMS(T2q, T2r, T2p);
|
||||
T2A = FMA(T2n, T2r, T2z);
|
||||
T2w = W[25];
|
||||
T2y = FMA(T2w, T2x, T2v);
|
||||
T2C = FNMS(T2w, T2u, T2B);
|
||||
Rp[WS(rs, 6)] = T2s - T2y;
|
||||
Ip[WS(rs, 6)] = T2A + T2C;
|
||||
Rm[WS(rs, 6)] = T2s + T2y;
|
||||
Im[WS(rs, 6)] = T2C - T2A;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T27, T1Z, T23, T24, T2j, T2c, T2h, T2d, T2l, T22, T29;
|
||||
T27 = T25 + T26;
|
||||
T22 = T20 + T21;
|
||||
T1Z = W[6];
|
||||
T23 = T1Z * T22;
|
||||
T24 = W[7];
|
||||
T2j = T24 * T22;
|
||||
T2c = FMA(KP923879532, T2b, T2a);
|
||||
T2h = FMA(KP923879532, T2g, T2f);
|
||||
T29 = W[8];
|
||||
T2d = T29 * T2c;
|
||||
T2l = T29 * T2h;
|
||||
{
|
||||
E T28, T2k, T2i, T2m, T2e;
|
||||
T28 = FNMS(T24, T27, T23);
|
||||
T2k = FMA(T1Z, T27, T2j);
|
||||
T2e = W[9];
|
||||
T2i = FMA(T2e, T2h, T2d);
|
||||
T2m = FNMS(T2e, T2c, T2l);
|
||||
Rp[WS(rs, 2)] = T28 - T2i;
|
||||
Ip[WS(rs, 2)] = T2k + T2m;
|
||||
Rm[WS(rs, 2)] = T28 + T2i;
|
||||
Im[WS(rs, 2)] = T2m - T2k;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T3N, T47, T43, T45, T46, T4f, T3F, T3J, T3K, T3Z, T3S, T3X, T3T, T41, T4a;
|
||||
E T4d, T4b, T4h;
|
||||
{
|
||||
E T3M, T44, T3I, T3P, T49;
|
||||
T3M = T2J - T2M;
|
||||
T3N = FMA(KP707106781, T3M, T3L);
|
||||
T47 = FNMS(KP707106781, T3M, T3L);
|
||||
T44 = FNMS(KP707106781, T3H, T3G);
|
||||
T43 = W[26];
|
||||
T45 = T43 * T44;
|
||||
T46 = W[27];
|
||||
T4f = T46 * T44;
|
||||
T3I = FMA(KP707106781, T3H, T3G);
|
||||
T3F = W[10];
|
||||
T3J = T3F * T3I;
|
||||
T3K = W[11];
|
||||
T3Z = T3K * T3I;
|
||||
T3S = FMA(KP923879532, T3R, T3Q);
|
||||
T3X = FNMS(KP923879532, T3W, T3V);
|
||||
T3P = W[12];
|
||||
T3T = T3P * T3S;
|
||||
T41 = T3P * T3X;
|
||||
T4a = FNMS(KP923879532, T3R, T3Q);
|
||||
T4d = FMA(KP923879532, T3W, T3V);
|
||||
T49 = W[28];
|
||||
T4b = T49 * T4a;
|
||||
T4h = T49 * T4d;
|
||||
}
|
||||
{
|
||||
E T3O, T40, T3Y, T42, T3U;
|
||||
T3O = FNMS(T3K, T3N, T3J);
|
||||
T40 = FMA(T3F, T3N, T3Z);
|
||||
T3U = W[13];
|
||||
T3Y = FMA(T3U, T3X, T3T);
|
||||
T42 = FNMS(T3U, T3S, T41);
|
||||
Rp[WS(rs, 3)] = T3O - T3Y;
|
||||
Ip[WS(rs, 3)] = T40 + T42;
|
||||
Rm[WS(rs, 3)] = T3O + T3Y;
|
||||
Im[WS(rs, 3)] = T42 - T40;
|
||||
}
|
||||
{
|
||||
E T48, T4g, T4e, T4i, T4c;
|
||||
T48 = FNMS(T46, T47, T45);
|
||||
T4g = FMA(T43, T47, T4f);
|
||||
T4c = W[29];
|
||||
T4e = FMA(T4c, T4d, T4b);
|
||||
T4i = FNMS(T4c, T4a, T4h);
|
||||
Rp[WS(rs, 7)] = T48 - T4e;
|
||||
Ip[WS(rs, 7)] = T4g + T4i;
|
||||
Rm[WS(rs, 7)] = T48 + T4e;
|
||||
Im[WS(rs, 7)] = T4i - T4g;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2X, T3t, T3p, T3r, T3s, T3B, T2D, T2P, T2Q, T3l, T3a, T3j, T3b, T3n, T3w;
|
||||
E T3z, T3x, T3D;
|
||||
{
|
||||
E T2W, T3q, T2O, T2Z, T3v;
|
||||
T2W = T2U + T2V;
|
||||
T2X = FMA(KP707106781, T2W, T2T);
|
||||
T3t = FNMS(KP707106781, T2W, T2T);
|
||||
T3q = FNMS(KP707106781, T2N, T2G);
|
||||
T3p = W[18];
|
||||
T3r = T3p * T3q;
|
||||
T3s = W[19];
|
||||
T3B = T3s * T3q;
|
||||
T2O = FMA(KP707106781, T2N, T2G);
|
||||
T2D = W[2];
|
||||
T2P = T2D * T2O;
|
||||
T2Q = W[3];
|
||||
T3l = T2Q * T2O;
|
||||
T3a = FMA(KP923879532, T39, T32);
|
||||
T3j = FNMS(KP923879532, T3i, T3f);
|
||||
T2Z = W[4];
|
||||
T3b = T2Z * T3a;
|
||||
T3n = T2Z * T3j;
|
||||
T3w = FNMS(KP923879532, T39, T32);
|
||||
T3z = FMA(KP923879532, T3i, T3f);
|
||||
T3v = W[20];
|
||||
T3x = T3v * T3w;
|
||||
T3D = T3v * T3z;
|
||||
}
|
||||
{
|
||||
E T2Y, T3m, T3k, T3o, T3c;
|
||||
T2Y = FNMS(T2Q, T2X, T2P);
|
||||
T3m = FMA(T2D, T2X, T3l);
|
||||
T3c = W[5];
|
||||
T3k = FMA(T3c, T3j, T3b);
|
||||
T3o = FNMS(T3c, T3a, T3n);
|
||||
Rp[WS(rs, 1)] = T2Y - T3k;
|
||||
Ip[WS(rs, 1)] = T3m + T3o;
|
||||
Rm[WS(rs, 1)] = T2Y + T3k;
|
||||
Im[WS(rs, 1)] = T3o - T3m;
|
||||
}
|
||||
{
|
||||
E T3u, T3C, T3A, T3E, T3y;
|
||||
T3u = FNMS(T3s, T3t, T3r);
|
||||
T3C = FMA(T3p, T3t, T3B);
|
||||
T3y = W[21];
|
||||
T3A = FMA(T3y, T3z, T3x);
|
||||
T3E = FNMS(T3y, T3w, T3D);
|
||||
Rp[WS(rs, 5)] = T3u - T3A;
|
||||
Ip[WS(rs, 5)] = T3C + T3E;
|
||||
Rm[WS(rs, 5)] = T3u + T3A;
|
||||
Im[WS(rs, 5)] = T3E - T3C;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 16, "hc2cbdft2_16", twinstr, &GENUS, { 136, 30, 70, 0 } };
|
||||
|
||||
/*
 * Registration hook: hands the hc2cbdft2_16 kernel and its descriptor to
 * the planner `p` through khc2c_register, tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cbdft2_16) (planner *p)
{
     X(khc2c_register) (p, hc2cbdft2_16, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft2_16 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 206 FP additions, 84 FP multiplications,
|
||||
* (or, 168 additions, 46 multiplications, 38 fused multiply/add),
|
||||
* 60 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft2_16, non-FMA variant (machine generated; see the generator
 * command line in the comment above: -n 16 -dif -sign 1, without -fma).
 *
 * For every m in [mb, me) the kernel loads the 32 values
 * Rp[WS(rs,k)], Ip[WS(rs,k)], Rm[WS(rs,k)], Im[WS(rs,k)] (k = 0..7),
 * combines them using the constants KP707106781, KP382683432 and
 * KP923879532, and stores 32 results back into the same locations.
 * All loads are done before the first store of an iteration.
 *
 * Every output group k is written with the same shape:
 *   Rp[k] = A - B;  Ip[k] = C + D;  Rm[k] = A + B;  Im[k] = D - C;
 * For k = 0, A and C are plain sums and B, D are built from W[0], W[1].
 * For k = 1..7, A, C are built from W[4k-2], W[4k-1] and B, D from
 * W[4k], W[4k+1], so one iteration consumes 30 entries of W.
 * NOTE(review): reading these as complex multiplies assumes
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm against the
 * macro definitions, which are not visible here.
 *
 * After each iteration Rp and Ip advance by +ms, Rm and Im by -ms, and
 * W by 30; W is pre-offset by (mb - 1) * 30 before the first iteration.
 */
static void hc2cbdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the pointer and twiddle stepping lives in the loop header. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
|
||||
E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
|
||||
E T1x, T1V;
|
||||
{
|
||||
E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
|
||||
E T1z;
|
||||
{
|
||||
E T1, T2, Ty, Tz;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
T1j = T1 - T2;
|
||||
Ty = Ip[0];
|
||||
Tz = Im[WS(rs, 7)];
|
||||
TA = Ty + Tz;
|
||||
T1B = Ty - Tz;
|
||||
}
|
||||
{
|
||||
E T4, T5, T1k, T1l;
|
||||
T4 = Rp[WS(rs, 4)];
|
||||
T5 = Rm[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
Tx = T4 - T5;
|
||||
T1k = Ip[WS(rs, 4)];
|
||||
T1l = Im[WS(rs, 3)];
|
||||
T1m = T1k + T1l;
|
||||
T1C = T1k - T1l;
|
||||
}
|
||||
{
|
||||
E T8, T9, TD, TE;
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = Rm[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
TC = T8 - T9;
|
||||
TD = Ip[WS(rs, 2)];
|
||||
TE = Im[WS(rs, 5)];
|
||||
TF = TD + TE;
|
||||
T1y = TD - TE;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TI, TJ;
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
Tc = Rp[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
TH = Tb - Tc;
|
||||
TI = Im[WS(rs, 1)];
|
||||
TJ = Ip[WS(rs, 6)];
|
||||
TK = TI + TJ;
|
||||
T1z = TJ - TI;
|
||||
}
|
||||
{
|
||||
E T7, Te, TG, TL;
|
||||
TB = Tx + TA;
|
||||
T2L = TA - Tx;
|
||||
T30 = T1j + T1m;
|
||||
T1n = T1j - T1m;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 + Te;
|
||||
T1U = T7 - Te;
|
||||
{
|
||||
E T2F, T2G, T1A, T1D;
|
||||
T2F = Ta - Td;
|
||||
T2G = T1B - T1C;
|
||||
T2H = T2F + T2G;
|
||||
T3p = T2G - T2F;
|
||||
T1A = T1y + T1z;
|
||||
T1D = T1B + T1C;
|
||||
T1E = T1A + T1D;
|
||||
T1Z = T1D - T1A;
|
||||
}
|
||||
TG = TC + TF;
|
||||
TL = TH + TK;
|
||||
TM = KP707106781 * (TG - TL);
|
||||
T31 = KP707106781 * (TG + TL);
|
||||
{
|
||||
E T2q, T2r, T1g, T1h;
|
||||
T2q = T3 - T6;
|
||||
T2r = T1z - T1y;
|
||||
T2s = T2q + T2r;
|
||||
T3k = T2q - T2r;
|
||||
T1g = TC - TF;
|
||||
T1h = TH - TK;
|
||||
T1i = KP707106781 * (T1g + T1h);
|
||||
T2M = KP707106781 * (T1g - T1h);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
|
||||
E T1v;
|
||||
{
|
||||
E Tg, Th, TP, TQ;
|
||||
Tg = Rp[WS(rs, 1)];
|
||||
Th = Rm[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
TT = Tg - Th;
|
||||
TP = Ip[WS(rs, 1)];
|
||||
TQ = Im[WS(rs, 6)];
|
||||
TR = TP + TQ;
|
||||
T1r = TP - TQ;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, TU, TV;
|
||||
Tj = Rp[WS(rs, 5)];
|
||||
Tk = Rm[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
TO = Tj - Tk;
|
||||
TU = Ip[WS(rs, 5)];
|
||||
TV = Im[WS(rs, 2)];
|
||||
TW = TU + TV;
|
||||
T1s = TU - TV;
|
||||
}
|
||||
{
|
||||
E Tn, To, T10, T11;
|
||||
Tn = Rm[0];
|
||||
To = Rp[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T14 = Tn - To;
|
||||
T10 = Im[0];
|
||||
T11 = Ip[WS(rs, 7)];
|
||||
T12 = T10 + T11;
|
||||
T1u = T11 - T10;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T15, T16;
|
||||
Tq = Rp[WS(rs, 3)];
|
||||
Tr = Rm[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
TZ = Tq - Tr;
|
||||
T15 = Ip[WS(rs, 3)];
|
||||
T16 = Im[WS(rs, 4)];
|
||||
T17 = T15 + T16;
|
||||
T1v = T15 - T16;
|
||||
}
|
||||
{
|
||||
E Tm, Tt, T2O, T2P;
|
||||
Tm = Ti + Tl;
|
||||
Tt = Tp + Ts;
|
||||
Tu = Tm + Tt;
|
||||
T1Y = Tm - Tt;
|
||||
T2O = TR - TO;
|
||||
T2P = TT + TW;
|
||||
T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
|
||||
T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
|
||||
}
|
||||
{
|
||||
E T2R, T2S, TS, TX;
|
||||
T2R = TZ + T12;
|
||||
T2S = T14 + T17;
|
||||
T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
|
||||
T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
|
||||
TS = TO + TR;
|
||||
TX = TT - TW;
|
||||
TY = FMA(KP923879532, TS, KP382683432 * TX);
|
||||
T1d = FNMS(KP382683432, TS, KP923879532 * TX);
|
||||
}
|
||||
{
|
||||
E T13, T18, T2t, T2u;
|
||||
T13 = TZ - T12;
|
||||
T18 = T14 - T17;
|
||||
T19 = FNMS(KP382683432, T18, KP923879532 * T13);
|
||||
T1e = FMA(KP382683432, T13, KP923879532 * T18);
|
||||
T2t = Ti - Tl;
|
||||
T2u = T1r - T1s;
|
||||
T2v = T2t - T2u;
|
||||
T2C = T2t + T2u;
|
||||
}
|
||||
{
|
||||
E T2w, T2x, T1t, T1w;
|
||||
T2w = Tp - Ts;
|
||||
T2x = T1u - T1v;
|
||||
T2y = T2w + T2x;
|
||||
T2D = T2x - T2w;
|
||||
T1t = T1r + T1s;
|
||||
T1w = T1u + T1v;
|
||||
T1x = T1t + T1w;
|
||||
T1V = T1w - T1t;
|
||||
}
|
||||
}
|
||||
/* Output groups k = 0 and k = 4. */
{
|
||||
E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
|
||||
Tv = Tf + Tu;
|
||||
T1F = T1x + T1E;
|
||||
{
|
||||
E TN, T1a, T1f, T1o;
|
||||
TN = TB + TM;
|
||||
T1a = TY + T19;
|
||||
T1b = TN + T1a;
|
||||
T1N = TN - T1a;
|
||||
T1f = T1d + T1e;
|
||||
T1o = T1i + T1n;
|
||||
T1p = T1f + T1o;
|
||||
T1P = T1o - T1f;
|
||||
{
|
||||
E T1I, T1K, T1H, T1J;
|
||||
T1I = Tf - Tu;
|
||||
T1K = T1E - T1x;
|
||||
T1H = W[14];
|
||||
T1J = W[15];
|
||||
T1L = FNMS(T1J, T1K, T1H * T1I);
|
||||
T1R = FMA(T1J, T1I, T1H * T1K);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1q, T1G, Tw, T1c;
|
||||
Tw = W[0];
|
||||
T1c = W[1];
|
||||
T1q = FMA(Tw, T1b, T1c * T1p);
|
||||
T1G = FNMS(T1c, T1b, Tw * T1p);
|
||||
Rp[0] = Tv - T1q;
|
||||
Ip[0] = T1F + T1G;
|
||||
Rm[0] = Tv + T1q;
|
||||
Im[0] = T1G - T1F;
|
||||
}
|
||||
{
|
||||
E T1Q, T1S, T1M, T1O;
|
||||
T1M = W[16];
|
||||
T1O = W[17];
|
||||
T1Q = FMA(T1M, T1N, T1O * T1P);
|
||||
T1S = FNMS(T1O, T1N, T1M * T1P);
|
||||
Rp[WS(rs, 4)] = T1L - T1Q;
|
||||
Ip[WS(rs, 4)] = T1R + T1S;
|
||||
Rm[WS(rs, 4)] = T1L + T1Q;
|
||||
Im[WS(rs, 4)] = T1S - T1R;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
|
||||
{
|
||||
E T23, T24, T27, T28;
|
||||
T23 = TB - TM;
|
||||
T24 = T1d - T1e;
|
||||
T25 = T23 + T24;
|
||||
T2j = T23 - T24;
|
||||
T27 = T19 - TY;
|
||||
T28 = T1n - T1i;
|
||||
T29 = T27 + T28;
|
||||
T2l = T28 - T27;
|
||||
}
|
||||
{
|
||||
E T1W, T20, T1T, T1X;
|
||||
T1W = T1U + T1V;
|
||||
T20 = T1Y + T1Z;
|
||||
T1T = W[6];
|
||||
T1X = W[7];
|
||||
T21 = FNMS(T1X, T20, T1T * T1W);
|
||||
T2b = FMA(T1X, T1W, T1T * T20);
|
||||
}
|
||||
{
|
||||
E T2e, T2g, T2d, T2f;
|
||||
T2e = T1U - T1V;
|
||||
T2g = T1Z - T1Y;
|
||||
T2d = W[22];
|
||||
T2f = W[23];
|
||||
T2h = FNMS(T2f, T2g, T2d * T2e);
|
||||
T2n = FMA(T2f, T2e, T2d * T2g);
|
||||
}
|
||||
{
|
||||
E T2a, T2c, T22, T26;
|
||||
T22 = W[8];
|
||||
T26 = W[9];
|
||||
T2a = FMA(T22, T25, T26 * T29);
|
||||
T2c = FNMS(T26, T25, T22 * T29);
|
||||
Rp[WS(rs, 2)] = T21 - T2a;
|
||||
Ip[WS(rs, 2)] = T2b + T2c;
|
||||
Rm[WS(rs, 2)] = T21 + T2a;
|
||||
Im[WS(rs, 2)] = T2c - T2b;
|
||||
}
|
||||
{
|
||||
E T2m, T2o, T2i, T2k;
|
||||
T2i = W[24];
|
||||
T2k = W[25];
|
||||
T2m = FMA(T2i, T2j, T2k * T2l);
|
||||
T2o = FNMS(T2k, T2j, T2i * T2l);
|
||||
Rp[WS(rs, 6)] = T2h - T2m;
|
||||
Ip[WS(rs, 6)] = T2n + T2o;
|
||||
Rm[WS(rs, 6)] = T2h + T2m;
|
||||
Im[WS(rs, 6)] = T2o - T2n;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
|
||||
T2z = KP707106781 * (T2v + T2y);
|
||||
T2A = T2s + T2z;
|
||||
T38 = T2s - T2z;
|
||||
T2E = KP707106781 * (T2C + T2D);
|
||||
T2I = T2E + T2H;
|
||||
T3a = T2H - T2E;
|
||||
{
|
||||
E T2N, T2U, T2Z, T32;
|
||||
T2N = T2L + T2M;
|
||||
T2U = T2Q - T2T;
|
||||
T2V = T2N + T2U;
|
||||
T3d = T2N - T2U;
|
||||
T2Z = T2X + T2Y;
|
||||
T32 = T30 - T31;
|
||||
T33 = T2Z + T32;
|
||||
T3f = T32 - T2Z;
|
||||
}
|
||||
{
|
||||
E T2J, T35, T34, T36;
|
||||
{
|
||||
E T2p, T2B, T2K, T2W;
|
||||
T2p = W[2];
|
||||
T2B = W[3];
|
||||
T2J = FNMS(T2B, T2I, T2p * T2A);
|
||||
T35 = FMA(T2B, T2A, T2p * T2I);
|
||||
T2K = W[4];
|
||||
T2W = W[5];
|
||||
T34 = FMA(T2K, T2V, T2W * T33);
|
||||
T36 = FNMS(T2W, T2V, T2K * T33);
|
||||
}
|
||||
Rp[WS(rs, 1)] = T2J - T34;
|
||||
Ip[WS(rs, 1)] = T35 + T36;
|
||||
Rm[WS(rs, 1)] = T2J + T34;
|
||||
Im[WS(rs, 1)] = T36 - T35;
|
||||
}
|
||||
{
|
||||
E T3b, T3h, T3g, T3i;
|
||||
{
|
||||
E T37, T39, T3c, T3e;
|
||||
T37 = W[18];
|
||||
T39 = W[19];
|
||||
T3b = FNMS(T39, T3a, T37 * T38);
|
||||
T3h = FMA(T39, T38, T37 * T3a);
|
||||
T3c = W[20];
|
||||
T3e = W[21];
|
||||
T3g = FMA(T3c, T3d, T3e * T3f);
|
||||
T3i = FNMS(T3e, T3d, T3c * T3f);
|
||||
}
|
||||
Rp[WS(rs, 5)] = T3b - T3g;
|
||||
Ip[WS(rs, 5)] = T3h + T3i;
|
||||
Rm[WS(rs, 5)] = T3b + T3g;
|
||||
Im[WS(rs, 5)] = T3i - T3h;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
|
||||
T3l = KP707106781 * (T2D - T2C);
|
||||
T3m = T3k + T3l;
|
||||
T3E = T3k - T3l;
|
||||
T3o = KP707106781 * (T2v - T2y);
|
||||
T3q = T3o + T3p;
|
||||
T3G = T3p - T3o;
|
||||
{
|
||||
E T3t, T3u, T3x, T3y;
|
||||
T3t = T2L - T2M;
|
||||
T3u = T2X - T2Y;
|
||||
T3v = T3t + T3u;
|
||||
T3J = T3t - T3u;
|
||||
T3x = T31 + T30;
|
||||
T3y = T2Q + T2T;
|
||||
T3z = T3x - T3y;
|
||||
T3L = T3y + T3x;
|
||||
}
|
||||
{
|
||||
E T3r, T3B, T3A, T3C;
|
||||
{
|
||||
E T3j, T3n, T3s, T3w;
|
||||
T3j = W[10];
|
||||
T3n = W[11];
|
||||
T3r = FNMS(T3n, T3q, T3j * T3m);
|
||||
T3B = FMA(T3n, T3m, T3j * T3q);
|
||||
T3s = W[12];
|
||||
T3w = W[13];
|
||||
T3A = FMA(T3s, T3v, T3w * T3z);
|
||||
T3C = FNMS(T3w, T3v, T3s * T3z);
|
||||
}
|
||||
Rp[WS(rs, 3)] = T3r - T3A;
|
||||
Ip[WS(rs, 3)] = T3B + T3C;
|
||||
Rm[WS(rs, 3)] = T3r + T3A;
|
||||
Im[WS(rs, 3)] = T3C - T3B;
|
||||
}
|
||||
{
|
||||
E T3H, T3N, T3M, T3O;
|
||||
{
|
||||
E T3D, T3F, T3I, T3K;
|
||||
T3D = W[26];
|
||||
T3F = W[27];
|
||||
T3H = FNMS(T3F, T3G, T3D * T3E);
|
||||
T3N = FMA(T3F, T3E, T3D * T3G);
|
||||
T3I = W[28];
|
||||
T3K = W[29];
|
||||
T3M = FMA(T3I, T3J, T3K * T3L);
|
||||
T3O = FNMS(T3K, T3J, T3I * T3L);
|
||||
}
|
||||
Rp[WS(rs, 7)] = T3H - T3M;
|
||||
Ip[WS(rs, 7)] = T3N + T3O;
|
||||
Rm[WS(rs, 7)] = T3H + T3M;
|
||||
Im[WS(rs, 7)] = T3O - T3N;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Two-entry tw_instr table referenced by the hc2cbdft2_16 descriptor below:
 * a TW_FULL entry with arguments (1, 16), then a TW_NEXT entry (1, 0) as the
 * last element.
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm in the
 * tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for hc2cbdft2_16: the number 16, the kernel's name string, its
 * twinstr table, &GENUS, and the brace list { 168, 46, 38, 0 }.
 * NOTE(review): the brace list presumably holds operation counts (adds,
 * multiplies, fused multiply-adds, other) -- confirm against hc2c_desc.
 */
static const hc2c_desc desc = { 16, "hc2cbdft2_16", twinstr, &GENUS, { 168, 46, 38, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cbdft2_16 kernel, its descriptor
 * and the HC2C_VIA_DFT constant to X(khc2c_register) for planner p.
 */
void X(codelet_hc2cbdft2_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft2_16, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
1149
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_20.c
Normal file
1149
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1950
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_32.c
Normal file
1950
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_32.c
Normal file
File diff suppressed because it is too large
Load Diff
218
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_4.c
Normal file
218
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_4.c
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:14 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft2_4 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 30 FP additions, 12 FP multiplications,
|
||||
* (or, 24 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 23 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft2_4 (branch compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated kernel of size 4.
 *
 * For each m in [mb, me) it loads two values from each of Rp, Ip, Rm and Im
 * (offsets 0 and WS(rs, 1)), forms sums and differences, multiplies by the
 * six coefficients W[0]..W[5] and stores the results back to the same eight
 * locations.
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp/Ip advance by +ms
 *                 and Rm/Im by -ms after every iteration.
 * W:      read-only coefficient table; offset by (mb - 1) * 6 on entry and
 *         advanced by 6 per iteration.
 * rs:     stride handed to WS() to address the second element.
 * mb, me: first and one-past-last iteration index.
 * ms:     per-iteration pointer step.
 *
 * NOTE(review): the name and generator flags (-sign 1, -dif, hc2cb.h)
 * suggest a backward half-complex-to-complex step -- confirm.
 */
static void hc2cbdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all five pointers are stepped in the loop header. */
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T3, Tm, T6, Tn, Td, Tk, TB, Ty, Tv, Ts;
|
||||
/* Load the eight inputs and form their sums and differences. */
{
|
||||
E Tg, Tc, T9, Tj;
|
||||
{
|
||||
E T1, T2, Ta, Tb;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 1)];
|
||||
T3 = T1 + T2;
|
||||
Tg = T1 - T2;
|
||||
Ta = Ip[0];
|
||||
Tb = Im[WS(rs, 1)];
|
||||
Tc = Ta + Tb;
|
||||
Tm = Ta - Tb;
|
||||
}
|
||||
{
|
||||
E T4, T5, Th, Ti;
|
||||
T4 = Rp[WS(rs, 1)];
|
||||
T5 = Rm[0];
|
||||
T6 = T4 + T5;
|
||||
T9 = T4 - T5;
|
||||
Th = Ip[WS(rs, 1)];
|
||||
Ti = Im[0];
|
||||
Tj = Th + Ti;
|
||||
Tn = Th - Ti;
|
||||
}
|
||||
Td = T9 + Tc;
|
||||
Tk = Tg - Tj;
|
||||
TB = Tg + Tj;
|
||||
Ty = Tc - T9;
|
||||
Tv = Tm - Tn;
|
||||
Ts = T3 - T6;
|
||||
}
|
||||
/* Outputs at offset 0, using W[0] and W[1]. */
{
|
||||
E T7, To, Te, Tp, T8, Tl, Tq, Tf;
|
||||
T7 = T3 + T6;
|
||||
To = Tm + Tn;
|
||||
T8 = W[0];
|
||||
Te = T8 * Td;
|
||||
Tp = T8 * Tk;
|
||||
Tf = W[1];
|
||||
Tl = FMA(Tf, Tk, Te);
|
||||
Tq = FNMS(Tf, Td, Tp);
|
||||
Rp[0] = T7 - Tl;
|
||||
Ip[0] = To + Tq;
|
||||
Rm[0] = T7 + Tl;
|
||||
Im[0] = Tq - To;
|
||||
}
|
||||
/* Outputs at offset WS(rs, 1), using W[2]..W[5]. */
{
|
||||
E Tr, Tt, Tu, TD, Tz, TF, Tx;
|
||||
Tr = W[2];
|
||||
Tt = Tr * Ts;
|
||||
Tu = W[3];
|
||||
TD = Tu * Ts;
|
||||
Tx = W[4];
|
||||
Tz = Tx * Ty;
|
||||
TF = Tx * TB;
|
||||
{
|
||||
E Tw, TE, TC, TG, TA;
|
||||
Tw = FNMS(Tu, Tv, Tt);
|
||||
TE = FMA(Tr, Tv, TD);
|
||||
TA = W[5];
|
||||
TC = FMA(TA, TB, Tz);
|
||||
TG = FNMS(TA, Ty, TF);
|
||||
Rp[WS(rs, 1)] = Tw - TC;
|
||||
Ip[WS(rs, 1)] = TE + TG;
|
||||
Rm[WS(rs, 1)] = Tw + TC;
|
||||
Im[WS(rs, 1)] = TG - TE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Two-entry tw_instr table referenced by the hc2cbdft2_4 descriptor:
 * a TW_FULL entry with arguments (1, 4), then a TW_NEXT entry (1, 0) as the
 * last element.
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for hc2cbdft2_4 (FMA branch): the number 4, the name string,
 * the twinstr table, &GENUS, and { 24, 6, 6, 0 }. The first three numbers
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * generator comment of this branch.
 */
static const hc2c_desc desc = { 4, "hc2cbdft2_4", twinstr, &GENUS, { 24, 6, 6, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cbdft2_4 kernel, its descriptor
 * and the HC2C_VIA_DFT constant to X(khc2c_register) for planner p.
 */
void X(codelet_hc2cbdft2_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft2_4, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft2_4 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 30 FP additions, 12 FP multiplications,
|
||||
* (or, 24 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 19 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft2_4 (#else branch, generated without -fma): genfft-generated
 * kernel of size 4.
 *
 * For each m in [mb, me) it loads two values from each of Rp, Ip, Rm and Im
 * (offsets 0 and WS(rs, 1)), forms sums and differences, combines them with
 * the six coefficients W[0]..W[5] and stores the results back to the same
 * eight locations.
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp/Ip advance by +ms
 *                 and Rm/Im by -ms after every iteration.
 * W:      read-only coefficient table; offset by (mb - 1) * 6 on entry and
 *         advanced by 6 per iteration.
 * rs:     stride handed to WS() to address the second element.
 * mb, me: first and one-past-last iteration index.
 * ms:     per-iteration pointer step.
 */
static void hc2cbdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all five pointers are stepped in the loop header. */
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T3, Tl, T6, Tm, Td, Tj, Tx, Tv, Ts, Tq;
|
||||
/* Load the eight inputs and form their sums and differences. */
{
|
||||
E Tf, Tc, T9, Ti;
|
||||
{
|
||||
E T1, T2, Ta, Tb;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 1)];
|
||||
T3 = T1 + T2;
|
||||
Tf = T1 - T2;
|
||||
Ta = Ip[0];
|
||||
Tb = Im[WS(rs, 1)];
|
||||
Tc = Ta + Tb;
|
||||
Tl = Ta - Tb;
|
||||
}
|
||||
{
|
||||
E T4, T5, Tg, Th;
|
||||
T4 = Rp[WS(rs, 1)];
|
||||
T5 = Rm[0];
|
||||
T6 = T4 + T5;
|
||||
T9 = T4 - T5;
|
||||
Tg = Ip[WS(rs, 1)];
|
||||
Th = Im[0];
|
||||
Ti = Tg + Th;
|
||||
Tm = Tg - Th;
|
||||
}
|
||||
Td = T9 + Tc;
|
||||
Tj = Tf - Ti;
|
||||
Tx = Tf + Ti;
|
||||
Tv = Tc - T9;
|
||||
Ts = Tl - Tm;
|
||||
Tq = T3 - T6;
|
||||
}
|
||||
/* Outputs at offset 0, using W[0] and W[1]. */
{
|
||||
E T7, Tn, Tk, To, T8, Te;
|
||||
T7 = T3 + T6;
|
||||
Tn = Tl + Tm;
|
||||
T8 = W[0];
|
||||
Te = W[1];
|
||||
Tk = FMA(T8, Td, Te * Tj);
|
||||
To = FNMS(Te, Td, T8 * Tj);
|
||||
Rp[0] = T7 - Tk;
|
||||
Ip[0] = Tn + To;
|
||||
Rm[0] = T7 + Tk;
|
||||
Im[0] = To - Tn;
|
||||
}
|
||||
/* Outputs at offset WS(rs, 1), using W[2]..W[5]. */
{
|
||||
E Tt, Tz, Ty, TA;
|
||||
{
|
||||
E Tp, Tr, Tu, Tw;
|
||||
Tp = W[2];
|
||||
Tr = W[3];
|
||||
Tt = FNMS(Tr, Ts, Tp * Tq);
|
||||
Tz = FMA(Tr, Tq, Tp * Ts);
|
||||
Tu = W[4];
|
||||
Tw = W[5];
|
||||
Ty = FMA(Tu, Tv, Tw * Tx);
|
||||
TA = FNMS(Tw, Tv, Tu * Tx);
|
||||
}
|
||||
Rp[WS(rs, 1)] = Tt - Ty;
|
||||
Ip[WS(rs, 1)] = Tz + TA;
|
||||
Rm[WS(rs, 1)] = Tt + Ty;
|
||||
Im[WS(rs, 1)] = TA - Tz;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Two-entry tw_instr table referenced by the hc2cbdft2_4 descriptor:
 * a TW_FULL entry with arguments (1, 4), then a TW_NEXT entry (1, 0) as the
 * last element.
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for hc2cbdft2_4 (#else branch): the number 4, the name string,
 * the twinstr table, &GENUS, and { 24, 6, 6, 0 }. The first three numbers
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * generator comment of this branch.
 */
static const hc2c_desc desc = { 4, "hc2cbdft2_4", twinstr, &GENUS, { 24, 6, 6, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cbdft2_4 kernel, its descriptor
 * and the HC2C_VIA_DFT constant to X(khc2c_register) for planner p.
 */
void X(codelet_hc2cbdft2_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft2_4, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
424
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_8.c
Normal file
424
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft2_8.c
Normal file
@@ -0,0 +1,424 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:14 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 82 FP additions, 36 FP multiplications,
|
||||
* (or, 60 additions, 14 multiplications, 22 fused multiply/add),
|
||||
* 41 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft2_8 (branch compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated kernel of size 8.
 *
 * For each m in [mb, me) it loads four values from each of Rp, Ip, Rm and Im
 * (offsets 0 and WS(rs, 1..3)), combines them, multiplies by the fourteen
 * coefficients W[0]..W[13] and stores the results back to the same sixteen
 * locations.
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp/Ip advance by +ms
 *                 and Rm/Im by -ms after every iteration.
 * W:      read-only coefficient table; offset by (mb - 1) * 14 on entry and
 *         advanced by 14 per iteration.
 * rs:     stride handed to WS() to address elements 1..3.
 * mb, me: first and one-past-last iteration index.
 * ms:     per-iteration pointer step.
 */
static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all five pointers are stepped in the loop header. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E Tl, T1p, T1g, TM, T1k, TE, TP, T1f, T7, Te, TU, TH, T1l, Tw, T1q;
|
||||
E T1c, T1y;
|
||||
/* Load the sixteen inputs and form the shared intermediate terms. */
{
|
||||
E T3, TA, Tk, TN, T6, Th, TD, TO, Ta, Tm, Tp, TK, Td, Tr, Tu;
|
||||
E TL, TF, TG;
|
||||
{
|
||||
E T1, T2, Ti, Tj;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 3)];
|
||||
T3 = T1 + T2;
|
||||
TA = T1 - T2;
|
||||
Ti = Ip[0];
|
||||
Tj = Im[WS(rs, 3)];
|
||||
Tk = Ti + Tj;
|
||||
TN = Ti - Tj;
|
||||
}
|
||||
{
|
||||
E T4, T5, TB, TC;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 1)];
|
||||
T6 = T4 + T5;
|
||||
Th = T4 - T5;
|
||||
TB = Ip[WS(rs, 2)];
|
||||
TC = Im[WS(rs, 1)];
|
||||
TD = TB + TC;
|
||||
TO = TB - TC;
|
||||
}
|
||||
{
|
||||
E T8, T9, Tn, To;
|
||||
T8 = Rp[WS(rs, 1)];
|
||||
T9 = Rm[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
Tm = T8 - T9;
|
||||
Tn = Ip[WS(rs, 1)];
|
||||
To = Im[WS(rs, 2)];
|
||||
Tp = Tn + To;
|
||||
TK = Tn - To;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Ts, Tt;
|
||||
Tb = Rm[0];
|
||||
Tc = Rp[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
Tr = Tb - Tc;
|
||||
Ts = Im[0];
|
||||
Tt = Ip[WS(rs, 3)];
|
||||
Tu = Ts + Tt;
|
||||
TL = Tt - Ts;
|
||||
}
|
||||
Tl = Th + Tk;
|
||||
T1p = TA + TD;
|
||||
T1g = TN - TO;
|
||||
TM = TK + TL;
|
||||
T1k = Tk - Th;
|
||||
TE = TA - TD;
|
||||
TP = TN + TO;
|
||||
T1f = Ta - Td;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
TU = T7 - Te;
|
||||
TF = Tm - Tp;
|
||||
TG = Tr - Tu;
|
||||
TH = TF + TG;
|
||||
T1l = TF - TG;
|
||||
{
|
||||
E Tq, Tv, T1a, T1b;
|
||||
Tq = Tm + Tp;
|
||||
Tv = Tr + Tu;
|
||||
Tw = Tq - Tv;
|
||||
T1q = Tq + Tv;
|
||||
T1a = T3 - T6;
|
||||
T1b = TL - TK;
|
||||
T1c = T1a + T1b;
|
||||
T1y = T1a - T1b;
|
||||
}
|
||||
}
|
||||
/* Outputs at offset 0, using W[0] and W[1]. */
{
|
||||
E Tf, TQ, Tx, TI, Ty, TR, Tg, TJ, TS, Tz;
|
||||
Tf = T7 + Te;
|
||||
TQ = TM + TP;
|
||||
Tx = FMA(KP707106781, Tw, Tl);
|
||||
TI = FMA(KP707106781, TH, TE);
|
||||
Tg = W[0];
|
||||
Ty = Tg * Tx;
|
||||
TR = Tg * TI;
|
||||
Tz = W[1];
|
||||
TJ = FMA(Tz, TI, Ty);
|
||||
TS = FNMS(Tz, Tx, TR);
|
||||
Rp[0] = Tf - TJ;
|
||||
Ip[0] = TQ + TS;
|
||||
Rm[0] = Tf + TJ;
|
||||
Im[0] = TS - TQ;
|
||||
}
|
||||
/* Outputs at offset WS(rs, 3), using W[10]..W[13]. */
{
|
||||
E T1B, T1A, T1J, T1x, T1z, T1E, T1H, T1F, T1L, T1D;
|
||||
T1B = T1g - T1f;
|
||||
T1A = W[11];
|
||||
T1J = T1A * T1y;
|
||||
T1x = W[10];
|
||||
T1z = T1x * T1y;
|
||||
T1E = FNMS(KP707106781, T1l, T1k);
|
||||
T1H = FMA(KP707106781, T1q, T1p);
|
||||
T1D = W[12];
|
||||
T1F = T1D * T1E;
|
||||
T1L = T1D * T1H;
|
||||
{
|
||||
E T1C, T1K, T1I, T1M, T1G;
|
||||
T1C = FNMS(T1A, T1B, T1z);
|
||||
T1K = FMA(T1x, T1B, T1J);
|
||||
T1G = W[13];
|
||||
T1I = FMA(T1G, T1H, T1F);
|
||||
T1M = FNMS(T1G, T1E, T1L);
|
||||
Rp[WS(rs, 3)] = T1C - T1I;
|
||||
Ip[WS(rs, 3)] = T1K + T1M;
|
||||
Rm[WS(rs, 3)] = T1C + T1I;
|
||||
Im[WS(rs, 3)] = T1M - T1K;
|
||||
}
|
||||
}
|
||||
/* Outputs at offset WS(rs, 2), using W[6]..W[9]. */
{
|
||||
E TX, TW, T15, TT, TV, T10, T13, T11, T17, TZ;
|
||||
TX = TP - TM;
|
||||
TW = W[7];
|
||||
T15 = TW * TU;
|
||||
TT = W[6];
|
||||
TV = TT * TU;
|
||||
T10 = FNMS(KP707106781, Tw, Tl);
|
||||
T13 = FNMS(KP707106781, TH, TE);
|
||||
TZ = W[8];
|
||||
T11 = TZ * T10;
|
||||
T17 = TZ * T13;
|
||||
{
|
||||
E TY, T16, T14, T18, T12;
|
||||
TY = FNMS(TW, TX, TV);
|
||||
T16 = FMA(TT, TX, T15);
|
||||
T12 = W[9];
|
||||
T14 = FMA(T12, T13, T11);
|
||||
T18 = FNMS(T12, T10, T17);
|
||||
Rp[WS(rs, 2)] = TY - T14;
|
||||
Ip[WS(rs, 2)] = T16 + T18;
|
||||
Rm[WS(rs, 2)] = TY + T14;
|
||||
Im[WS(rs, 2)] = T18 - T16;
|
||||
}
|
||||
}
|
||||
/* Outputs at offset WS(rs, 1), using W[2]..W[5]. */
{
|
||||
E T1h, T1e, T1t, T19, T1d, T1m, T1r, T1n, T1v, T1j;
|
||||
T1h = T1f + T1g;
|
||||
T1e = W[3];
|
||||
T1t = T1e * T1c;
|
||||
T19 = W[2];
|
||||
T1d = T19 * T1c;
|
||||
T1m = FMA(KP707106781, T1l, T1k);
|
||||
T1r = FNMS(KP707106781, T1q, T1p);
|
||||
T1j = W[4];
|
||||
T1n = T1j * T1m;
|
||||
T1v = T1j * T1r;
|
||||
{
|
||||
E T1i, T1u, T1s, T1w, T1o;
|
||||
T1i = FNMS(T1e, T1h, T1d);
|
||||
T1u = FMA(T19, T1h, T1t);
|
||||
T1o = W[5];
|
||||
T1s = FMA(T1o, T1r, T1n);
|
||||
T1w = FNMS(T1o, T1m, T1v);
|
||||
Rp[WS(rs, 1)] = T1i - T1s;
|
||||
Ip[WS(rs, 1)] = T1u + T1w;
|
||||
Rm[WS(rs, 1)] = T1i + T1s;
|
||||
Im[WS(rs, 1)] = T1w - T1u;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Two-entry tw_instr table referenced by the hc2cbdft2_8 descriptor:
 * a TW_FULL entry with arguments (1, 8), then a TW_NEXT entry (1, 0) as the
 * last element.
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for hc2cbdft2_8 (FMA branch): the number 8, the name string,
 * the twinstr table, &GENUS, and { 60, 14, 22, 0 }. The first three numbers
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * generator comment of this branch.
 */
static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, { 60, 14, 22, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cbdft2_8 kernel, its descriptor
 * and the HC2C_VIA_DFT constant to X(khc2c_register) for planner p.
 */
void X(codelet_hc2cbdft2_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 82 FP additions, 32 FP multiplications,
|
||||
* (or, 68 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 30 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft2_8 (#else branch, generated without -fma): genfft-generated
 * kernel of size 8.
 *
 * For each m in [mb, me) it loads four values from each of Rp, Ip, Rm and Im
 * (offsets 0 and WS(rs, 1..3)), combines them, applies the fourteen
 * coefficients W[0]..W[13] and stores the results back to the same sixteen
 * locations.
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp/Ip advance by +ms
 *                 and Rm/Im by -ms after every iteration.
 * W:      read-only coefficient table; offset by (mb - 1) * 14 on entry and
 *         advanced by 14 per iteration.
 * rs:     stride handed to WS() to address elements 1..3.
 * mb, me: first and one-past-last iteration index.
 * ms:     per-iteration pointer step.
 */
static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all five pointers are stepped in the loop header. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T7, T1d, T1h, Tl, TG, T14, T19, TO, Te, TL, T18, T15, TB, T1e, Tw;
|
||||
E T1i;
|
||||
/* First half of the loads: Rp/Ip at 0 and 2, Rm/Im at 3 and 1. */
{
|
||||
E T3, TC, Tk, TM, T6, Th, TF, TN;
|
||||
{
|
||||
E T1, T2, Ti, Tj;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 3)];
|
||||
T3 = T1 + T2;
|
||||
TC = T1 - T2;
|
||||
Ti = Ip[0];
|
||||
Tj = Im[WS(rs, 3)];
|
||||
Tk = Ti + Tj;
|
||||
TM = Ti - Tj;
|
||||
}
|
||||
{
|
||||
E T4, T5, TD, TE;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 1)];
|
||||
T6 = T4 + T5;
|
||||
Th = T4 - T5;
|
||||
TD = Ip[WS(rs, 2)];
|
||||
TE = Im[WS(rs, 1)];
|
||||
TF = TD + TE;
|
||||
TN = TD - TE;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1d = Tk - Th;
|
||||
T1h = TC + TF;
|
||||
Tl = Th + Tk;
|
||||
TG = TC - TF;
|
||||
T14 = T3 - T6;
|
||||
T19 = TM - TN;
|
||||
TO = TM + TN;
|
||||
}
|
||||
/* Second half of the loads, plus the terms scaled by KP707106781. */
{
|
||||
E Ta, Tm, Tp, TJ, Td, Tr, Tu, TK;
|
||||
{
|
||||
E T8, T9, Tn, To;
|
||||
T8 = Rp[WS(rs, 1)];
|
||||
T9 = Rm[WS(rs, 2)];
|
||||
Ta = T8 + T9;
|
||||
Tm = T8 - T9;
|
||||
Tn = Ip[WS(rs, 1)];
|
||||
To = Im[WS(rs, 2)];
|
||||
Tp = Tn + To;
|
||||
TJ = Tn - To;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Ts, Tt;
|
||||
Tb = Rm[0];
|
||||
Tc = Rp[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
Tr = Tb - Tc;
|
||||
Ts = Im[0];
|
||||
Tt = Ip[WS(rs, 3)];
|
||||
Tu = Ts + Tt;
|
||||
TK = Tt - Ts;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
TL = TJ + TK;
|
||||
T18 = Ta - Td;
|
||||
T15 = TK - TJ;
|
||||
{
|
||||
E Tz, TA, Tq, Tv;
|
||||
Tz = Tm - Tp;
|
||||
TA = Tr - Tu;
|
||||
TB = KP707106781 * (Tz + TA);
|
||||
T1e = KP707106781 * (Tz - TA);
|
||||
Tq = Tm + Tp;
|
||||
Tv = Tr + Tu;
|
||||
Tw = KP707106781 * (Tq - Tv);
|
||||
T1i = KP707106781 * (Tq + Tv);
|
||||
}
|
||||
}
|
||||
/* Outputs at offset 0, using W[0] and W[1]. */
{
|
||||
E Tf, TP, TI, TQ;
|
||||
Tf = T7 + Te;
|
||||
TP = TL + TO;
|
||||
{
|
||||
E Tx, TH, Tg, Ty;
|
||||
Tx = Tl + Tw;
|
||||
TH = TB + TG;
|
||||
Tg = W[0];
|
||||
Ty = W[1];
|
||||
TI = FMA(Tg, Tx, Ty * TH);
|
||||
TQ = FNMS(Ty, Tx, Tg * TH);
|
||||
}
|
||||
Rp[0] = Tf - TI;
|
||||
Ip[0] = TP + TQ;
|
||||
Rm[0] = Tf + TI;
|
||||
Im[0] = TQ - TP;
|
||||
}
|
||||
/* Outputs at offset WS(rs, 3), using W[10]..W[13]. */
{
|
||||
E T1r, T1x, T1w, T1y;
|
||||
{
|
||||
E T1o, T1q, T1n, T1p;
|
||||
T1o = T14 - T15;
|
||||
T1q = T19 - T18;
|
||||
T1n = W[10];
|
||||
T1p = W[11];
|
||||
T1r = FNMS(T1p, T1q, T1n * T1o);
|
||||
T1x = FMA(T1p, T1o, T1n * T1q);
|
||||
}
|
||||
{
|
||||
E T1t, T1v, T1s, T1u;
|
||||
T1t = T1d - T1e;
|
||||
T1v = T1i + T1h;
|
||||
T1s = W[12];
|
||||
T1u = W[13];
|
||||
T1w = FMA(T1s, T1t, T1u * T1v);
|
||||
T1y = FNMS(T1u, T1t, T1s * T1v);
|
||||
}
|
||||
Rp[WS(rs, 3)] = T1r - T1w;
|
||||
Ip[WS(rs, 3)] = T1x + T1y;
|
||||
Rm[WS(rs, 3)] = T1r + T1w;
|
||||
Im[WS(rs, 3)] = T1y - T1x;
|
||||
}
|
||||
/* Outputs at offset WS(rs, 2), using W[6]..W[9]. */
{
|
||||
E TV, T11, T10, T12;
|
||||
{
|
||||
E TS, TU, TR, TT;
|
||||
TS = T7 - Te;
|
||||
TU = TO - TL;
|
||||
TR = W[6];
|
||||
TT = W[7];
|
||||
TV = FNMS(TT, TU, TR * TS);
|
||||
T11 = FMA(TT, TS, TR * TU);
|
||||
}
|
||||
{
|
||||
E TX, TZ, TW, TY;
|
||||
TX = Tl - Tw;
|
||||
TZ = TG - TB;
|
||||
TW = W[8];
|
||||
TY = W[9];
|
||||
T10 = FMA(TW, TX, TY * TZ);
|
||||
T12 = FNMS(TY, TX, TW * TZ);
|
||||
}
|
||||
Rp[WS(rs, 2)] = TV - T10;
|
||||
Ip[WS(rs, 2)] = T11 + T12;
|
||||
Rm[WS(rs, 2)] = TV + T10;
|
||||
Im[WS(rs, 2)] = T12 - T11;
|
||||
}
|
||||
/* Outputs at offset WS(rs, 1), using W[2]..W[5]. */
{
|
||||
E T1b, T1l, T1k, T1m;
|
||||
{
|
||||
E T16, T1a, T13, T17;
|
||||
T16 = T14 + T15;
|
||||
T1a = T18 + T19;
|
||||
T13 = W[2];
|
||||
T17 = W[3];
|
||||
T1b = FNMS(T17, T1a, T13 * T16);
|
||||
T1l = FMA(T17, T16, T13 * T1a);
|
||||
}
|
||||
{
|
||||
E T1f, T1j, T1c, T1g;
|
||||
T1f = T1d + T1e;
|
||||
T1j = T1h - T1i;
|
||||
T1c = W[4];
|
||||
T1g = W[5];
|
||||
T1k = FMA(T1c, T1f, T1g * T1j);
|
||||
T1m = FNMS(T1g, T1f, T1c * T1j);
|
||||
}
|
||||
Rp[WS(rs, 1)] = T1b - T1k;
|
||||
Ip[WS(rs, 1)] = T1l + T1m;
|
||||
Rm[WS(rs, 1)] = T1b + T1k;
|
||||
Im[WS(rs, 1)] = T1m - T1l;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Two-entry tw_instr table referenced by the hc2cbdft2_8 descriptor:
 * a TW_FULL entry with arguments (1, 8), then a TW_NEXT entry (1, 0) as the
 * last element.
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for hc2cbdft2_8 (#else branch): the number 8, the name string,
 * the twinstr table, &GENUS, and { 68, 18, 14, 0 }. The first three numbers
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * generator comment of this branch.
 */
static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, { 68, 18, 14, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cbdft2_8 kernel, its descriptor
 * and the HC2C_VIA_DFT constant to X(khc2c_register) for planner p.
 */
void X(codelet_hc2cbdft2_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
545
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft_10.c
Normal file
545
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft_10.c
Normal file
@@ -0,0 +1,545 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cbdft_10 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 122 FP additions, 72 FP multiplications,
|
||||
* (or, 68 additions, 18 multiplications, 54 fused multiply/add),
|
||||
* 91 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft_10 (branch compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated kernel of size 10.
 *
 * For each m in [mb, me) it loads five values from each of Rp, Ip, Rm and Im
 * (offsets 0 and WS(rs, 1..4)), combines them using the four DK constants,
 * multiplies by the eighteen coefficients W[0]..W[17] and stores the results
 * back to the same twenty locations.
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp/Ip advance by +ms
 *                 and Rm/Im by -ms after every iteration.
 * W:      read-only coefficient table; offset by (mb - 1) * 18 on entry and
 *         advanced by 18 per iteration.
 * rs:     stride handed to WS() to address elements 1..4.
 * mb, me: first and one-past-last iteration index.
 * ms:     per-iteration pointer step.
 */
static void hc2cbdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all five pointers are stepped in the loop header. */
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
E T3, Tl, Tu, T14, Ti, T13, Ts, Tt, T1p, T23, TZ, T1z, TQ, T1g, TV;
|
||||
E T1l, TT, TU, T1j, T1k, T1c, T1Y, TK, T1u;
|
||||
/* Load the ten Rp/Rm inputs and reduce them. */
{
|
||||
E Td, Tp, Tg, Tq, Th, Tr, T6, Tm, T9, Tn, Ta, To, T1, T2;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 4)];
|
||||
T3 = T1 + T2;
|
||||
Tl = T1 - T2;
|
||||
{
|
||||
E Tb, Tc, Te, Tf;
|
||||
Tb = Rp[WS(rs, 4)];
|
||||
Tc = Rm[0];
|
||||
Td = Tb + Tc;
|
||||
Tp = Tb - Tc;
|
||||
Te = Rm[WS(rs, 3)];
|
||||
Tf = Rp[WS(rs, 1)];
|
||||
Tg = Te + Tf;
|
||||
Tq = Te - Tf;
|
||||
}
|
||||
Th = Td + Tg;
|
||||
Tr = Tp + Tq;
|
||||
{
|
||||
E T4, T5, T7, T8;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 2)];
|
||||
T6 = T4 + T5;
|
||||
Tm = T4 - T5;
|
||||
T7 = Rm[WS(rs, 1)];
|
||||
T8 = Rp[WS(rs, 3)];
|
||||
T9 = T7 + T8;
|
||||
Tn = T7 - T8;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
To = Tm + Tn;
|
||||
Tu = To - Tr;
|
||||
T14 = Ta - Th;
|
||||
Ti = Ta + Th;
|
||||
T13 = FNMS(KP250000000, Ti, T3);
|
||||
Ts = To + Tr;
|
||||
Tt = FNMS(KP250000000, Ts, Tl);
|
||||
{
|
||||
E T1n, T1o, TX, TY;
|
||||
T1n = Td - Tg;
|
||||
T1o = T6 - T9;
|
||||
T1p = FNMS(KP618033988, T1o, T1n);
|
||||
T23 = FMA(KP618033988, T1n, T1o);
|
||||
TX = Tm - Tn;
|
||||
TY = Tp - Tq;
|
||||
TZ = FMA(KP618033988, TY, TX);
|
||||
T1z = FNMS(KP618033988, TX, TY);
|
||||
}
|
||||
}
|
||||
/* Load the ten Ip/Im inputs and reduce them the same way. */
{
|
||||
E TF, T16, TI, T17, TS, T1i, Ty, T19, TB, T1a, TR, T1h, TO, TP;
|
||||
TO = Ip[0];
|
||||
TP = Im[WS(rs, 4)];
|
||||
TQ = TO + TP;
|
||||
T1g = TO - TP;
|
||||
{
|
||||
E TD, TE, TG, TH;
|
||||
TD = Ip[WS(rs, 4)];
|
||||
TE = Im[0];
|
||||
TF = TD + TE;
|
||||
T16 = TD - TE;
|
||||
TG = Im[WS(rs, 3)];
|
||||
TH = Ip[WS(rs, 1)];
|
||||
TI = TG + TH;
|
||||
T17 = TH - TG;
|
||||
}
|
||||
TS = TF - TI;
|
||||
T1i = T16 + T17;
|
||||
{
|
||||
E Tw, Tx, Tz, TA;
|
||||
Tw = Ip[WS(rs, 2)];
|
||||
Tx = Im[WS(rs, 2)];
|
||||
Ty = Tw + Tx;
|
||||
T19 = Tw - Tx;
|
||||
Tz = Im[WS(rs, 1)];
|
||||
TA = Ip[WS(rs, 3)];
|
||||
TB = Tz + TA;
|
||||
T1a = TA - Tz;
|
||||
}
|
||||
TR = Ty - TB;
|
||||
T1h = T19 + T1a;
|
||||
TV = TR - TS;
|
||||
T1l = T1h - T1i;
|
||||
TT = TR + TS;
|
||||
TU = FNMS(KP250000000, TT, TQ);
|
||||
T1j = T1h + T1i;
|
||||
T1k = FNMS(KP250000000, T1j, T1g);
|
||||
{
|
||||
E T18, T1b, TC, TJ;
|
||||
T18 = T16 - T17;
|
||||
T1b = T19 - T1a;
|
||||
T1c = FNMS(KP618033988, T1b, T18);
|
||||
T1Y = FMA(KP618033988, T18, T1b);
|
||||
TC = Ty + TB;
|
||||
TJ = TF + TI;
|
||||
TK = FMA(KP618033988, TJ, TC);
|
||||
T1u = FNMS(KP618033988, TC, TJ);
|
||||
}
|
||||
}
|
||||
/* Apply W[0]..W[17] and store all twenty outputs. */
{
|
||||
E Tj, T2y, T2a, T1A, T2q, T10, T1Q, T24, T2k, T1q, T1K, T26, T28, T29, T2c;
|
||||
E Tk, TM, TN, T2w, T1M, T1O, T1P, T1S, T1s, T1w, T1x, T1C, T2m, T2o, T2p;
|
||||
E T2s, T12, T1e, T1f, T1E, T1G, T1I, T1J, T1U, T1W, T20, T21, T2e, T2g, T2i;
|
||||
E T2j, T2u, T1y, TW, T22, T2l, T2r;
|
||||
Tj = T3 + Ti;
|
||||
T2y = T1g + T1j;
|
||||
T2a = TQ + TT;
|
||||
T1y = FNMS(KP559016994, TV, TU);
|
||||
T1A = FMA(KP951056516, T1z, T1y);
|
||||
T2q = FNMS(KP951056516, T1z, T1y);
|
||||
TW = FMA(KP559016994, TV, TU);
|
||||
T10 = FMA(KP951056516, TZ, TW);
|
||||
T1Q = FNMS(KP951056516, TZ, TW);
|
||||
T22 = FMA(KP559016994, T1l, T1k);
|
||||
T24 = FNMS(KP951056516, T23, T22);
|
||||
T2k = FMA(KP951056516, T23, T22);
|
||||
{
|
||||
E T1m, T1v, T2n, T1t;
|
||||
T1m = FNMS(KP559016994, T1l, T1k);
|
||||
T1q = FNMS(KP951056516, T1p, T1m);
|
||||
T1K = FMA(KP951056516, T1p, T1m);
|
||||
{
|
||||
E T27, TL, T1N, Tv;
|
||||
T27 = Tl + Ts;
|
||||
T26 = W[9];
|
||||
T28 = T26 * T27;
|
||||
T29 = W[8];
|
||||
T2c = T29 * T27;
|
||||
Tv = FMA(KP559016994, Tu, Tt);
|
||||
TL = FNMS(KP951056516, TK, Tv);
|
||||
T1N = FMA(KP951056516, TK, Tv);
|
||||
Tk = W[1];
|
||||
TM = Tk * TL;
|
||||
TN = W[0];
|
||||
T2w = TN * TL;
|
||||
T1M = W[17];
|
||||
T1O = T1M * T1N;
|
||||
T1P = W[16];
|
||||
T1S = T1P * T1N;
|
||||
}
|
||||
T1t = FNMS(KP559016994, Tu, Tt);
|
||||
T1v = FNMS(KP951056516, T1u, T1t);
|
||||
T2n = FMA(KP951056516, T1u, T1t);
|
||||
T1s = W[5];
|
||||
T1w = T1s * T1v;
|
||||
T1x = W[4];
|
||||
T1C = T1x * T1v;
|
||||
T2m = W[13];
|
||||
T2o = T2m * T2n;
|
||||
T2p = W[12];
|
||||
T2s = T2p * T2n;
|
||||
{
|
||||
E T1d, T1H, T15, T1Z, T2h, T1X;
|
||||
T15 = FNMS(KP559016994, T14, T13);
|
||||
T1d = FMA(KP951056516, T1c, T15);
|
||||
T1H = FNMS(KP951056516, T1c, T15);
|
||||
T12 = W[2];
|
||||
T1e = T12 * T1d;
|
||||
T1f = W[3];
|
||||
T1E = T1f * T1d;
|
||||
T1G = W[14];
|
||||
T1I = T1G * T1H;
|
||||
T1J = W[15];
|
||||
T1U = T1J * T1H;
|
||||
T1X = FMA(KP559016994, T14, T13);
|
||||
T1Z = FMA(KP951056516, T1Y, T1X);
|
||||
T2h = FNMS(KP951056516, T1Y, T1X);
|
||||
T1W = W[6];
|
||||
T20 = T1W * T1Z;
|
||||
T21 = W[7];
|
||||
T2e = T21 * T1Z;
|
||||
T2g = W[10];
|
||||
T2i = T2g * T2h;
|
||||
T2j = W[11];
|
||||
T2u = T2j * T2h;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T11, T2x, T1r, T1B;
|
||||
T11 = FMA(TN, T10, TM);
|
||||
Rp[0] = Tj - T11;
|
||||
Rm[0] = Tj + T11;
|
||||
T2x = FNMS(Tk, T10, T2w);
|
||||
Im[0] = T2x - T2y;
|
||||
Ip[0] = T2x + T2y;
|
||||
T1r = FNMS(T1f, T1q, T1e);
|
||||
T1B = FMA(T1x, T1A, T1w);
|
||||
Rp[WS(rs, 1)] = T1r - T1B;
|
||||
Rm[WS(rs, 1)] = T1B + T1r;
|
||||
{
|
||||
E T1D, T1F, T1L, T1R;
|
||||
T1D = FNMS(T1s, T1A, T1C);
|
||||
T1F = FMA(T12, T1q, T1E);
|
||||
Im[WS(rs, 1)] = T1D - T1F;
|
||||
Ip[WS(rs, 1)] = T1D + T1F;
|
||||
T1L = FNMS(T1J, T1K, T1I);
|
||||
T1R = FMA(T1P, T1Q, T1O);
|
||||
Rp[WS(rs, 4)] = T1L - T1R;
|
||||
Rm[WS(rs, 4)] = T1R + T1L;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1T, T1V, T2t, T2v;
|
||||
T1T = FNMS(T1M, T1Q, T1S);
|
||||
T1V = FMA(T1G, T1K, T1U);
|
||||
Im[WS(rs, 4)] = T1T - T1V;
|
||||
Ip[WS(rs, 4)] = T1T + T1V;
|
||||
T2t = FNMS(T2m, T2q, T2s);
|
||||
T2v = FMA(T2g, T2k, T2u);
|
||||
Im[WS(rs, 3)] = T2t - T2v;
|
||||
Ip[WS(rs, 3)] = T2t + T2v;
|
||||
}
|
||||
T2l = FNMS(T2j, T2k, T2i);
|
||||
T2r = FMA(T2p, T2q, T2o);
|
||||
Rp[WS(rs, 3)] = T2l - T2r;
|
||||
Rm[WS(rs, 3)] = T2r + T2l;
|
||||
{
|
||||
E T25, T2b, T2d, T2f;
|
||||
T25 = FNMS(T21, T24, T20);
|
||||
T2b = FMA(T29, T2a, T28);
|
||||
Rp[WS(rs, 2)] = T25 - T2b;
|
||||
Rm[WS(rs, 2)] = T2b + T25;
|
||||
T2d = FNMS(T26, T2a, T2c);
|
||||
T2f = FMA(T1W, T24, T2e);
|
||||
Im[WS(rs, 2)] = T2d - T2f;
|
||||
Ip[WS(rs, 2)] = T2d + T2f;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Two-entry tw_instr table referenced by the hc2cbdft_10 descriptor:
 * a TW_FULL entry with arguments (1, 10), then a TW_NEXT entry (1, 0) as the
 * last element.
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for hc2cbdft_10 (FMA branch): the number 10, the name string,
 * the twinstr table, &GENUS, and { 68, 18, 54, 0 }. The first three numbers
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * generator comment of this branch.
 */
static const hc2c_desc desc = { 10, "hc2cbdft_10", twinstr, &GENUS, { 68, 18, 54, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cbdft_10 kernel, its descriptor
 * and the HC2C_VIA_DFT constant to X(khc2c_register) for planner p.
 */
void X(codelet_hc2cbdft_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft_10, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cbdft_10 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 122 FP additions, 60 FP multiplications,
|
||||
* (or, 92 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 61 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft_10 (#else branch, generated without -fma): genfft-generated
 * kernel of size 10.
 *
 * For each m in [mb, me) it loads five values from each of Rp, Ip, Rm and Im
 * (offsets 0 and WS(rs, 1..4)), combines them using the four DK constants,
 * applies the eighteen coefficients W[0]..W[17] and stores the results back
 * to the same twenty locations.
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp/Ip advance by +ms
 *                 and Rm/Im by -ms after every iteration.
 * W:      read-only coefficient table; offset by (mb - 1) * 18 on entry and
 *         advanced by 18 per iteration.
 * rs:     stride handed to WS() to address elements 1..4.
 * mb, me: first and one-past-last iteration index.
 * ms:     per-iteration pointer step.
 */
static void hc2cbdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all five pointers are stepped in the loop header. */
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
E T3, TS, TR, T13, Ti, T12, TT, TU, T1g, T1T, Tr, T1s, TJ, T1h, TG;
|
||||
E T1m, TK, TL, T1k, T1l, T1b, T1P, TY, T1w;
|
||||
/* Load the ten Rp/Rm inputs and reduce them. */
{
|
||||
E Td, To, Tg, Tp, Th, TQ, T6, Tl, T9, Tm, Ta, TP, T1, T2;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 4)];
|
||||
T3 = T1 + T2;
|
||||
TS = T1 - T2;
|
||||
{
|
||||
E Tb, Tc, Te, Tf;
|
||||
Tb = Rp[WS(rs, 4)];
|
||||
Tc = Rm[0];
|
||||
Td = Tb + Tc;
|
||||
To = Tb - Tc;
|
||||
Te = Rm[WS(rs, 3)];
|
||||
Tf = Rp[WS(rs, 1)];
|
||||
Tg = Te + Tf;
|
||||
Tp = Te - Tf;
|
||||
}
|
||||
Th = Td + Tg;
|
||||
TQ = To + Tp;
|
||||
{
|
||||
E T4, T5, T7, T8;
|
||||
T4 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 2)];
|
||||
T6 = T4 + T5;
|
||||
Tl = T4 - T5;
|
||||
T7 = Rm[WS(rs, 1)];
|
||||
T8 = Rp[WS(rs, 3)];
|
||||
T9 = T7 + T8;
|
||||
Tm = T7 - T8;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
TP = Tl + Tm;
|
||||
TR = KP559016994 * (TP - TQ);
|
||||
T13 = KP559016994 * (Ta - Th);
|
||||
Ti = Ta + Th;
|
||||
T12 = FNMS(KP250000000, Ti, T3);
|
||||
TT = TP + TQ;
|
||||
TU = FNMS(KP250000000, TT, TS);
|
||||
{
|
||||
E T1e, T1f, Tn, Tq;
|
||||
T1e = T6 - T9;
|
||||
T1f = Td - Tg;
|
||||
T1g = FNMS(KP951056516, T1f, KP587785252 * T1e);
|
||||
T1T = FMA(KP951056516, T1e, KP587785252 * T1f);
|
||||
Tn = Tl - Tm;
|
||||
Tq = To - Tp;
|
||||
Tr = FMA(KP951056516, Tn, KP587785252 * Tq);
|
||||
T1s = FNMS(KP951056516, Tq, KP587785252 * Tn);
|
||||
}
|
||||
}
|
||||
/* Load the ten Ip/Im inputs and reduce them the same way. */
{
|
||||
E TB, T18, TE, T19, TF, T1j, Tu, T15, Tx, T16, Ty, T1i, TH, TI;
|
||||
TH = Ip[0];
|
||||
TI = Im[WS(rs, 4)];
|
||||
TJ = TH + TI;
|
||||
T1h = TH - TI;
|
||||
{
|
||||
E Tz, TA, TC, TD;
|
||||
Tz = Ip[WS(rs, 4)];
|
||||
TA = Im[0];
|
||||
TB = Tz + TA;
|
||||
T18 = Tz - TA;
|
||||
TC = Im[WS(rs, 3)];
|
||||
TD = Ip[WS(rs, 1)];
|
||||
TE = TC + TD;
|
||||
T19 = TD - TC;
|
||||
}
|
||||
TF = TB - TE;
|
||||
T1j = T18 + T19;
|
||||
{
|
||||
E Ts, Tt, Tv, Tw;
|
||||
Ts = Ip[WS(rs, 2)];
|
||||
Tt = Im[WS(rs, 2)];
|
||||
Tu = Ts + Tt;
|
||||
T15 = Ts - Tt;
|
||||
Tv = Im[WS(rs, 1)];
|
||||
Tw = Ip[WS(rs, 3)];
|
||||
Tx = Tv + Tw;
|
||||
T16 = Tw - Tv;
|
||||
}
|
||||
Ty = Tu - Tx;
|
||||
T1i = T15 + T16;
|
||||
TG = KP559016994 * (Ty - TF);
|
||||
T1m = KP559016994 * (T1i - T1j);
|
||||
TK = Ty + TF;
|
||||
TL = FNMS(KP250000000, TK, TJ);
|
||||
T1k = T1i + T1j;
|
||||
T1l = FNMS(KP250000000, T1k, T1h);
|
||||
{
|
||||
E T17, T1a, TW, TX;
|
||||
T17 = T15 - T16;
|
||||
T1a = T18 - T19;
|
||||
T1b = FNMS(KP951056516, T1a, KP587785252 * T17);
|
||||
T1P = FMA(KP951056516, T17, KP587785252 * T1a);
|
||||
TW = Tu + Tx;
|
||||
TX = TB + TE;
|
||||
TY = FMA(KP951056516, TW, KP587785252 * TX);
|
||||
T1w = FNMS(KP951056516, TX, KP587785252 * TW);
|
||||
}
|
||||
}
|
||||
/* Apply W[0]..W[17] and store all twenty outputs. */
{
|
||||
E Tj, T2g, TN, T1H, T1U, T26, TZ, T1J, T1Q, T24, T1c, T1C, T1t, T29, T1o;
|
||||
E T1E, T1x, T2b, T20, T21, TM, T1S, TV;
|
||||
Tj = T3 + Ti;
|
||||
T2g = T1h + T1k;
|
||||
TM = TG + TL;
|
||||
TN = Tr + TM;
|
||||
T1H = TM - Tr;
|
||||
T1S = T1m + T1l;
|
||||
T1U = T1S - T1T;
|
||||
T26 = T1T + T1S;
|
||||
TV = TR + TU;
|
||||
TZ = TV - TY;
|
||||
T1J = TV + TY;
|
||||
{
|
||||
E T1O, T14, T1r, T1n, T1v;
|
||||
T1O = T13 + T12;
|
||||
T1Q = T1O + T1P;
|
||||
T24 = T1O - T1P;
|
||||
T14 = T12 - T13;
|
||||
T1c = T14 - T1b;
|
||||
T1C = T14 + T1b;
|
||||
T1r = TL - TG;
|
||||
T1t = T1r - T1s;
|
||||
T29 = T1s + T1r;
|
||||
T1n = T1l - T1m;
|
||||
T1o = T1g + T1n;
|
||||
T1E = T1n - T1g;
|
||||
T1v = TU - TR;
|
||||
T1x = T1v + T1w;
|
||||
T2b = T1v - T1w;
|
||||
{
|
||||
E T1X, T1Z, T1W, T1Y;
|
||||
T1X = TS + TT;
|
||||
T1Z = TJ + TK;
|
||||
T1W = W[9];
|
||||
T1Y = W[8];
|
||||
T20 = FMA(T1W, T1X, T1Y * T1Z);
|
||||
T21 = FNMS(T1W, T1Z, T1Y * T1X);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T10, T2f, Tk, TO;
|
||||
Tk = W[0];
|
||||
TO = W[1];
|
||||
T10 = FMA(Tk, TN, TO * TZ);
|
||||
T2f = FNMS(TO, TN, Tk * TZ);
|
||||
Rp[0] = Tj - T10;
|
||||
Ip[0] = T2f + T2g;
|
||||
Rm[0] = Tj + T10;
|
||||
Im[0] = T2f - T2g;
|
||||
}
|
||||
{
|
||||
E T1V, T22, T1N, T1R;
|
||||
T1N = W[6];
|
||||
T1R = W[7];
|
||||
T1V = FNMS(T1R, T1U, T1N * T1Q);
|
||||
T22 = FMA(T1R, T1Q, T1N * T1U);
|
||||
Rp[WS(rs, 2)] = T1V - T20;
|
||||
Ip[WS(rs, 2)] = T21 + T22;
|
||||
Rm[WS(rs, 2)] = T20 + T1V;
|
||||
Im[WS(rs, 2)] = T21 - T22;
|
||||
}
|
||||
{
|
||||
E T1p, T1A, T1y, T1z;
|
||||
{
|
||||
E T11, T1d, T1q, T1u;
|
||||
T11 = W[2];
|
||||
T1d = W[3];
|
||||
T1p = FNMS(T1d, T1o, T11 * T1c);
|
||||
T1A = FMA(T1d, T1c, T11 * T1o);
|
||||
T1q = W[4];
|
||||
T1u = W[5];
|
||||
T1y = FMA(T1q, T1t, T1u * T1x);
|
||||
T1z = FNMS(T1u, T1t, T1q * T1x);
|
||||
}
|
||||
Rp[WS(rs, 1)] = T1p - T1y;
|
||||
Ip[WS(rs, 1)] = T1z + T1A;
|
||||
Rm[WS(rs, 1)] = T1y + T1p;
|
||||
Im[WS(rs, 1)] = T1z - T1A;
|
||||
}
|
||||
{
|
||||
E T1F, T1M, T1K, T1L;
|
||||
{
|
||||
E T1B, T1D, T1G, T1I;
|
||||
T1B = W[14];
|
||||
T1D = W[15];
|
||||
T1F = FNMS(T1D, T1E, T1B * T1C);
|
||||
T1M = FMA(T1D, T1C, T1B * T1E);
|
||||
T1G = W[16];
|
||||
T1I = W[17];
|
||||
T1K = FMA(T1G, T1H, T1I * T1J);
|
||||
T1L = FNMS(T1I, T1H, T1G * T1J);
|
||||
}
|
||||
Rp[WS(rs, 4)] = T1F - T1K;
|
||||
Ip[WS(rs, 4)] = T1L + T1M;
|
||||
Rm[WS(rs, 4)] = T1K + T1F;
|
||||
Im[WS(rs, 4)] = T1L - T1M;
|
||||
}
|
||||
{
|
||||
E T27, T2e, T2c, T2d;
|
||||
{
|
||||
E T23, T25, T28, T2a;
|
||||
T23 = W[10];
|
||||
T25 = W[11];
|
||||
T27 = FNMS(T25, T26, T23 * T24);
|
||||
T2e = FMA(T25, T24, T23 * T26);
|
||||
T28 = W[12];
|
||||
T2a = W[13];
|
||||
T2c = FMA(T28, T29, T2a * T2b);
|
||||
T2d = FNMS(T2a, T29, T28 * T2b);
|
||||
}
|
||||
Rp[WS(rs, 3)] = T27 - T2c;
|
||||
Ip[WS(rs, 3)] = T2d + T2e;
|
||||
Rm[WS(rs, 3)] = T2c + T27;
|
||||
Im[WS(rs, 3)] = T2d - T2e;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by `desc` below: one TW_FULL entry
 * with arguments (1, 10) followed by one TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr and the TW_* opcodes are declared elsewhere;
 * presumably TW_NEXT marks the end of the list -- confirm. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor handed to X(khc2c_register) below: the value 10, the
 * name string, the twiddle table, &GENUS and the four numbers {92, 30, 30, 0}.
 * NOTE(review): presumably the four numbers are add / mul / fma / other
 * operation counts -- confirm against the hc2c_desc declaration. */
static const hc2c_desc desc = { 10, "hc2cbdft_10", twinstr, &GENUS, { 92, 30, 30, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hc2cbdft_10 kernel and its descriptor
 * to planner p through X(khc2c_register), tagged HC2C_VIA_DFT. */
void X(codelet_hc2cbdft_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft_10, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
643
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft_12.c
Normal file
643
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft_12.c
Normal file
@@ -0,0 +1,643 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cbdft_12 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 142 FP additions, 68 FP multiplications,
|
||||
* (or, 96 additions, 22 multiplications, 46 fused multiply/add),
|
||||
* 55 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft_12, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated size-12 codelet
 * (generator flags -sign 1 -n 12 -dif, see the "Generated by" comment above).
 *
 * Rp, Ip, Rm, Im: arrays that are read and then overwritten in place at
 *                 element offsets WS(rs, 0) .. WS(rs, 5).
 * W:  twiddle table; 22 values (W[0] .. W[21]) are used per iteration.
 * rs: stride handed to WS() to form element offsets.
 * mb, me: the loop runs for m = mb while m < me.
 * ms: per-iteration pointer step; Rp and Ip move by +ms, Rm and Im by -ms.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA and FNMS come from the
 * included headers and are not visible here; FMA(a, b, c) / FNMS(a, b, c)
 * are presumably a * b + c and c - a * b -- confirm against the headers.
 */
static void hc2cbdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* numerically sqrt(3)/2 */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 22, then advanced by 22 per iteration. */
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
E Tv, TC, TD, T1L, T1M, T2y, Tb, T1Z, T1E, T2D, T1e, T1U, TY, T2o, T13;
|
||||
E T18, T19, T1O, T1P, T2E, Tm, T1V, T1H, T2z, T1h, T20, TO, T2p;
|
||||
/* Load Rp/Ip at offsets 0, 2, 4 and Rm/Im at offsets 1, 3, 5 and combine them. */
{
|
||||
E T1, T4, Tu, TS, Tp, Ts, Tt, TT, T6, T9, TB, TV, Tw, Tz, TA;
|
||||
E TW;
|
||||
{
|
||||
E T2, T3, Tq, Tr;
|
||||
T1 = Rp[0];
|
||||
T2 = Rp[WS(rs, 4)];
|
||||
T3 = Rm[WS(rs, 3)];
|
||||
T4 = T2 + T3;
|
||||
Tu = T2 - T3;
|
||||
TS = FNMS(KP500000000, T4, T1);
|
||||
Tp = Ip[0];
|
||||
Tq = Ip[WS(rs, 4)];
|
||||
Tr = Im[WS(rs, 3)];
|
||||
Ts = Tq - Tr;
|
||||
Tt = FNMS(KP500000000, Ts, Tp);
|
||||
TT = Tr + Tq;
|
||||
}
|
||||
{
|
||||
E T7, T8, Tx, Ty;
|
||||
T6 = Rm[WS(rs, 5)];
|
||||
T7 = Rm[WS(rs, 1)];
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = T7 + T8;
|
||||
TB = T7 - T8;
|
||||
TV = FNMS(KP500000000, T9, T6);
|
||||
Tw = Im[WS(rs, 5)];
|
||||
Tx = Im[WS(rs, 1)];
|
||||
Ty = Ip[WS(rs, 2)];
|
||||
Tz = Tx - Ty;
|
||||
TA = FNMS(KP500000000, Tz, Tw);
|
||||
TW = Tx + Ty;
|
||||
}
|
||||
{
|
||||
E T5, Ta, T1C, T1D;
|
||||
Tv = FMA(KP866025403, Tu, Tt);
|
||||
TC = FNMS(KP866025403, TB, TA);
|
||||
TD = Tv + TC;
|
||||
T1L = FNMS(KP866025403, Tu, Tt);
|
||||
T1M = FMA(KP866025403, TB, TA);
|
||||
T2y = T1L + T1M;
|
||||
T5 = T1 + T4;
|
||||
Ta = T6 + T9;
|
||||
Tb = T5 + Ta;
|
||||
T1Z = T5 - Ta;
|
||||
T1C = FMA(KP866025403, TT, TS);
|
||||
T1D = FNMS(KP866025403, TW, TV);
|
||||
T1E = T1C + T1D;
|
||||
T2D = T1C - T1D;
|
||||
{
|
||||
E T1c, T1d, TU, TX;
|
||||
T1c = Tp + Ts;
|
||||
T1d = Tw + Tz;
|
||||
T1e = T1c - T1d;
|
||||
T1U = T1c + T1d;
|
||||
TU = FNMS(KP866025403, TT, TS);
|
||||
TX = FMA(KP866025403, TW, TV);
|
||||
TY = TU - TX;
|
||||
T2o = TU + TX;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Load Rp/Ip at offsets 1, 3, 5 and Rm/Im at offsets 0, 2, 4 and combine them. */
{
|
||||
E Tc, Tf, TE, T12, TZ, T10, TH, T11, Th, Tk, TJ, T17, T14, T15, TM;
|
||||
E T16;
|
||||
{
|
||||
E Td, Te, TF, TG;
|
||||
Tc = Rp[WS(rs, 3)];
|
||||
Td = Rm[WS(rs, 4)];
|
||||
Te = Rm[0];
|
||||
Tf = Td + Te;
|
||||
TE = FNMS(KP500000000, Tf, Tc);
|
||||
T12 = Td - Te;
|
||||
TZ = Ip[WS(rs, 3)];
|
||||
TF = Im[WS(rs, 4)];
|
||||
TG = Im[0];
|
||||
T10 = TF + TG;
|
||||
TH = TF - TG;
|
||||
T11 = FMA(KP500000000, T10, TZ);
|
||||
}
|
||||
{
|
||||
E Ti, Tj, TK, TL;
|
||||
Th = Rm[WS(rs, 2)];
|
||||
Ti = Rp[WS(rs, 1)];
|
||||
Tj = Rp[WS(rs, 5)];
|
||||
Tk = Ti + Tj;
|
||||
TJ = FNMS(KP500000000, Tk, Th);
|
||||
T17 = Ti - Tj;
|
||||
T14 = Im[WS(rs, 2)];
|
||||
TK = Ip[WS(rs, 5)];
|
||||
TL = Ip[WS(rs, 1)];
|
||||
T15 = TK + TL;
|
||||
TM = TK - TL;
|
||||
T16 = FMA(KP500000000, T15, T14);
|
||||
}
|
||||
{
|
||||
E Tg, Tl, T1F, T1G;
|
||||
T13 = FMA(KP866025403, T12, T11);
|
||||
T18 = FNMS(KP866025403, T17, T16);
|
||||
T19 = T13 + T18;
|
||||
T1O = FNMS(KP866025403, T12, T11);
|
||||
T1P = FMA(KP866025403, T17, T16);
|
||||
T2E = T1O + T1P;
|
||||
Tg = Tc + Tf;
|
||||
Tl = Th + Tk;
|
||||
Tm = Tg + Tl;
|
||||
T1V = Tg - Tl;
|
||||
T1F = FNMS(KP866025403, TH, TE);
|
||||
T1G = FNMS(KP866025403, TM, TJ);
|
||||
T1H = T1F + T1G;
|
||||
T2z = T1F - T1G;
|
||||
{
|
||||
E T1f, T1g, TI, TN;
|
||||
T1f = TZ - T10;
|
||||
T1g = T15 - T14;
|
||||
T1h = T1f + T1g;
|
||||
T20 = T1f - T1g;
|
||||
TI = FMA(KP866025403, TH, TE);
|
||||
TN = FMA(KP866025403, TM, TJ);
|
||||
TO = TI - TN;
|
||||
T2p = TI + TN;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Store offset 0, using twiddles W[0] and W[1]. */
{
|
||||
E Tn, T1i, TP, T1a, TQ, T1j, To, T1b, T1k, TR;
|
||||
Tn = Tb + Tm;
|
||||
T1i = T1e + T1h;
|
||||
TP = TD + TO;
|
||||
T1a = TY - T19;
|
||||
To = W[0];
|
||||
TQ = To * TP;
|
||||
T1j = To * T1a;
|
||||
TR = W[1];
|
||||
T1b = FMA(TR, T1a, TQ);
|
||||
T1k = FNMS(TR, TP, T1j);
|
||||
Rp[0] = Tn - T1b;
|
||||
Ip[0] = T1i + T1k;
|
||||
Rm[0] = Tn + T1b;
|
||||
Im[0] = T1k - T1i;
|
||||
}
|
||||
/* Store offset WS(rs, 3), using twiddles W[10] .. W[13]. */
{
|
||||
E T1p, T1l, T1n, T1o, T1x, T1s, T1v, T1t, T1z, T1m, T1r;
|
||||
T1p = T1e - T1h;
|
||||
T1m = Tb - Tm;
|
||||
T1l = W[10];
|
||||
T1n = T1l * T1m;
|
||||
T1o = W[11];
|
||||
T1x = T1o * T1m;
|
||||
T1s = TD - TO;
|
||||
T1v = TY + T19;
|
||||
T1r = W[12];
|
||||
T1t = T1r * T1s;
|
||||
T1z = T1r * T1v;
|
||||
{
|
||||
E T1q, T1y, T1w, T1A, T1u;
|
||||
T1q = FNMS(T1o, T1p, T1n);
|
||||
T1y = FMA(T1l, T1p, T1x);
|
||||
T1u = W[13];
|
||||
T1w = FMA(T1u, T1v, T1t);
|
||||
T1A = FNMS(T1u, T1s, T1z);
|
||||
Rp[WS(rs, 3)] = T1q - T1w;
|
||||
Ip[WS(rs, 3)] = T1y + T1A;
|
||||
Rm[WS(rs, 3)] = T1q + T1w;
|
||||
Im[WS(rs, 3)] = T1A - T1y;
|
||||
}
|
||||
}
|
||||
/* Store offsets WS(rs, 1) (twiddles W[2] .. W[5]) and WS(rs, 4) (W[14] .. W[17]). */
{
|
||||
E T1R, T2b, T27, T29, T2a, T2l, T1B, T1J, T1K, T25, T1W, T21, T1X, T23, T2e;
|
||||
E T2h, T2f, T2j;
|
||||
{
|
||||
E T1N, T1Q, T28, T1I, T1T, T2d;
|
||||
T1N = T1L - T1M;
|
||||
T1Q = T1O - T1P;
|
||||
T1R = T1N - T1Q;
|
||||
T2b = T1N + T1Q;
|
||||
T28 = T1E + T1H;
|
||||
T27 = W[14];
|
||||
T29 = T27 * T28;
|
||||
T2a = W[15];
|
||||
T2l = T2a * T28;
|
||||
T1I = T1E - T1H;
|
||||
T1B = W[2];
|
||||
T1J = T1B * T1I;
|
||||
T1K = W[3];
|
||||
T25 = T1K * T1I;
|
||||
T1W = T1U - T1V;
|
||||
T21 = T1Z + T20;
|
||||
T1T = W[4];
|
||||
T1X = T1T * T1W;
|
||||
T23 = T1T * T21;
|
||||
T2e = T1V + T1U;
|
||||
T2h = T1Z - T20;
|
||||
T2d = W[16];
|
||||
T2f = T2d * T2e;
|
||||
T2j = T2d * T2h;
|
||||
}
|
||||
{
|
||||
E T1S, T26, T22, T24, T1Y;
|
||||
T1S = FNMS(T1K, T1R, T1J);
|
||||
T26 = FMA(T1B, T1R, T25);
|
||||
T1Y = W[5];
|
||||
T22 = FMA(T1Y, T21, T1X);
|
||||
T24 = FNMS(T1Y, T1W, T23);
|
||||
Rp[WS(rs, 1)] = T1S - T22;
|
||||
Ip[WS(rs, 1)] = T24 + T26;
|
||||
Rm[WS(rs, 1)] = T22 + T1S;
|
||||
Im[WS(rs, 1)] = T24 - T26;
|
||||
}
|
||||
{
|
||||
E T2c, T2m, T2i, T2k, T2g;
|
||||
T2c = FNMS(T2a, T2b, T29);
|
||||
T2m = FMA(T27, T2b, T2l);
|
||||
T2g = W[17];
|
||||
T2i = FMA(T2g, T2h, T2f);
|
||||
T2k = FNMS(T2g, T2e, T2j);
|
||||
Rp[WS(rs, 4)] = T2c - T2i;
|
||||
Ip[WS(rs, 4)] = T2k + T2m;
|
||||
Rm[WS(rs, 4)] = T2i + T2c;
|
||||
Im[WS(rs, 4)] = T2k - T2m;
|
||||
}
|
||||
}
|
||||
/* Store offsets WS(rs, 2) (twiddles W[6] .. W[9]) and WS(rs, 5) (W[18] .. W[21]). */
{
|
||||
E T2v, T2P, T2L, T2N, T2O, T2X, T2n, T2r, T2s, T2H, T2A, T2F, T2B, T2J, T2S;
|
||||
E T2V, T2T, T2Z;
|
||||
{
|
||||
E T2t, T2u, T2M, T2q, T2x, T2R;
|
||||
T2t = Tv - TC;
|
||||
T2u = T13 - T18;
|
||||
T2v = T2t + T2u;
|
||||
T2P = T2t - T2u;
|
||||
T2M = T2o - T2p;
|
||||
T2L = W[18];
|
||||
T2N = T2L * T2M;
|
||||
T2O = W[19];
|
||||
T2X = T2O * T2M;
|
||||
T2q = T2o + T2p;
|
||||
T2n = W[6];
|
||||
T2r = T2n * T2q;
|
||||
T2s = W[7];
|
||||
T2H = T2s * T2q;
|
||||
T2A = T2y + T2z;
|
||||
T2F = T2D - T2E;
|
||||
T2x = W[8];
|
||||
T2B = T2x * T2A;
|
||||
T2J = T2x * T2F;
|
||||
T2S = T2y - T2z;
|
||||
T2V = T2D + T2E;
|
||||
T2R = W[20];
|
||||
T2T = T2R * T2S;
|
||||
T2Z = T2R * T2V;
|
||||
}
|
||||
{
|
||||
E T2w, T2I, T2G, T2K, T2C;
|
||||
T2w = FNMS(T2s, T2v, T2r);
|
||||
T2I = FMA(T2n, T2v, T2H);
|
||||
T2C = W[9];
|
||||
T2G = FMA(T2C, T2F, T2B);
|
||||
T2K = FNMS(T2C, T2A, T2J);
|
||||
Rp[WS(rs, 2)] = T2w - T2G;
|
||||
Ip[WS(rs, 2)] = T2I + T2K;
|
||||
Rm[WS(rs, 2)] = T2w + T2G;
|
||||
Im[WS(rs, 2)] = T2K - T2I;
|
||||
}
|
||||
{
|
||||
E T2Q, T2Y, T2W, T30, T2U;
|
||||
T2Q = FNMS(T2O, T2P, T2N);
|
||||
T2Y = FMA(T2L, T2P, T2X);
|
||||
T2U = W[21];
|
||||
T2W = FMA(T2U, T2V, T2T);
|
||||
T30 = FNMS(T2U, T2S, T2Z);
|
||||
Rp[WS(rs, 5)] = T2Q - T2W;
|
||||
Ip[WS(rs, 5)] = T2Y + T30;
|
||||
Rm[WS(rs, 5)] = T2Q + T2W;
|
||||
Im[WS(rs, 5)] = T30 - T2Y;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by `desc` below: one TW_FULL entry
 * with arguments (1, 12) followed by one TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr and the TW_* opcodes are declared elsewhere;
 * presumably TW_NEXT marks the end of the list -- confirm. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor for the FMA variant. The first three numbers of
 * {96, 22, 46, 0} equal the additions / multiplications / fused multiply-add
 * counts quoted in the generated comment at the top of this #if branch. */
static const hc2c_desc desc = { 12, "hc2cbdft_12", twinstr, &GENUS, { 96, 22, 46, 0 } };
|
||||
|
||||
/* Registration entry point (FMA branch): hands the hc2cbdft_12 kernel and its
 * descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_DFT. */
void X(codelet_hc2cbdft_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft_12, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cbdft_12 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 142 FP additions, 60 FP multiplications,
|
||||
* (or, 112 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft_12, non-FMA variant (the #else branch, compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined):
 * genfft-generated size-12 codelet (generator flags -sign 1 -n 12 -dif,
 * see the "Generated by" comment above).
 *
 * Rp, Ip, Rm, Im: arrays that are read and then overwritten in place at
 *                 element offsets WS(rs, 0) .. WS(rs, 5).
 * W:  twiddle table; 22 values (W[0] .. W[21]) are used per iteration.
 * rs: stride handed to WS() to form element offsets.
 * mb, me: the loop runs for m = mb while m < me.
 * ms: per-iteration pointer step; Rp and Ip move by +ms, Rm and Im by -ms.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FMS and FNMS come from the
 * included headers and are not visible here; FMA(a, b, c) / FMS(a, b, c) /
 * FNMS(a, b, c) are presumably a * b + c, a * b - c and c - a * b --
 * confirm against the headers.
 */
static void hc2cbdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
/* numerically sqrt(3)/2 */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 22, then advanced by 22 per iteration. */
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
E Tv, T1E, TC, T1F, TW, T1x, TT, T1w, T1d, T1N, Tb, T1R, TI, T1z, TN;
|
||||
E T1A, T17, T1I, T12, T1H, T1g, T1S, Tm, T1O;
|
||||
/* Load Rp/Ip at offsets 0, 2, 4 and Rm/Im at offsets 1, 3, 5 and combine them. */
{
|
||||
E T1, Tq, T6, TA, T4, Tp, Tt, TS, T9, Tw, Tz, TV;
|
||||
T1 = Rp[0];
|
||||
Tq = Ip[0];
|
||||
T6 = Rm[WS(rs, 5)];
|
||||
TA = Im[WS(rs, 5)];
|
||||
{
|
||||
E T2, T3, Tr, Ts;
|
||||
T2 = Rp[WS(rs, 4)];
|
||||
T3 = Rm[WS(rs, 3)];
|
||||
T4 = T2 + T3;
|
||||
Tp = KP866025403 * (T2 - T3);
|
||||
Tr = Im[WS(rs, 3)];
|
||||
Ts = Ip[WS(rs, 4)];
|
||||
Tt = Tr - Ts;
|
||||
TS = KP866025403 * (Tr + Ts);
|
||||
}
|
||||
{
|
||||
E T7, T8, Tx, Ty;
|
||||
T7 = Rm[WS(rs, 1)];
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = T7 + T8;
|
||||
Tw = KP866025403 * (T7 - T8);
|
||||
Tx = Im[WS(rs, 1)];
|
||||
Ty = Ip[WS(rs, 2)];
|
||||
Tz = Tx - Ty;
|
||||
TV = KP866025403 * (Tx + Ty);
|
||||
}
|
||||
{
|
||||
E Tu, TB, TU, TR;
|
||||
Tu = FMA(KP500000000, Tt, Tq);
|
||||
Tv = Tp + Tu;
|
||||
T1E = Tu - Tp;
|
||||
TB = FMS(KP500000000, Tz, TA);
|
||||
TC = Tw + TB;
|
||||
T1F = TB - Tw;
|
||||
TU = FNMS(KP500000000, T9, T6);
|
||||
TW = TU + TV;
|
||||
T1x = TU - TV;
|
||||
TR = FNMS(KP500000000, T4, T1);
|
||||
TT = TR - TS;
|
||||
T1w = TR + TS;
|
||||
{
|
||||
E T1b, T1c, T5, Ta;
|
||||
T1b = Tq - Tt;
|
||||
T1c = Tz + TA;
|
||||
T1d = T1b - T1c;
|
||||
T1N = T1b + T1c;
|
||||
T5 = T1 + T4;
|
||||
Ta = T6 + T9;
|
||||
Tb = T5 + Ta;
|
||||
T1R = T5 - Ta;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Load Rp/Ip at offsets 1, 3, 5 and Rm/Im at offsets 0, 2, 4 and combine them. */
{
|
||||
E Tc, T10, Th, T15, Tf, TY, TH, TZ, Tk, T13, TM, T14;
|
||||
Tc = Rp[WS(rs, 3)];
|
||||
T10 = Ip[WS(rs, 3)];
|
||||
Th = Rm[WS(rs, 2)];
|
||||
T15 = Im[WS(rs, 2)];
|
||||
{
|
||||
E Td, Te, TF, TG;
|
||||
Td = Rm[WS(rs, 4)];
|
||||
Te = Rm[0];
|
||||
Tf = Td + Te;
|
||||
TY = KP866025403 * (Td - Te);
|
||||
TF = Im[WS(rs, 4)];
|
||||
TG = Im[0];
|
||||
TH = KP866025403 * (TF - TG);
|
||||
TZ = TF + TG;
|
||||
}
|
||||
{
|
||||
E Ti, Tj, TK, TL;
|
||||
Ti = Rp[WS(rs, 1)];
|
||||
Tj = Rp[WS(rs, 5)];
|
||||
Tk = Ti + Tj;
|
||||
T13 = KP866025403 * (Ti - Tj);
|
||||
TK = Ip[WS(rs, 5)];
|
||||
TL = Ip[WS(rs, 1)];
|
||||
TM = KP866025403 * (TK - TL);
|
||||
T14 = TK + TL;
|
||||
}
|
||||
{
|
||||
E TE, TJ, T16, T11;
|
||||
TE = FNMS(KP500000000, Tf, Tc);
|
||||
TI = TE + TH;
|
||||
T1z = TE - TH;
|
||||
TJ = FNMS(KP500000000, Tk, Th);
|
||||
TN = TJ + TM;
|
||||
T1A = TJ - TM;
|
||||
T16 = FMA(KP500000000, T14, T15);
|
||||
T17 = T13 - T16;
|
||||
T1I = T13 + T16;
|
||||
T11 = FMA(KP500000000, TZ, T10);
|
||||
T12 = TY + T11;
|
||||
T1H = T11 - TY;
|
||||
{
|
||||
E T1e, T1f, Tg, Tl;
|
||||
T1e = T10 - TZ;
|
||||
T1f = T14 - T15;
|
||||
T1g = T1e + T1f;
|
||||
T1S = T1e - T1f;
|
||||
Tg = Tc + Tf;
|
||||
Tl = Th + Tk;
|
||||
Tm = Tg + Tl;
|
||||
T1O = Tg - Tl;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Store offsets 0 (twiddles W[0], W[1]) and WS(rs, 3) (W[10] .. W[13]). */
{
|
||||
E Tn, T1h, TP, T1p, T19, T1r, T1n, T1t;
|
||||
Tn = Tb + Tm;
|
||||
T1h = T1d + T1g;
|
||||
{
|
||||
E TD, TO, TX, T18;
|
||||
TD = Tv - TC;
|
||||
TO = TI - TN;
|
||||
TP = TD + TO;
|
||||
T1p = TD - TO;
|
||||
TX = TT - TW;
|
||||
T18 = T12 - T17;
|
||||
T19 = TX - T18;
|
||||
T1r = TX + T18;
|
||||
{
|
||||
E T1k, T1m, T1j, T1l;
|
||||
T1k = Tb - Tm;
|
||||
T1m = T1d - T1g;
|
||||
T1j = W[10];
|
||||
T1l = W[11];
|
||||
T1n = FNMS(T1l, T1m, T1j * T1k);
|
||||
T1t = FMA(T1l, T1k, T1j * T1m);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1a, T1i, To, TQ;
|
||||
To = W[0];
|
||||
TQ = W[1];
|
||||
T1a = FMA(To, TP, TQ * T19);
|
||||
T1i = FNMS(TQ, TP, To * T19);
|
||||
Rp[0] = Tn - T1a;
|
||||
Ip[0] = T1h + T1i;
|
||||
Rm[0] = Tn + T1a;
|
||||
Im[0] = T1i - T1h;
|
||||
}
|
||||
{
|
||||
E T1s, T1u, T1o, T1q;
|
||||
T1o = W[12];
|
||||
T1q = W[13];
|
||||
T1s = FMA(T1o, T1p, T1q * T1r);
|
||||
T1u = FNMS(T1q, T1p, T1o * T1r);
|
||||
Rp[WS(rs, 3)] = T1n - T1s;
|
||||
Ip[WS(rs, 3)] = T1t + T1u;
|
||||
Rm[WS(rs, 3)] = T1n + T1s;
|
||||
Im[WS(rs, 3)] = T1u - T1t;
|
||||
}
|
||||
}
|
||||
/* Store offsets WS(rs, 1) (twiddles W[2] .. W[5]) and WS(rs, 4) (W[14] .. W[17]). */
{
|
||||
E T1C, T1Y, T1K, T20, T1U, T1V, T26, T27;
|
||||
{
|
||||
E T1y, T1B, T1G, T1J;
|
||||
T1y = T1w + T1x;
|
||||
T1B = T1z + T1A;
|
||||
T1C = T1y - T1B;
|
||||
T1Y = T1y + T1B;
|
||||
T1G = T1E + T1F;
|
||||
T1J = T1H - T1I;
|
||||
T1K = T1G - T1J;
|
||||
T20 = T1G + T1J;
|
||||
}
|
||||
{
|
||||
E T1P, T1T, T1M, T1Q;
|
||||
T1P = T1N - T1O;
|
||||
T1T = T1R + T1S;
|
||||
T1M = W[4];
|
||||
T1Q = W[5];
|
||||
T1U = FMA(T1M, T1P, T1Q * T1T);
|
||||
T1V = FNMS(T1Q, T1P, T1M * T1T);
|
||||
}
|
||||
{
|
||||
E T23, T25, T22, T24;
|
||||
T23 = T1O + T1N;
|
||||
T25 = T1R - T1S;
|
||||
T22 = W[16];
|
||||
T24 = W[17];
|
||||
T26 = FMA(T22, T23, T24 * T25);
|
||||
T27 = FNMS(T24, T23, T22 * T25);
|
||||
}
|
||||
{
|
||||
E T1L, T1W, T1v, T1D;
|
||||
T1v = W[2];
|
||||
T1D = W[3];
|
||||
T1L = FNMS(T1D, T1K, T1v * T1C);
|
||||
T1W = FMA(T1D, T1C, T1v * T1K);
|
||||
Rp[WS(rs, 1)] = T1L - T1U;
|
||||
Ip[WS(rs, 1)] = T1V + T1W;
|
||||
Rm[WS(rs, 1)] = T1U + T1L;
|
||||
Im[WS(rs, 1)] = T1V - T1W;
|
||||
}
|
||||
{
|
||||
E T21, T28, T1X, T1Z;
|
||||
T1X = W[14];
|
||||
T1Z = W[15];
|
||||
T21 = FNMS(T1Z, T20, T1X * T1Y);
|
||||
T28 = FMA(T1Z, T1Y, T1X * T20);
|
||||
Rp[WS(rs, 4)] = T21 - T26;
|
||||
Ip[WS(rs, 4)] = T27 + T28;
|
||||
Rm[WS(rs, 4)] = T26 + T21;
|
||||
Im[WS(rs, 4)] = T27 - T28;
|
||||
}
|
||||
}
|
||||
/* Store offsets WS(rs, 2) (twiddles W[6] .. W[9]) and WS(rs, 5) (W[18] .. W[21]). */
{
|
||||
E T2c, T2u, T2p, T2B, T2g, T2w, T2l, T2z;
|
||||
{
|
||||
E T2a, T2b, T2n, T2o;
|
||||
T2a = TT + TW;
|
||||
T2b = TI + TN;
|
||||
T2c = T2a + T2b;
|
||||
T2u = T2a - T2b;
|
||||
T2n = T1w - T1x;
|
||||
T2o = T1H + T1I;
|
||||
T2p = T2n - T2o;
|
||||
T2B = T2n + T2o;
|
||||
}
|
||||
{
|
||||
E T2e, T2f, T2j, T2k;
|
||||
T2e = Tv + TC;
|
||||
T2f = T12 + T17;
|
||||
T2g = T2e + T2f;
|
||||
T2w = T2e - T2f;
|
||||
T2j = T1E - T1F;
|
||||
T2k = T1z - T1A;
|
||||
T2l = T2j + T2k;
|
||||
T2z = T2j - T2k;
|
||||
}
|
||||
{
|
||||
E T2h, T2r, T2q, T2s;
|
||||
{
|
||||
E T29, T2d, T2i, T2m;
|
||||
T29 = W[6];
|
||||
T2d = W[7];
|
||||
T2h = FNMS(T2d, T2g, T29 * T2c);
|
||||
T2r = FMA(T2d, T2c, T29 * T2g);
|
||||
T2i = W[8];
|
||||
T2m = W[9];
|
||||
T2q = FMA(T2i, T2l, T2m * T2p);
|
||||
T2s = FNMS(T2m, T2l, T2i * T2p);
|
||||
}
|
||||
Rp[WS(rs, 2)] = T2h - T2q;
|
||||
Ip[WS(rs, 2)] = T2r + T2s;
|
||||
Rm[WS(rs, 2)] = T2h + T2q;
|
||||
Im[WS(rs, 2)] = T2s - T2r;
|
||||
}
|
||||
{
|
||||
E T2x, T2D, T2C, T2E;
|
||||
{
|
||||
E T2t, T2v, T2y, T2A;
|
||||
T2t = W[18];
|
||||
T2v = W[19];
|
||||
T2x = FNMS(T2v, T2w, T2t * T2u);
|
||||
T2D = FMA(T2v, T2u, T2t * T2w);
|
||||
T2y = W[20];
|
||||
T2A = W[21];
|
||||
T2C = FMA(T2y, T2z, T2A * T2B);
|
||||
T2E = FNMS(T2A, T2z, T2y * T2B);
|
||||
}
|
||||
Rp[WS(rs, 5)] = T2x - T2C;
|
||||
Ip[WS(rs, 5)] = T2D + T2E;
|
||||
Rm[WS(rs, 5)] = T2x + T2C;
|
||||
Im[WS(rs, 5)] = T2E - T2D;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by `desc` below: one TW_FULL entry
 * with arguments (1, 12) followed by one TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr and the TW_* opcodes are declared elsewhere;
 * presumably TW_NEXT marks the end of the list -- confirm. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor for the non-FMA variant. The first three numbers of
 * {112, 30, 30, 0} equal the additions / multiplications / fused multiply-add
 * counts quoted in the generated comment at the top of this #else branch. */
static const hc2c_desc desc = { 12, "hc2cbdft_12", twinstr, &GENUS, { 112, 30, 30, 0 } };
|
||||
|
||||
/* Registration entry point (non-FMA branch): hands the hc2cbdft_12 kernel and
 * its descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_DFT. */
void X(codelet_hc2cbdft_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft_12, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
892
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft_16.c
Normal file
892
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft_16.c
Normal file
@@ -0,0 +1,892 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 206 FP additions, 100 FP multiplications,
|
||||
* (or, 136 additions, 30 multiplications, 70 fused multiply/add),
|
||||
* 66 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft_16, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated size-16 codelet
 * (generator flags -sign 1 -n 16 -dif, see the "Generated by" comment above).
 *
 * Rp, Ip, Rm, Im: arrays that are read and then overwritten in place at
 *                 element offsets WS(rs, 0) .. WS(rs, 7).
 * W:  twiddle table; 30 values (W[0] .. W[29]) are used per iteration.
 * rs: stride handed to WS() to form element offsets.
 * mb, me: the loop runs for m = mb while m < me.
 * ms: per-iteration pointer step; Rp and Ip move by +ms, Rm and Im by -ms.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA and FNMS come from the
 * included headers and are not visible here; FMA(a, b, c) / FNMS(a, b, c)
 * are presumably a * b + c and c - a * b -- confirm against the headers.
 */
static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* numerically cos(pi/8) */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
/* numerically sqrt(2) - 1, i.e. tan(pi/8) */
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
/* numerically sqrt(2)/2 */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 30, then advanced by 30 per iteration. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E Tf, T20, T32, T3Q, T3f, T3V, TN, T2a, T1m, T2f, T2G, T3G, T2T, T3L, T1F;
|
||||
E T26, T2J, T2M, T2N, T2U, T2V, T3H, Tu, T25, T3i, T3R, T1a, T2g, T1y, T21;
|
||||
E T39, T3W, T1p, T2b;
|
||||
/* Load Rp/Ip at offsets 0, 2, 4, 6 and Rm/Im at offsets 1, 3, 5, 7 and combine them. */
{
|
||||
E T3, T1e, TA, T1C, T6, Tx, T1h, T1D, Td, T1A, TL, T1k, Ta, T1z, TG;
|
||||
E T1j;
|
||||
{
|
||||
E T1, T2, T1f, T1g;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
T1e = T1 - T2;
|
||||
{
|
||||
E Ty, Tz, T4, T5;
|
||||
Ty = Ip[0];
|
||||
Tz = Im[WS(rs, 7)];
|
||||
TA = Ty + Tz;
|
||||
T1C = Ty - Tz;
|
||||
T4 = Rp[WS(rs, 4)];
|
||||
T5 = Rm[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
Tx = T4 - T5;
|
||||
}
|
||||
T1f = Ip[WS(rs, 4)];
|
||||
T1g = Im[WS(rs, 3)];
|
||||
T1h = T1f + T1g;
|
||||
T1D = T1f - T1g;
|
||||
{
|
||||
E Tb, Tc, TH, TI, TJ, TK;
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
Tc = Rp[WS(rs, 6)];
|
||||
TH = Tb - Tc;
|
||||
TI = Im[WS(rs, 1)];
|
||||
TJ = Ip[WS(rs, 6)];
|
||||
TK = TI + TJ;
|
||||
Td = Tb + Tc;
|
||||
T1A = TJ - TI;
|
||||
TL = TH + TK;
|
||||
T1k = TH - TK;
|
||||
}
|
||||
{
|
||||
E T8, T9, TC, TD, TE, TF;
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = Rm[WS(rs, 5)];
|
||||
TC = T8 - T9;
|
||||
TD = Ip[WS(rs, 2)];
|
||||
TE = Im[WS(rs, 5)];
|
||||
TF = TD + TE;
|
||||
Ta = T8 + T9;
|
||||
T1z = TD - TE;
|
||||
TG = TC + TF;
|
||||
T1j = TC - TF;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T7, Te, T30, T31;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 + Te;
|
||||
T20 = T7 - Te;
|
||||
T30 = TA - Tx;
|
||||
T31 = T1j - T1k;
|
||||
T32 = FMA(KP707106781, T31, T30);
|
||||
T3Q = FNMS(KP707106781, T31, T30);
|
||||
}
|
||||
{
|
||||
E T3d, T3e, TB, TM;
|
||||
T3d = T1e + T1h;
|
||||
T3e = TG + TL;
|
||||
T3f = FNMS(KP707106781, T3e, T3d);
|
||||
T3V = FMA(KP707106781, T3e, T3d);
|
||||
TB = Tx + TA;
|
||||
TM = TG - TL;
|
||||
TN = FMA(KP707106781, TM, TB);
|
||||
T2a = FNMS(KP707106781, TM, TB);
|
||||
}
|
||||
{
|
||||
E T1i, T1l, T2E, T2F;
|
||||
T1i = T1e - T1h;
|
||||
T1l = T1j + T1k;
|
||||
T1m = FMA(KP707106781, T1l, T1i);
|
||||
T2f = FNMS(KP707106781, T1l, T1i);
|
||||
T2E = T3 - T6;
|
||||
T2F = T1A - T1z;
|
||||
T2G = T2E + T2F;
|
||||
T3G = T2E - T2F;
|
||||
}
|
||||
{
|
||||
E T2R, T2S, T1B, T1E;
|
||||
T2R = Ta - Td;
|
||||
T2S = T1C - T1D;
|
||||
T2T = T2R + T2S;
|
||||
T3L = T2S - T2R;
|
||||
T1B = T1z + T1A;
|
||||
T1E = T1C + T1D;
|
||||
T1F = T1B + T1E;
|
||||
T26 = T1E - T1B;
|
||||
}
|
||||
}
|
||||
/* Load Rp/Ip at offsets 1, 3, 5, 7 and Rm/Im at offsets 0, 2, 4, 6 and combine them. */
{
|
||||
E Ti, T1s, Tl, T1t, TS, TX, T34, T33, T2I, T2H, Tp, T1v, Ts, T1w, T13;
|
||||
E T18, T37, T36, T2L, T2K;
|
||||
{
|
||||
E TT, TR, TO, TW;
|
||||
{
|
||||
E Tg, Th, TP, TQ;
|
||||
Tg = Rp[WS(rs, 1)];
|
||||
Th = Rm[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
TT = Tg - Th;
|
||||
TP = Ip[WS(rs, 1)];
|
||||
TQ = Im[WS(rs, 6)];
|
||||
TR = TP + TQ;
|
||||
T1s = TP - TQ;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, TU, TV;
|
||||
Tj = Rp[WS(rs, 5)];
|
||||
Tk = Rm[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
TO = Tj - Tk;
|
||||
TU = Ip[WS(rs, 5)];
|
||||
TV = Im[WS(rs, 2)];
|
||||
TW = TU + TV;
|
||||
T1t = TU - TV;
|
||||
}
|
||||
TS = TO + TR;
|
||||
TX = TT - TW;
|
||||
T34 = TR - TO;
|
||||
T33 = TT + TW;
|
||||
T2I = T1s - T1t;
|
||||
T2H = Ti - Tl;
|
||||
}
|
||||
{
|
||||
E T14, T12, TZ, T17;
|
||||
{
|
||||
E Tn, To, T10, T11;
|
||||
Tn = Rm[0];
|
||||
To = Rp[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T14 = Tn - To;
|
||||
T10 = Im[0];
|
||||
T11 = Ip[WS(rs, 7)];
|
||||
T12 = T10 + T11;
|
||||
T1v = T11 - T10;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T15, T16;
|
||||
Tq = Rp[WS(rs, 3)];
|
||||
Tr = Rm[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
TZ = Tq - Tr;
|
||||
T15 = Ip[WS(rs, 3)];
|
||||
T16 = Im[WS(rs, 4)];
|
||||
T17 = T15 + T16;
|
||||
T1w = T15 - T16;
|
||||
}
|
||||
T13 = TZ - T12;
|
||||
T18 = T14 - T17;
|
||||
T37 = TZ + T12;
|
||||
T36 = T14 + T17;
|
||||
T2L = T1v - T1w;
|
||||
T2K = Tp - Ts;
|
||||
}
|
||||
T2J = T2H - T2I;
|
||||
T2M = T2K + T2L;
|
||||
T2N = T2J + T2M;
|
||||
T2U = T2H + T2I;
|
||||
T2V = T2L - T2K;
|
||||
T3H = T2V - T2U;
|
||||
{
|
||||
E Tm, Tt, T3g, T3h;
|
||||
Tm = Ti + Tl;
|
||||
Tt = Tp + Ts;
|
||||
Tu = Tm + Tt;
|
||||
T25 = Tm - Tt;
|
||||
T3g = FNMS(KP414213562, T33, T34);
|
||||
T3h = FNMS(KP414213562, T36, T37);
|
||||
T3i = T3g + T3h;
|
||||
T3R = T3h - T3g;
|
||||
}
|
||||
{
|
||||
E TY, T19, T1u, T1x;
|
||||
TY = FMA(KP414213562, TX, TS);
|
||||
T19 = FNMS(KP414213562, T18, T13);
|
||||
T1a = TY + T19;
|
||||
T2g = T19 - TY;
|
||||
T1u = T1s + T1t;
|
||||
T1x = T1v + T1w;
|
||||
T1y = T1u + T1x;
|
||||
T21 = T1x - T1u;
|
||||
}
|
||||
{
|
||||
E T35, T38, T1n, T1o;
|
||||
T35 = FMA(KP414213562, T34, T33);
|
||||
T38 = FMA(KP414213562, T37, T36);
|
||||
T39 = T35 - T38;
|
||||
T3W = T35 + T38;
|
||||
T1n = FNMS(KP414213562, TS, TX);
|
||||
T1o = FMA(KP414213562, T13, T18);
|
||||
T1p = T1n + T1o;
|
||||
T2b = T1n - T1o;
|
||||
}
|
||||
}
|
||||
/* Store offset 0, using twiddles W[0] and W[1]. */
{
|
||||
E Tv, T1G, T1b, T1q, T1c, T1H, Tw, T1r, T1I, T1d;
|
||||
Tv = Tf + Tu;
|
||||
T1G = T1y + T1F;
|
||||
T1b = FMA(KP923879532, T1a, TN);
|
||||
T1q = FMA(KP923879532, T1p, T1m);
|
||||
Tw = W[0];
|
||||
T1c = Tw * T1b;
|
||||
T1H = Tw * T1q;
|
||||
T1d = W[1];
|
||||
T1r = FMA(T1d, T1q, T1c);
|
||||
T1I = FNMS(T1d, T1b, T1H);
|
||||
Rp[0] = Tv - T1r;
|
||||
Ip[0] = T1G + T1I;
|
||||
Rm[0] = Tv + T1r;
|
||||
Im[0] = T1I - T1G;
|
||||
}
|
||||
/* Store offset WS(rs, 4), using twiddles W[14] .. W[17]. */
{
|
||||
E T1N, T1J, T1L, T1M, T1V, T1Q, T1T, T1R, T1X, T1K, T1P;
|
||||
T1N = T1F - T1y;
|
||||
T1K = Tf - Tu;
|
||||
T1J = W[14];
|
||||
T1L = T1J * T1K;
|
||||
T1M = W[15];
|
||||
T1V = T1M * T1K;
|
||||
T1Q = FNMS(KP923879532, T1a, TN);
|
||||
T1T = FNMS(KP923879532, T1p, T1m);
|
||||
T1P = W[16];
|
||||
T1R = T1P * T1Q;
|
||||
T1X = T1P * T1T;
|
||||
{
|
||||
E T1O, T1W, T1U, T1Y, T1S;
|
||||
T1O = FNMS(T1M, T1N, T1L);
|
||||
T1W = FMA(T1J, T1N, T1V);
|
||||
T1S = W[17];
|
||||
T1U = FMA(T1S, T1T, T1R);
|
||||
T1Y = FNMS(T1S, T1Q, T1X);
|
||||
Rp[WS(rs, 4)] = T1O - T1U;
|
||||
Ip[WS(rs, 4)] = T1W + T1Y;
|
||||
Rm[WS(rs, 4)] = T1O + T1U;
|
||||
Im[WS(rs, 4)] = T1Y - T1W;
|
||||
}
|
||||
}
|
||||
/* Store offset WS(rs, 6), using twiddles W[22] .. W[25]. */
{
|
||||
E T2r, T2n, T2p, T2q, T2z, T2u, T2x, T2v, T2B, T2o, T2t;
|
||||
T2r = T26 - T25;
|
||||
T2o = T20 - T21;
|
||||
T2n = W[22];
|
||||
T2p = T2n * T2o;
|
||||
T2q = W[23];
|
||||
T2z = T2q * T2o;
|
||||
T2u = FNMS(KP923879532, T2b, T2a);
|
||||
T2x = FNMS(KP923879532, T2g, T2f);
|
||||
T2t = W[24];
|
||||
T2v = T2t * T2u;
|
||||
T2B = T2t * T2x;
|
||||
{
|
||||
E T2s, T2A, T2y, T2C, T2w;
|
||||
T2s = FNMS(T2q, T2r, T2p);
|
||||
T2A = FMA(T2n, T2r, T2z);
|
||||
T2w = W[25];
|
||||
T2y = FMA(T2w, T2x, T2v);
|
||||
T2C = FNMS(T2w, T2u, T2B);
|
||||
Rp[WS(rs, 6)] = T2s - T2y;
|
||||
Ip[WS(rs, 6)] = T2A + T2C;
|
||||
Rm[WS(rs, 6)] = T2s + T2y;
|
||||
Im[WS(rs, 6)] = T2C - T2A;
|
||||
}
|
||||
}
|
||||
/* Store offset WS(rs, 2), using twiddles W[6] .. W[9]. */
{
|
||||
E T27, T1Z, T23, T24, T2j, T2c, T2h, T2d, T2l, T22, T29;
|
||||
T27 = T25 + T26;
|
||||
T22 = T20 + T21;
|
||||
T1Z = W[6];
|
||||
T23 = T1Z * T22;
|
||||
T24 = W[7];
|
||||
T2j = T24 * T22;
|
||||
T2c = FMA(KP923879532, T2b, T2a);
|
||||
T2h = FMA(KP923879532, T2g, T2f);
|
||||
T29 = W[8];
|
||||
T2d = T29 * T2c;
|
||||
T2l = T29 * T2h;
|
||||
{
|
||||
E T28, T2k, T2i, T2m, T2e;
|
||||
T28 = FNMS(T24, T27, T23);
|
||||
T2k = FMA(T1Z, T27, T2j);
|
||||
T2e = W[9];
|
||||
T2i = FMA(T2e, T2h, T2d);
|
||||
T2m = FNMS(T2e, T2c, T2l);
|
||||
Rp[WS(rs, 2)] = T28 - T2i;
|
||||
Ip[WS(rs, 2)] = T2k + T2m;
|
||||
Rm[WS(rs, 2)] = T28 + T2i;
|
||||
Im[WS(rs, 2)] = T2m - T2k;
|
||||
}
|
||||
}
|
||||
/* Store offsets WS(rs, 3) (twiddles W[10] .. W[13]) and WS(rs, 7) (W[26] .. W[29]). */
{
|
||||
E T3N, T47, T43, T45, T46, T4f, T3F, T3J, T3K, T3Z, T3S, T3X, T3T, T41, T4a;
|
||||
E T4d, T4b, T4h;
|
||||
{
|
||||
E T3M, T44, T3I, T3P, T49;
|
||||
T3M = T2J - T2M;
|
||||
T3N = FMA(KP707106781, T3M, T3L);
|
||||
T47 = FNMS(KP707106781, T3M, T3L);
|
||||
T44 = FNMS(KP707106781, T3H, T3G);
|
||||
T43 = W[26];
|
||||
T45 = T43 * T44;
|
||||
T46 = W[27];
|
||||
T4f = T46 * T44;
|
||||
T3I = FMA(KP707106781, T3H, T3G);
|
||||
T3F = W[10];
|
||||
T3J = T3F * T3I;
|
||||
T3K = W[11];
|
||||
T3Z = T3K * T3I;
|
||||
T3S = FMA(KP923879532, T3R, T3Q);
|
||||
T3X = FNMS(KP923879532, T3W, T3V);
|
||||
T3P = W[12];
|
||||
T3T = T3P * T3S;
|
||||
T41 = T3P * T3X;
|
||||
T4a = FNMS(KP923879532, T3R, T3Q);
|
||||
T4d = FMA(KP923879532, T3W, T3V);
|
||||
T49 = W[28];
|
||||
T4b = T49 * T4a;
|
||||
T4h = T49 * T4d;
|
||||
}
|
||||
{
|
||||
E T3O, T40, T3Y, T42, T3U;
|
||||
T3O = FNMS(T3K, T3N, T3J);
|
||||
T40 = FMA(T3F, T3N, T3Z);
|
||||
T3U = W[13];
|
||||
T3Y = FMA(T3U, T3X, T3T);
|
||||
T42 = FNMS(T3U, T3S, T41);
|
||||
Rp[WS(rs, 3)] = T3O - T3Y;
|
||||
Ip[WS(rs, 3)] = T40 + T42;
|
||||
Rm[WS(rs, 3)] = T3O + T3Y;
|
||||
Im[WS(rs, 3)] = T42 - T40;
|
||||
}
|
||||
{
|
||||
E T48, T4g, T4e, T4i, T4c;
|
||||
T48 = FNMS(T46, T47, T45);
|
||||
T4g = FMA(T43, T47, T4f);
|
||||
T4c = W[29];
|
||||
T4e = FMA(T4c, T4d, T4b);
|
||||
T4i = FNMS(T4c, T4a, T4h);
|
||||
Rp[WS(rs, 7)] = T48 - T4e;
|
||||
Ip[WS(rs, 7)] = T4g + T4i;
|
||||
Rm[WS(rs, 7)] = T48 + T4e;
|
||||
Im[WS(rs, 7)] = T4i - T4g;
|
||||
}
|
||||
}
|
||||
/* Store offsets WS(rs, 1) (twiddles W[2] .. W[5]) and WS(rs, 5) (W[18] .. W[21]). */
{
|
||||
E T2X, T3t, T3p, T3r, T3s, T3B, T2D, T2P, T2Q, T3l, T3a, T3j, T3b, T3n, T3w;
|
||||
E T3z, T3x, T3D;
|
||||
{
|
||||
E T2W, T3q, T2O, T2Z, T3v;
|
||||
T2W = T2U + T2V;
|
||||
T2X = FMA(KP707106781, T2W, T2T);
|
||||
T3t = FNMS(KP707106781, T2W, T2T);
|
||||
T3q = FNMS(KP707106781, T2N, T2G);
|
||||
T3p = W[18];
|
||||
T3r = T3p * T3q;
|
||||
T3s = W[19];
|
||||
T3B = T3s * T3q;
|
||||
T2O = FMA(KP707106781, T2N, T2G);
|
||||
T2D = W[2];
|
||||
T2P = T2D * T2O;
|
||||
T2Q = W[3];
|
||||
T3l = T2Q * T2O;
|
||||
T3a = FMA(KP923879532, T39, T32);
|
||||
T3j = FNMS(KP923879532, T3i, T3f);
|
||||
T2Z = W[4];
|
||||
T3b = T2Z * T3a;
|
||||
T3n = T2Z * T3j;
|
||||
T3w = FNMS(KP923879532, T39, T32);
|
||||
T3z = FMA(KP923879532, T3i, T3f);
|
||||
T3v = W[20];
|
||||
T3x = T3v * T3w;
|
||||
T3D = T3v * T3z;
|
||||
}
|
||||
{
|
||||
E T2Y, T3m, T3k, T3o, T3c;
|
||||
T2Y = FNMS(T2Q, T2X, T2P);
|
||||
T3m = FMA(T2D, T2X, T3l);
|
||||
T3c = W[5];
|
||||
T3k = FMA(T3c, T3j, T3b);
|
||||
T3o = FNMS(T3c, T3a, T3n);
|
||||
Rp[WS(rs, 1)] = T2Y - T3k;
|
||||
Ip[WS(rs, 1)] = T3m + T3o;
|
||||
Rm[WS(rs, 1)] = T2Y + T3k;
|
||||
Im[WS(rs, 1)] = T3o - T3m;
|
||||
}
|
||||
{
|
||||
E T3u, T3C, T3A, T3E, T3y;
|
||||
T3u = FNMS(T3s, T3t, T3r);
|
||||
T3C = FMA(T3p, T3t, T3B);
|
||||
T3y = W[21];
|
||||
T3A = FMA(T3y, T3z, T3x);
|
||||
T3E = FNMS(T3y, T3w, T3D);
|
||||
Rp[WS(rs, 5)] = T3u - T3A;
|
||||
Ip[WS(rs, 5)] = T3C + T3E;
|
||||
Rm[WS(rs, 5)] = T3u + T3A;
|
||||
Im[WS(rs, 5)] = T3E - T3C;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by `desc` below: one TW_FULL entry
 * with arguments (1, 16) followed by one TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr and the TW_* opcodes are declared elsewhere;
 * presumably TW_NEXT marks the end of the list -- confirm. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor for the FMA variant. The first three numbers of
 * {136, 30, 70, 0} equal the additions / multiplications / fused multiply-add
 * counts quoted in the generated comment at the top of this #if branch. */
static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, { 136, 30, 70, 0 } };
|
||||
|
||||
/* Registration entry point (FMA branch): hands the hc2cbdft_16 kernel and its
 * descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_DFT. */
void X(codelet_hc2cbdft_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 206 FP additions, 84 FP multiplications,
|
||||
* (or, 168 additions, 46 multiplications, 38 fused multiply/add),
|
||||
* 60 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft_16 -- non-FMA variant (the #else branch of the
 * ARCH_PREFERS_FMA / ISA_EXTENSION_PREFERS_FMA test).
 *
 * For each m in [mb, me) the kernel loads Rp, Ip, Rm and Im at
 * WS(rs, 0) .. WS(rs, 7) (32 values), combines them with the 30 values
 * W[0] .. W[29], and stores 32 results back to the same locations.  All
 * loads of an iteration are issued before its first store.
 *
 * Parameters:
 *   Rp, Ip  arrays advanced by +ms after every iteration
 *   Rm, Im  arrays advanced by -ms after every iteration
 *   W       twiddle table; offset by (mb - 1) * 30 on entry and advanced
 *           by 30 per iteration
 *   rs      stride handed to WS() to address the 8 elements per array
 *   mb, me  half-open iteration range
 *   ms      per-iteration step applied to the four data pointers
 */
static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: cos(pi/8), sin(pi/8) and sqrt(1/2). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the pointer updates live in the increment clause. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
/* Intermediates shared between the load stages and the store stages. */
E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
|
||||
E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
|
||||
E T1x, T1V;
|
||||
/* Load stage 1: Rp/Ip at indices 0, 2, 4, 6 and Rm/Im at 1, 3, 5, 7. */
{
|
||||
E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
|
||||
E T1z;
|
||||
{
|
||||
E T1, T2, Ty, Tz;
|
||||
T1 = Rp[0];
|
||||
T2 = Rm[WS(rs, 7)];
|
||||
T3 = T1 + T2;
|
||||
T1j = T1 - T2;
|
||||
Ty = Ip[0];
|
||||
Tz = Im[WS(rs, 7)];
|
||||
TA = Ty + Tz;
|
||||
T1B = Ty - Tz;
|
||||
}
|
||||
{
|
||||
E T4, T5, T1k, T1l;
|
||||
T4 = Rp[WS(rs, 4)];
|
||||
T5 = Rm[WS(rs, 3)];
|
||||
T6 = T4 + T5;
|
||||
Tx = T4 - T5;
|
||||
T1k = Ip[WS(rs, 4)];
|
||||
T1l = Im[WS(rs, 3)];
|
||||
T1m = T1k + T1l;
|
||||
T1C = T1k - T1l;
|
||||
}
|
||||
{
|
||||
E T8, T9, TD, TE;
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = Rm[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
TC = T8 - T9;
|
||||
TD = Ip[WS(rs, 2)];
|
||||
TE = Im[WS(rs, 5)];
|
||||
TF = TD + TE;
|
||||
T1y = TD - TE;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TI, TJ;
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
Tc = Rp[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
TH = Tb - Tc;
|
||||
TI = Im[WS(rs, 1)];
|
||||
TJ = Ip[WS(rs, 6)];
|
||||
TK = TI + TJ;
|
||||
T1z = TJ - TI;
|
||||
}
|
||||
{
|
||||
E T7, Te, TG, TL;
|
||||
TB = Tx + TA;
|
||||
T2L = TA - Tx;
|
||||
T30 = T1j + T1m;
|
||||
T1n = T1j - T1m;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 + Te;
|
||||
T1U = T7 - Te;
|
||||
{
|
||||
E T2F, T2G, T1A, T1D;
|
||||
T2F = Ta - Td;
|
||||
T2G = T1B - T1C;
|
||||
T2H = T2F + T2G;
|
||||
T3p = T2G - T2F;
|
||||
T1A = T1y + T1z;
|
||||
T1D = T1B + T1C;
|
||||
T1E = T1A + T1D;
|
||||
T1Z = T1D - T1A;
|
||||
}
|
||||
TG = TC + TF;
|
||||
TL = TH + TK;
|
||||
TM = KP707106781 * (TG - TL);
|
||||
T31 = KP707106781 * (TG + TL);
|
||||
{
|
||||
E T2q, T2r, T1g, T1h;
|
||||
T2q = T3 - T6;
|
||||
T2r = T1z - T1y;
|
||||
T2s = T2q + T2r;
|
||||
T3k = T2q - T2r;
|
||||
T1g = TC - TF;
|
||||
T1h = TH - TK;
|
||||
T1i = KP707106781 * (T1g + T1h);
|
||||
T2M = KP707106781 * (T1g - T1h);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Load stage 2: Rp/Ip at indices 1, 3, 5, 7 and Rm/Im at 0, 2, 4, 6. */
{
|
||||
E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
|
||||
E T1v;
|
||||
{
|
||||
E Tg, Th, TP, TQ;
|
||||
Tg = Rp[WS(rs, 1)];
|
||||
Th = Rm[WS(rs, 6)];
|
||||
Ti = Tg + Th;
|
||||
TT = Tg - Th;
|
||||
TP = Ip[WS(rs, 1)];
|
||||
TQ = Im[WS(rs, 6)];
|
||||
TR = TP + TQ;
|
||||
T1r = TP - TQ;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, TU, TV;
|
||||
Tj = Rp[WS(rs, 5)];
|
||||
Tk = Rm[WS(rs, 2)];
|
||||
Tl = Tj + Tk;
|
||||
TO = Tj - Tk;
|
||||
TU = Ip[WS(rs, 5)];
|
||||
TV = Im[WS(rs, 2)];
|
||||
TW = TU + TV;
|
||||
T1s = TU - TV;
|
||||
}
|
||||
{
|
||||
E Tn, To, T10, T11;
|
||||
Tn = Rm[0];
|
||||
To = Rp[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
T14 = Tn - To;
|
||||
T10 = Im[0];
|
||||
T11 = Ip[WS(rs, 7)];
|
||||
T12 = T10 + T11;
|
||||
T1u = T11 - T10;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T15, T16;
|
||||
Tq = Rp[WS(rs, 3)];
|
||||
Tr = Rm[WS(rs, 4)];
|
||||
Ts = Tq + Tr;
|
||||
TZ = Tq - Tr;
|
||||
T15 = Ip[WS(rs, 3)];
|
||||
T16 = Im[WS(rs, 4)];
|
||||
T17 = T15 + T16;
|
||||
T1v = T15 - T16;
|
||||
}
|
||||
{
|
||||
E Tm, Tt, T2O, T2P;
|
||||
Tm = Ti + Tl;
|
||||
Tt = Tp + Ts;
|
||||
Tu = Tm + Tt;
|
||||
T1Y = Tm - Tt;
|
||||
T2O = TR - TO;
|
||||
T2P = TT + TW;
|
||||
T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
|
||||
T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
|
||||
}
|
||||
{
|
||||
E T2R, T2S, TS, TX;
|
||||
T2R = TZ + T12;
|
||||
T2S = T14 + T17;
|
||||
T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
|
||||
T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
|
||||
TS = TO + TR;
|
||||
TX = TT - TW;
|
||||
TY = FMA(KP923879532, TS, KP382683432 * TX);
|
||||
T1d = FNMS(KP382683432, TS, KP923879532 * TX);
|
||||
}
|
||||
{
|
||||
E T13, T18, T2t, T2u;
|
||||
T13 = TZ - T12;
|
||||
T18 = T14 - T17;
|
||||
T19 = FNMS(KP382683432, T18, KP923879532 * T13);
|
||||
T1e = FMA(KP382683432, T13, KP923879532 * T18);
|
||||
T2t = Ti - Tl;
|
||||
T2u = T1r - T1s;
|
||||
T2v = T2t - T2u;
|
||||
T2C = T2t + T2u;
|
||||
}
|
||||
{
|
||||
E T2w, T2x, T1t, T1w;
|
||||
T2w = Tp - Ts;
|
||||
T2x = T1u - T1v;
|
||||
T2y = T2w + T2x;
|
||||
T2D = T2x - T2w;
|
||||
T1t = T1r + T1s;
|
||||
T1w = T1u + T1v;
|
||||
T1x = T1t + T1w;
|
||||
T1V = T1w - T1t;
|
||||
}
|
||||
}
|
||||
/* Store stage A: writes indices 0 and 4; reads W[0..1] and W[14..17]. */
{
|
||||
E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
|
||||
Tv = Tf + Tu;
|
||||
T1F = T1x + T1E;
|
||||
{
|
||||
E TN, T1a, T1f, T1o;
|
||||
TN = TB + TM;
|
||||
T1a = TY + T19;
|
||||
T1b = TN + T1a;
|
||||
T1N = TN - T1a;
|
||||
T1f = T1d + T1e;
|
||||
T1o = T1i + T1n;
|
||||
T1p = T1f + T1o;
|
||||
T1P = T1o - T1f;
|
||||
{
|
||||
E T1I, T1K, T1H, T1J;
|
||||
T1I = Tf - Tu;
|
||||
T1K = T1E - T1x;
|
||||
T1H = W[14];
|
||||
T1J = W[15];
|
||||
T1L = FNMS(T1J, T1K, T1H * T1I);
|
||||
T1R = FMA(T1J, T1I, T1H * T1K);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1q, T1G, Tw, T1c;
|
||||
Tw = W[0];
|
||||
T1c = W[1];
|
||||
T1q = FMA(Tw, T1b, T1c * T1p);
|
||||
T1G = FNMS(T1c, T1b, Tw * T1p);
|
||||
Rp[0] = Tv - T1q;
|
||||
Ip[0] = T1F + T1G;
|
||||
Rm[0] = Tv + T1q;
|
||||
Im[0] = T1G - T1F;
|
||||
}
|
||||
{
|
||||
E T1Q, T1S, T1M, T1O;
|
||||
T1M = W[16];
|
||||
T1O = W[17];
|
||||
T1Q = FMA(T1M, T1N, T1O * T1P);
|
||||
T1S = FNMS(T1O, T1N, T1M * T1P);
|
||||
Rp[WS(rs, 4)] = T1L - T1Q;
|
||||
Ip[WS(rs, 4)] = T1R + T1S;
|
||||
Rm[WS(rs, 4)] = T1L + T1Q;
|
||||
Im[WS(rs, 4)] = T1S - T1R;
|
||||
}
|
||||
}
|
||||
/* Store stage B: writes indices 2 and 6; reads W[6..9] and W[22..25]. */
{
|
||||
E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
|
||||
{
|
||||
E T23, T24, T27, T28;
|
||||
T23 = TB - TM;
|
||||
T24 = T1d - T1e;
|
||||
T25 = T23 + T24;
|
||||
T2j = T23 - T24;
|
||||
T27 = T19 - TY;
|
||||
T28 = T1n - T1i;
|
||||
T29 = T27 + T28;
|
||||
T2l = T28 - T27;
|
||||
}
|
||||
{
|
||||
E T1W, T20, T1T, T1X;
|
||||
T1W = T1U + T1V;
|
||||
T20 = T1Y + T1Z;
|
||||
T1T = W[6];
|
||||
T1X = W[7];
|
||||
T21 = FNMS(T1X, T20, T1T * T1W);
|
||||
T2b = FMA(T1X, T1W, T1T * T20);
|
||||
}
|
||||
{
|
||||
E T2e, T2g, T2d, T2f;
|
||||
T2e = T1U - T1V;
|
||||
T2g = T1Z - T1Y;
|
||||
T2d = W[22];
|
||||
T2f = W[23];
|
||||
T2h = FNMS(T2f, T2g, T2d * T2e);
|
||||
T2n = FMA(T2f, T2e, T2d * T2g);
|
||||
}
|
||||
{
|
||||
E T2a, T2c, T22, T26;
|
||||
T22 = W[8];
|
||||
T26 = W[9];
|
||||
T2a = FMA(T22, T25, T26 * T29);
|
||||
T2c = FNMS(T26, T25, T22 * T29);
|
||||
Rp[WS(rs, 2)] = T21 - T2a;
|
||||
Ip[WS(rs, 2)] = T2b + T2c;
|
||||
Rm[WS(rs, 2)] = T21 + T2a;
|
||||
Im[WS(rs, 2)] = T2c - T2b;
|
||||
}
|
||||
{
|
||||
E T2m, T2o, T2i, T2k;
|
||||
T2i = W[24];
|
||||
T2k = W[25];
|
||||
T2m = FMA(T2i, T2j, T2k * T2l);
|
||||
T2o = FNMS(T2k, T2j, T2i * T2l);
|
||||
Rp[WS(rs, 6)] = T2h - T2m;
|
||||
Ip[WS(rs, 6)] = T2n + T2o;
|
||||
Rm[WS(rs, 6)] = T2h + T2m;
|
||||
Im[WS(rs, 6)] = T2o - T2n;
|
||||
}
|
||||
}
|
||||
/* Store stage C: writes indices 1 and 5; reads W[2..5] and W[18..21]. */
{
|
||||
E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
|
||||
T2z = KP707106781 * (T2v + T2y);
|
||||
T2A = T2s + T2z;
|
||||
T38 = T2s - T2z;
|
||||
T2E = KP707106781 * (T2C + T2D);
|
||||
T2I = T2E + T2H;
|
||||
T3a = T2H - T2E;
|
||||
{
|
||||
E T2N, T2U, T2Z, T32;
|
||||
T2N = T2L + T2M;
|
||||
T2U = T2Q - T2T;
|
||||
T2V = T2N + T2U;
|
||||
T3d = T2N - T2U;
|
||||
T2Z = T2X + T2Y;
|
||||
T32 = T30 - T31;
|
||||
T33 = T2Z + T32;
|
||||
T3f = T32 - T2Z;
|
||||
}
|
||||
{
|
||||
E T2J, T35, T34, T36;
|
||||
{
|
||||
E T2p, T2B, T2K, T2W;
|
||||
T2p = W[2];
|
||||
T2B = W[3];
|
||||
T2J = FNMS(T2B, T2I, T2p * T2A);
|
||||
T35 = FMA(T2B, T2A, T2p * T2I);
|
||||
T2K = W[4];
|
||||
T2W = W[5];
|
||||
T34 = FMA(T2K, T2V, T2W * T33);
|
||||
T36 = FNMS(T2W, T2V, T2K * T33);
|
||||
}
|
||||
Rp[WS(rs, 1)] = T2J - T34;
|
||||
Ip[WS(rs, 1)] = T35 + T36;
|
||||
Rm[WS(rs, 1)] = T2J + T34;
|
||||
Im[WS(rs, 1)] = T36 - T35;
|
||||
}
|
||||
{
|
||||
E T3b, T3h, T3g, T3i;
|
||||
{
|
||||
E T37, T39, T3c, T3e;
|
||||
T37 = W[18];
|
||||
T39 = W[19];
|
||||
T3b = FNMS(T39, T3a, T37 * T38);
|
||||
T3h = FMA(T39, T38, T37 * T3a);
|
||||
T3c = W[20];
|
||||
T3e = W[21];
|
||||
T3g = FMA(T3c, T3d, T3e * T3f);
|
||||
T3i = FNMS(T3e, T3d, T3c * T3f);
|
||||
}
|
||||
Rp[WS(rs, 5)] = T3b - T3g;
|
||||
Ip[WS(rs, 5)] = T3h + T3i;
|
||||
Rm[WS(rs, 5)] = T3b + T3g;
|
||||
Im[WS(rs, 5)] = T3i - T3h;
|
||||
}
|
||||
}
|
||||
/* Store stage D: writes indices 3 and 7; reads W[10..13] and W[26..29]. */
{
|
||||
E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
|
||||
T3l = KP707106781 * (T2D - T2C);
|
||||
T3m = T3k + T3l;
|
||||
T3E = T3k - T3l;
|
||||
T3o = KP707106781 * (T2v - T2y);
|
||||
T3q = T3o + T3p;
|
||||
T3G = T3p - T3o;
|
||||
{
|
||||
E T3t, T3u, T3x, T3y;
|
||||
T3t = T2L - T2M;
|
||||
T3u = T2X - T2Y;
|
||||
T3v = T3t + T3u;
|
||||
T3J = T3t - T3u;
|
||||
T3x = T31 + T30;
|
||||
T3y = T2Q + T2T;
|
||||
T3z = T3x - T3y;
|
||||
T3L = T3y + T3x;
|
||||
}
|
||||
{
|
||||
E T3r, T3B, T3A, T3C;
|
||||
{
|
||||
E T3j, T3n, T3s, T3w;
|
||||
T3j = W[10];
|
||||
T3n = W[11];
|
||||
T3r = FNMS(T3n, T3q, T3j * T3m);
|
||||
T3B = FMA(T3n, T3m, T3j * T3q);
|
||||
T3s = W[12];
|
||||
T3w = W[13];
|
||||
T3A = FMA(T3s, T3v, T3w * T3z);
|
||||
T3C = FNMS(T3w, T3v, T3s * T3z);
|
||||
}
|
||||
Rp[WS(rs, 3)] = T3r - T3A;
|
||||
Ip[WS(rs, 3)] = T3B + T3C;
|
||||
Rm[WS(rs, 3)] = T3r + T3A;
|
||||
Im[WS(rs, 3)] = T3C - T3B;
|
||||
}
|
||||
{
|
||||
E T3H, T3N, T3M, T3O;
|
||||
{
|
||||
E T3D, T3F, T3I, T3K;
|
||||
T3D = W[26];
|
||||
T3F = W[27];
|
||||
T3H = FNMS(T3F, T3G, T3D * T3E);
|
||||
T3N = FMA(T3F, T3E, T3D * T3G);
|
||||
T3I = W[28];
|
||||
T3K = W[29];
|
||||
T3M = FMA(T3I, T3J, T3K * T3L);
|
||||
T3O = FNMS(T3K, T3J, T3I * T3L);
|
||||
}
|
||||
Rp[WS(rs, 7)] = T3H - T3M;
|
||||
Ip[WS(rs, 7)] = T3N + T3O;
|
||||
Rm[WS(rs, 7)] = T3H + T3M;
|
||||
Im[WS(rs, 7)] = T3O - T3N;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to X(khc2c_register) for this variant of hc2cbdft_16.
 * The trailing { 168, 46, 38, 0 } matches the generator's summary for
 * this variant: 168 additions, 46 multiplications, 38 fused multiply/adds.
 * NOTE(review): the leading 16 is presumably the transform size --
 * confirm against the hc2c_desc declaration.
 */
static const hc2c_desc desc = {
     16,
     "hc2cbdft_16",
     twinstr,
     &GENUS,
     { 168, 46, 38, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: hands the hc2cbdft_16 kernel and its
 * descriptor to planner `p` via X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cbdft_16) (planner *p)
{
     X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
131
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft_2.c
Normal file
131
fftw-3.3.10/rdft/scalar/r2cb/hc2cbdft_2.c
Normal file
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cbdft_2 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 10 FP additions, 4 FP multiplications,
|
||||
* (or, 8 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 15 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft_2 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the kernel reads Rp[0], Ip[0], Rm[0], Im[0] and
 * the pair W[0], W[1], then overwrites the same four data locations.
 * Rp/Ip advance by +ms and Rm/Im by -ms per iteration; W starts at
 * offset (mb - 1) * 2 and advances by 2.
 */
static void hc2cbdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the twiddle pair belonging to the first iteration. */
     W += (mb - 1) * 2;

     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Every input is loaded before any output is stored. */
          const E ip0 = Ip[0];
          const E im0 = Im[0];
          const E imag_diff = ip0 - im0;
          const E imag_sum = ip0 + im0;

          const E rp0 = Rp[0];
          const E rm0 = Rm[0];
          const E real_diff = rp0 - rm0;
          const E real_sum = rp0 + rm0;

          /* Products kept in named temporaries, as in the generated form. */
          const E w1 = W[1];
          const E w1_real_diff = w1 * real_diff;
          const E w0 = W[0];
          const E w0_real_diff = w0 * real_diff;

          /*
           * prod_re and prod_im are the real and imaginary parts of
           * (w0 + i*w1) * (real_diff + i*imag_sum).
           */
          const E prod_re = FNMS(w1, imag_sum, w0_real_diff);
          const E prod_im = FMA(w0, imag_sum, w1_real_diff);

          Ip[0] = imag_diff + prod_re;
          Im[0] = prod_re - imag_diff;
          Rp[0] = real_sum - prod_im;
          Rm[0] = real_sum + prod_im;
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to X(khc2c_register) for this variant of hc2cbdft_2.
 * The trailing { 8, 2, 2, 0 } matches the generator's summary for this
 * variant: 8 additions, 2 multiplications, 2 fused multiply/adds.
 * NOTE(review): the leading 2 is presumably the transform size --
 * confirm against the hc2c_desc declaration.
 */
static const hc2c_desc desc = {
     2,
     "hc2cbdft_2",
     twinstr,
     &GENUS,
     { 8, 2, 2, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: hands the hc2cbdft_2 kernel and its
 * descriptor to planner `p` via X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cbdft_2) (planner *p)
{
     X(khc2c_register) (p, hc2cbdft_2, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cbdft_2 -include rdft/scalar/hc2cb.h */
|
||||
|
||||
/*
|
||||
* This function contains 10 FP additions, 4 FP multiplications,
|
||||
* (or, 8 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 9 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cb.h"
|
||||
|
||||
/*
 * hc2cbdft_2 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the kernel reads Rp[0], Ip[0], Rm[0], Im[0] and
 * the pair W[0], W[1], then overwrites the same four data locations.
 * Rp/Ip advance by +ms and Rm/Im by -ms per iteration; W starts at
 * offset (mb - 1) * 2 and advances by 2.
 */
static void hc2cbdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the twiddle pair belonging to the first iteration. */
     W += (mb - 1) * 2;

     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Every input is loaded before any output is stored. */
          const E ip0 = Ip[0];
          const E im0 = Im[0];
          const E imag_diff = ip0 - im0;
          const E imag_sum = ip0 + im0;

          const E rp0 = Rp[0];
          const E rm0 = Rm[0];
          const E real_diff = rp0 - rm0;
          const E real_sum = rp0 + rm0;

          const E w0 = W[0];
          const E w1 = W[1];

          /*
           * prod_re and prod_im are the real and imaginary parts of
           * (w0 + i*w1) * (real_diff + i*imag_sum).
           */
          const E prod_re = FNMS(w1, imag_sum, w0 * real_diff);
          const E prod_im = FMA(w1, real_diff, w0 * imag_sum);

          Ip[0] = imag_diff + prod_re;
          Rp[0] = real_sum - prod_im;
          Im[0] = prod_re - imag_diff;
          Rm[0] = real_sum + prod_im;
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to X(khc2c_register) for this variant of hc2cbdft_2.
 * The trailing { 8, 2, 2, 0 } matches the generator's summary for this
 * variant: 8 additions, 2 multiplications, 2 fused multiply/adds.
 * NOTE(review): the leading 2 is presumably the transform size --
 * confirm against the hc2c_desc declaration.
 */
static const hc2c_desc desc = {
     2,
     "hc2cbdft_2",
     twinstr,
     &GENUS,
     { 8, 2, 2, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: hands the hc2cbdft_2 kernel and its
 * descriptor to planner `p` via X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cbdft_2) (planner *p)
{
     X(khc2c_register) (p, hc2cbdft_2, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user