This commit is contained in:
2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
# Automake input for the rdft directory: lists the sources that are linked
# into the libtool library librdft.la and the subdirectories to recurse into.

# Put the top-level source directory on the preprocessor include path.
AM_CPPFLAGS = -I $(top_srcdir)
# Subdirectories handled by the recursive make targets.
SUBDIRS = scalar simd
# "noinst_" prefix: the library is built but never installed on its own.
noinst_LTLIBRARIES = librdft.la
# Second group of sources, kept in its own variable and appended to
# librdft_la_SOURCES below.
RDFT2 = buffered2.c direct2.c nop2.c rank0-rdft2.c rank-geq2-rdft2.c \
plan2.c problem2.c solve2.c vrank-geq1-rdft2.c rdft2-rdft.c \
rdft2-tensor-max-index.c rdft2-inplace-strides.c rdft2-strides.c \
khc2c.c ct-hc2c.h ct-hc2c.c ct-hc2c-direct.c
# Complete source list of librdft.la (headers are listed so that they are
# distributed); ends with the RDFT2 group defined above.
librdft_la_SOURCES = hc2hc.h hc2hc.c dft-r2hc.c dht-r2hc.c dht-rader.c \
buffered.c codelet-rdft.h conf.c direct-r2r.c direct-r2c.c generic.c \
hc2hc-direct.c hc2hc-generic.c khc2hc.c kr2c.c kr2r.c indirect.c nop.c \
plan.c problem.c rank0.c rank-geq2.c rdft.h rdft-dht.c solve.c \
vrank-geq1.c vrank3-transpose.c $(RDFT2)

View File

@@ -0,0 +1,910 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
VPATH = @srcdir@
# Shell test used to detect GNU make.  It succeeds only when MAKELEVEL is
# non-empty and, in addition, either MAKE_HOST is set or both MAKE_VERSION
# and CURDIR are set; in every other case it fails.
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
# Subshell tests that succeed when make is running with the single-letter
# option n (dry run) or k (keep going) respectively.  Each one sets
# target_option and delegates the flag parsing to
# am__make_running_with_option.
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
subdir = rdft
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
librdft_la_LIBADD =
am__objects_1 = buffered2.lo direct2.lo nop2.lo rank0-rdft2.lo \
rank-geq2-rdft2.lo plan2.lo problem2.lo solve2.lo \
vrank-geq1-rdft2.lo rdft2-rdft.lo rdft2-tensor-max-index.lo \
rdft2-inplace-strides.lo rdft2-strides.lo khc2c.lo ct-hc2c.lo \
ct-hc2c-direct.lo
am_librdft_la_OBJECTS = hc2hc.lo dft-r2hc.lo dht-r2hc.lo dht-rader.lo \
buffered.lo conf.lo direct-r2r.lo direct-r2c.lo generic.lo \
hc2hc-direct.lo hc2hc-generic.lo khc2hc.lo kr2c.lo kr2r.lo \
indirect.lo nop.lo plan.lo problem.lo rank0.lo rank-geq2.lo \
rdft-dht.lo solve.lo vrank-geq1.lo vrank3-transpose.lo \
$(am__objects_1)
librdft_la_OBJECTS = $(am_librdft_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__maybe_remake_depfiles = depfiles
am__depfiles_remade = ./$(DEPDIR)/buffered.Plo \
./$(DEPDIR)/buffered2.Plo ./$(DEPDIR)/conf.Plo \
./$(DEPDIR)/ct-hc2c-direct.Plo ./$(DEPDIR)/ct-hc2c.Plo \
./$(DEPDIR)/dft-r2hc.Plo ./$(DEPDIR)/dht-r2hc.Plo \
./$(DEPDIR)/dht-rader.Plo ./$(DEPDIR)/direct-r2c.Plo \
./$(DEPDIR)/direct-r2r.Plo ./$(DEPDIR)/direct2.Plo \
./$(DEPDIR)/generic.Plo ./$(DEPDIR)/hc2hc-direct.Plo \
./$(DEPDIR)/hc2hc-generic.Plo ./$(DEPDIR)/hc2hc.Plo \
./$(DEPDIR)/indirect.Plo ./$(DEPDIR)/khc2c.Plo \
./$(DEPDIR)/khc2hc.Plo ./$(DEPDIR)/kr2c.Plo \
./$(DEPDIR)/kr2r.Plo ./$(DEPDIR)/nop.Plo ./$(DEPDIR)/nop2.Plo \
./$(DEPDIR)/plan.Plo ./$(DEPDIR)/plan2.Plo \
./$(DEPDIR)/problem.Plo ./$(DEPDIR)/problem2.Plo \
./$(DEPDIR)/rank-geq2-rdft2.Plo ./$(DEPDIR)/rank-geq2.Plo \
./$(DEPDIR)/rank0-rdft2.Plo ./$(DEPDIR)/rank0.Plo \
./$(DEPDIR)/rdft-dht.Plo ./$(DEPDIR)/rdft2-inplace-strides.Plo \
./$(DEPDIR)/rdft2-rdft.Plo ./$(DEPDIR)/rdft2-strides.Plo \
./$(DEPDIR)/rdft2-tensor-max-index.Plo ./$(DEPDIR)/solve.Plo \
./$(DEPDIR)/solve2.Plo ./$(DEPDIR)/vrank-geq1-rdft2.Plo \
./$(DEPDIR)/vrank-geq1.Plo ./$(DEPDIR)/vrank3-transpose.Plo
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
SOURCES = $(librdft_la_SOURCES)
DIST_SOURCES = $(librdft_la_SOURCES)
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
ctags-recursive dvi-recursive html-recursive info-recursive \
install-data-recursive install-dvi-recursive \
install-exec-recursive install-html-recursive \
install-info-recursive install-pdf-recursive \
install-ps-recursive install-recursive installcheck-recursive \
installdirs-recursive pdf-recursive ps-recursive \
tags-recursive uninstall-recursive
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
distclean-recursive maintainer-clean-recursive
am__recursive_targets = \
$(RECURSIVE_TARGETS) \
$(RECURSIVE_CLEAN_TARGETS) \
$(am__extra_recursive_targets)
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
distdir distdir-am
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
DIST_SUBDIRS = $(SUBDIRS)
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
# Shell fragment that computes a relative path.  The caller sets the shell
# variables dir1 and dir2 before expanding it; on exit reldir holds dir2
# rewritten so that it is expressed relative to dir1.  dir1 is walked one
# leading component at a time: a ".." component pulls the last component of
# the tracked directory (dir0, initially `pwd`) onto the front of dir2, a
# component equal to the first component of dir2 is dropped from dir2, and
# any other component prepends "../" to dir2.  Both dir1 and dir2 are
# overwritten in the process.
am__relativize = \
dir0=`pwd`; \
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
sed_rest='s,^[^/]*/*,,'; \
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
sed_butlast='s,/*[^/]*$$,,'; \
while test -n "$$dir1"; do \
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
if test "$$first" != "."; then \
if test "$$first" = ".."; then \
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
else \
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
if test "$$first2" = "$$first"; then \
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
else \
dir2="../$$dir2"; \
fi; \
dir0="$$dir0"/"$$first"; \
fi; \
fi; \
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
done; \
reldir="$$dir2"
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
AM_CPPFLAGS = -I $(top_srcdir)
SUBDIRS = scalar simd
noinst_LTLIBRARIES = librdft.la
RDFT2 = buffered2.c direct2.c nop2.c rank0-rdft2.c rank-geq2-rdft2.c \
plan2.c problem2.c solve2.c vrank-geq1-rdft2.c rdft2-rdft.c \
rdft2-tensor-max-index.c rdft2-inplace-strides.c rdft2-strides.c \
khc2c.c ct-hc2c.h ct-hc2c.c ct-hc2c-direct.c
librdft_la_SOURCES = hc2hc.h hc2hc.c dft-r2hc.c dht-r2hc.c dht-rader.c \
buffered.c codelet-rdft.h conf.c direct-r2r.c direct-r2c.c generic.c \
hc2hc-direct.c hc2hc-generic.c khc2hc.c kr2c.c kr2r.c indirect.c nop.c \
plan.c problem.c rank0.c rank-geq2.c rdft.h rdft-dht.c solve.c \
vrank-geq1.c vrank3-transpose.c $(RDFT2)
all: all-recursive
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu rdft/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
esac;
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
# Remove the libraries listed in noinst_LTLIBRARIES, then remove a
# "so_locations" file from the directory of each library ("." is used for
# library names that contain no slash).  Nothing is done when the list is
# empty.
clean-noinstLTLIBRARIES:
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
@list='$(noinst_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
librdft.la: $(librdft_la_OBJECTS) $(librdft_la_DEPENDENCIES) $(EXTRA_librdft_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) $(librdft_la_OBJECTS) $(librdft_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffered2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/conf.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct-hc2c-direct.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ct-hc2c.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dft-r2hc.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dht-r2hc.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dht-rader.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct-r2c.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct-r2r.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/direct2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/generic.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc-direct.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc-generic.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2hc.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/indirect.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/khc2c.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/khc2hc.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kr2c.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kr2r.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nop2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/plan2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2-rdft2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank-geq2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank0-rdft2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rank0.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft-dht.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-inplace-strides.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-rdft.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-strides.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rdft2-tensor-max-index.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/solve2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1-rdft2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank-geq1.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vrank3-transpose.Plo@am__quote@ # am--include-marker
# Create each dependency file that does not exist yet as a one-line
# '# dummy' placeholder.  The containing directory is created first, and the
# content is written to a temporary "<name>-t" file that is then moved into
# place.
$(am__depfiles_remade):
@$(MKDIR_P) $(@D)
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
# Aggregate target that depends on every placeholder listed in
# am__depfiles_remade.
am--depfiles: $(am__depfiles_remade)
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
# This directory's subdirectories are mostly independent; you can cd
# into them and run 'make' without going through this Makefile.
# To change the values of 'make' variables: instead of editing Makefiles,
# (1) if the variable is set in 'config.status', edit 'config.status'
# (which will cause the Makefiles to be regenerated when you run 'make');
# (2) otherwise, pass the desired values on the 'make' command line.
#
# Shared recipe for every *-recursive target.  It strips the "-recursive"
# suffix to obtain the real target and runs that target in each
# subdirectory (DIST_SUBDIRS for distclean-* and maintainer-clean-*,
# SUBDIRS otherwise).  A "." entry in the list is built as "<target>-am";
# if no "." entry was seen, "<target>-am" is run in this directory after the
# loop.  A failing sub-make aborts the recipe at once, unless make is
# running with -k, in which case the failure is recorded in $fail and
# reported through the final exit status.
$(am__recursive_targets):
@fail=; \
if $(am__make_keepgoing); then \
failcom='fail=yes'; \
else \
failcom='exit 1'; \
fi; \
dot_seen=no; \
target=`echo $@ | sed s/-recursive//`; \
case "$@" in \
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
*) list='$(SUBDIRS)' ;; \
esac; \
for subdir in $$list; do \
echo "Making $$target in $$subdir"; \
if test "$$subdir" = "."; then \
dot_seen=yes; \
local_target="$$target-am"; \
else \
local_target="$$target"; \
fi; \
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|| eval $$failcom; \
done; \
if test "$$dot_seen" = "no"; then \
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
fi; test -z "$$fail"
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-recursive
TAGS: tags
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
include_option=--etags-include; \
empty_fix=.; \
else \
include_option=--include; \
empty_fix=; \
fi; \
list='$(SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
test ! -f $$subdir/TAGS || \
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
fi; \
done; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-recursive
CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-recursive
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) distdir-am
distdir-am: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
$(am__make_dryrun) \
|| test -d "$(distdir)/$$subdir" \
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|| exit 1; \
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
$(am__relativize); \
new_distdir=$$reldir; \
dir1=$$subdir; dir2="$(top_distdir)"; \
$(am__relativize); \
new_top_distdir=$$reldir; \
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
($(am__cd) $$subdir && \
$(MAKE) $(AM_MAKEFLAGS) \
top_distdir="$$new_top_distdir" \
distdir="$$new_distdir" \
am__remove_distdir=: \
am__skip_length_check=: \
am__skip_mode_fix=: \
distdir) \
|| exit 1; \
fi; \
done
check-am: all-am
check: check-recursive
all-am: Makefile $(LTLIBRARIES)
installdirs: installdirs-recursive
installdirs-am:
install: install-recursive
install-exec: install-exec-recursive
install-data: install-data-recursive
uninstall: uninstall-recursive
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-recursive
# Re-run the "install" target with INSTALL_PROGRAM and install_sh_PROGRAM
# both replaced by INSTALL_STRIP_PROGRAM and with INSTALL_STRIP_FLAG=-s.
# When STRIP is non-empty its value is additionally handed to the install
# program as STRIPPROG through INSTALL_PROGRAM_ENV.
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
mostlyclean-generic:
clean-generic:
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
clean: clean-recursive
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
mostlyclean-am
distclean: distclean-recursive
-rm -f ./$(DEPDIR)/buffered.Plo
-rm -f ./$(DEPDIR)/buffered2.Plo
-rm -f ./$(DEPDIR)/conf.Plo
-rm -f ./$(DEPDIR)/ct-hc2c-direct.Plo
-rm -f ./$(DEPDIR)/ct-hc2c.Plo
-rm -f ./$(DEPDIR)/dft-r2hc.Plo
-rm -f ./$(DEPDIR)/dht-r2hc.Plo
-rm -f ./$(DEPDIR)/dht-rader.Plo
-rm -f ./$(DEPDIR)/direct-r2c.Plo
-rm -f ./$(DEPDIR)/direct-r2r.Plo
-rm -f ./$(DEPDIR)/direct2.Plo
-rm -f ./$(DEPDIR)/generic.Plo
-rm -f ./$(DEPDIR)/hc2hc-direct.Plo
-rm -f ./$(DEPDIR)/hc2hc-generic.Plo
-rm -f ./$(DEPDIR)/hc2hc.Plo
-rm -f ./$(DEPDIR)/indirect.Plo
-rm -f ./$(DEPDIR)/khc2c.Plo
-rm -f ./$(DEPDIR)/khc2hc.Plo
-rm -f ./$(DEPDIR)/kr2c.Plo
-rm -f ./$(DEPDIR)/kr2r.Plo
-rm -f ./$(DEPDIR)/nop.Plo
-rm -f ./$(DEPDIR)/nop2.Plo
-rm -f ./$(DEPDIR)/plan.Plo
-rm -f ./$(DEPDIR)/plan2.Plo
-rm -f ./$(DEPDIR)/problem.Plo
-rm -f ./$(DEPDIR)/problem2.Plo
-rm -f ./$(DEPDIR)/rank-geq2-rdft2.Plo
-rm -f ./$(DEPDIR)/rank-geq2.Plo
-rm -f ./$(DEPDIR)/rank0-rdft2.Plo
-rm -f ./$(DEPDIR)/rank0.Plo
-rm -f ./$(DEPDIR)/rdft-dht.Plo
-rm -f ./$(DEPDIR)/rdft2-inplace-strides.Plo
-rm -f ./$(DEPDIR)/rdft2-rdft.Plo
-rm -f ./$(DEPDIR)/rdft2-strides.Plo
-rm -f ./$(DEPDIR)/rdft2-tensor-max-index.Plo
-rm -f ./$(DEPDIR)/solve.Plo
-rm -f ./$(DEPDIR)/solve2.Plo
-rm -f ./$(DEPDIR)/vrank-geq1-rdft2.Plo
-rm -f ./$(DEPDIR)/vrank-geq1.Plo
-rm -f ./$(DEPDIR)/vrank3-transpose.Plo
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
dvi: dvi-recursive
dvi-am:
html: html-recursive
html-am:
info: info-recursive
info-am:
install-data-am:
install-dvi: install-dvi-recursive
install-dvi-am:
install-exec-am:
install-html: install-html-recursive
install-html-am:
install-info: install-info-recursive
install-info-am:
install-man:
install-pdf: install-pdf-recursive
install-pdf-am:
install-ps: install-ps-recursive
install-ps-am:
installcheck-am:
maintainer-clean: maintainer-clean-recursive
-rm -f ./$(DEPDIR)/buffered.Plo
-rm -f ./$(DEPDIR)/buffered2.Plo
-rm -f ./$(DEPDIR)/conf.Plo
-rm -f ./$(DEPDIR)/ct-hc2c-direct.Plo
-rm -f ./$(DEPDIR)/ct-hc2c.Plo
-rm -f ./$(DEPDIR)/dft-r2hc.Plo
-rm -f ./$(DEPDIR)/dht-r2hc.Plo
-rm -f ./$(DEPDIR)/dht-rader.Plo
-rm -f ./$(DEPDIR)/direct-r2c.Plo
-rm -f ./$(DEPDIR)/direct-r2r.Plo
-rm -f ./$(DEPDIR)/direct2.Plo
-rm -f ./$(DEPDIR)/generic.Plo
-rm -f ./$(DEPDIR)/hc2hc-direct.Plo
-rm -f ./$(DEPDIR)/hc2hc-generic.Plo
-rm -f ./$(DEPDIR)/hc2hc.Plo
-rm -f ./$(DEPDIR)/indirect.Plo
-rm -f ./$(DEPDIR)/khc2c.Plo
-rm -f ./$(DEPDIR)/khc2hc.Plo
-rm -f ./$(DEPDIR)/kr2c.Plo
-rm -f ./$(DEPDIR)/kr2r.Plo
-rm -f ./$(DEPDIR)/nop.Plo
-rm -f ./$(DEPDIR)/nop2.Plo
-rm -f ./$(DEPDIR)/plan.Plo
-rm -f ./$(DEPDIR)/plan2.Plo
-rm -f ./$(DEPDIR)/problem.Plo
-rm -f ./$(DEPDIR)/problem2.Plo
-rm -f ./$(DEPDIR)/rank-geq2-rdft2.Plo
-rm -f ./$(DEPDIR)/rank-geq2.Plo
-rm -f ./$(DEPDIR)/rank0-rdft2.Plo
-rm -f ./$(DEPDIR)/rank0.Plo
-rm -f ./$(DEPDIR)/rdft-dht.Plo
-rm -f ./$(DEPDIR)/rdft2-inplace-strides.Plo
-rm -f ./$(DEPDIR)/rdft2-rdft.Plo
-rm -f ./$(DEPDIR)/rdft2-strides.Plo
-rm -f ./$(DEPDIR)/rdft2-tensor-max-index.Plo
-rm -f ./$(DEPDIR)/solve.Plo
-rm -f ./$(DEPDIR)/solve2.Plo
-rm -f ./$(DEPDIR)/vrank-geq1-rdft2.Plo
-rm -f ./$(DEPDIR)/vrank-geq1.Plo
-rm -f ./$(DEPDIR)/vrank3-transpose.Plo
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-recursive
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-recursive
pdf-am:
ps: ps-recursive
ps-am:
uninstall-am:
.MAKE: $(am__recursive_targets) install-am install-strip
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
am--depfiles check check-am clean clean-generic clean-libtool \
clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
distclean-compile distclean-generic distclean-libtool \
distclean-tags distdir dvi dvi-am html html-am info info-am \
install install-am install-data install-data-am install-dvi \
install-dvi-am install-exec install-exec-am install-html \
install-html-am install-info install-info-am install-man \
install-pdf install-pdf-am install-ps install-ps-am \
install-strip installcheck installcheck-am installdirs \
installdirs-am maintainer-clean maintainer-clean-generic \
mostlyclean mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
uninstall-am
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

337
fftw-3.3.10/rdft/buffered.c Normal file
View File

@@ -0,0 +1,337 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Solver record for the buffered rdft solver: the generic solver
header plus the index of the maxnbufs[] entry this instance uses. */
typedef struct {
solver super;
size_t maxnbuf_ndx; /* index into maxnbufs[] */
} S;
/* Values passed as the last argument of X(nbuf) when choosing the
number of buffers; X(rdft_buffered_register) registers one solver
per entry. */
static const INT maxnbufs[] = { 8, 256 };
/* Plan record for the buffered rdft solver (filled in by mkplan). */
typedef struct {
plan_rdft super;
/* cld: transform between the user array and the buffers;
cldcpy: rank-0 copy between the buffers and the user array;
cldrest: the vl % nbuf transforms left after the full batches */
plan *cld, *cldcpy, *cldrest;
/* n: transform size; vl: vector length; nbuf: vectors handled per
batch; bufdist: distance between consecutive buffers (the scratch
area holds nbuf * bufdist elements of type R) */
INT n, vl, nbuf, bufdist;
/* ivs * nbuf and ovs * nbuf: how far I and O advance per batch */
INT ivs_by_nbuf, ovs_by_nbuf;
} P;
/* Buffered execution for the non-HC2R case: every full batch of nbuf
   vectors is transformed from I into a scratch area by cld and then
   copied from the scratch area to O by cldcpy.  Whatever is left after
   the full batches is handled by cldrest. */
static void apply(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     plan_rdft *xform = (plan_rdft *) ego->cld;
     plan_rdft *copy = (plan_rdft *) ego->cldcpy;
     plan_rdft *rest = (plan_rdft *) ego->cldrest;
     const INT vl = ego->vl, nbuf = ego->nbuf;
     const INT istep = ego->ivs_by_nbuf, ostep = ego->ovs_by_nbuf;
     R *scratch = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
     INT done;

     /* `done' counts the vectors covered once the current batch ends */
     for (done = nbuf; done <= vl; done += nbuf) {
          xform->apply((plan *) xform, I, scratch);  /* I -> scratch */
          copy->apply((plan *) copy, scratch, O);    /* scratch -> O */
          I += istep;
          O += ostep;
     }

     X(ifree)(scratch);

     /* leftover transforms, if any */
     rest->apply((plan *) rest, I, O);
}
/* Buffered execution for HC2R problems: cldcpy first copies a batch of
   inputs into the scratch area, then cld transforms scratch -> O, so
   that it is the scratch area (not the caller's input) that may be
   destroyed by the transform. */
static void apply_hc2r(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     plan_rdft *xform = (plan_rdft *) ego->cld;
     plan_rdft *gather = (plan_rdft *) ego->cldcpy;
     plan_rdft *rest = (plan_rdft *) ego->cldrest;
     const INT vl = ego->vl, nbuf = ego->nbuf;
     const INT istep = ego->ivs_by_nbuf, ostep = ego->ovs_by_nbuf;
     R *scratch = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
     INT done;

     /* one pass per full batch of nbuf vectors */
     for (done = nbuf; done <= vl; done += nbuf) {
          gather->apply((plan *) gather, I, scratch);  /* I -> scratch */
          xform->apply((plan *) xform, scratch, O);    /* scratch -> O */
          I += istep;
          O += ostep;
     }

     X(ifree)(scratch);

     /* leftover transforms, if any */
     rest->apply((plan *) rest, I, O);
}
/* Propagate the wakefulness change to the child plans, in the order
   cld, cldcpy, cldrest. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[3];
     int k;

     kids[0] = self->cld;
     kids[1] = self->cldcpy;
     kids[2] = self->cldrest;
     for (k = 0; k < 3; ++k)
          X(plan_awake)(kids[k], wakefulness);
}
/* Destroy the child plans, in the order cldrest, cldcpy, cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *kids[3];
     int k;

     kids[0] = self->cldrest;
     kids[1] = self->cldcpy;
     kids[2] = self->cld;
     for (k = 0; k < 3; ++k)
          X(plan_destroy_internal)(kids[k]);
}
/* Emit the textual description of this plan through printer p: the
   size fields followed by the three child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;

     p->print(p,
              "(rdft-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
              pln->n,
              pln->nbuf,
              pln->vl,
              pln->bufdist % pln->n,
              pln->cld,
              pln->cldcpy,
              pln->cldrest);
}
/* Structural applicability test; returns 1 if this solver instance may
handle problem p_ and 0 otherwise.  The problem must be a rank-1
transform with vector rank <= 1.  Sizes that X(toobig) rejects are
refused under CONSERVE_MEMORY, and so are instances that
X(nbuf_redundant) reports as redundant.  Out of place (I != O), HC2R is
accepted only when NO_DESTROY_INPUT is set and other kinds only when
the output stride exceeds 1.  In place, either the strides must pass
X(tensor_inplace_strides2) or the whole vector must fit in the buffers. */
static int applicable0(const S *ego, const problem *p_, const planner *plnr)
{
const problem_rdft *p = (const problem_rdft *) p_;
iodim *d = p->sz->dims;
if (1
&& p->vecsz->rnk <= 1
&& p->sz->rnk == 1
) {
INT vl, ivs, ovs;
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
if (X(toobig)(d[0].n) && CONSERVE_MEMORYP(plnr))
return 0;
/* if this solver is redundant, in the sense that a solver
of lower index generates the same plan, then prune this
solver */
if (X(nbuf_redundant)(d[0].n, vl,
ego->maxnbuf_ndx,
maxnbufs, NELEM(maxnbufs)))
return 0;
/* out-of-place problems are decided entirely inside this branch */
if (p->I != p->O) {
if (p->kind[0] == HC2R) {
/* Allow HC2R problems only if the input is to be
preserved. This solver sets NO_DESTROY_INPUT,
which prevents infinite loops */
return (NO_DESTROY_INPUTP(plnr));
} else {
/*
In principle, the buffered transforms might be useful
when working out of place. However, in order to
prevent infinite loops in the planner, we require
that the output stride of the buffered transforms be
greater than 1.
*/
return (d[0].os > 1);
}
}
/*
* If the problem is in place, the input/output strides must
* be the same or the whole thing must fit in the buffer.
*/
if (X(tensor_inplace_strides2)(p->sz, p->vecsz))
return 1;
if (/* fits into buffer: */
((p->vecsz->rnk == 0)
||
(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
maxnbufs[ego->maxnbuf_ndx])
== p->vecsz->dims[0].n)))
return 1;
}
return 0;
}
/* Full applicability test: planner-flag policy on top of applicable0().
   Returns 0 when buffering is disabled or applicable0() rejects the
   problem.  Under NO_UGLY it additionally rejects in-place HC2R
   problems that are too big and, for every other kind, problems that
   are out of place or too big. */
static int applicable(const S *ego, const problem *p_, const planner *plnr)
{
     const problem_rdft *p;

     if (NO_BUFFERINGP(plnr) || !applicable0(ego, p_, plnr))
          return 0;

     p = (const problem_rdft *) p_;

     if (NO_UGLYP(plnr)) {
          if (p->kind[0] == HC2R) {
               /* UGLY if in-place and too big, since the problem
                  could be solved via transpositions */
               if (p->I == p->O && X(toobig)(p->sz->dims[0].n))
                    return 0;
          } else {
               if (p->I != p->O || X(toobig)(p->sz->dims[0].n))
                    return 0;
          }
     }

     return 1;
}
/* Build a buffered plan for problem p_.  Returns a null plan when the
solver is not applicable or when any child plan cannot be created.
Three children are planned: cld (the transform between the user array
and a set of nbuf buffers), cldcpy (a rank-0 copy between the buffers
and the user array) and cldrest (the vl % nbuf vectors left over after
the full batches).  The scratch buffer allocated here is used only
while planning; apply()/apply_hc2r() allocate their own. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
P *pln;
const S *ego = (const S *)ego_;
plan *cld = (plan *) 0;
plan *cldcpy = (plan *) 0;
plan *cldrest = (plan *) 0;
const problem_rdft *p = (const problem_rdft *) p_;
R *bufs = (R *) 0;
INT nbuf = 0, bufdist, n, vl;
INT ivs, ovs;
int hc2rp;
static const plan_adt padt = {
X(rdft_solve), awake, print, destroy
};
if (!applicable(ego, p_, plnr))
goto nada;
n = X(tensor_sz)(p->sz);
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
hc2rp = (p->kind[0] == HC2R);
nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
bufdist = X(bufdist)(n, vl);
A(nbuf > 0);
/* initial allocation for the purpose of planning */
bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
if (hc2rp) {
/* allow destruction of buffer */
/* cld: buffers (stride 1, bufdist apart) -> output array */
cld = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_d)(
X(mktensor_1d)(n, 1, p->sz->dims[0].os),
X(mktensor_1d)(nbuf, bufdist, ovs),
bufs, TAINT(p->O, ovs * nbuf), p->kind),
0, 0, NO_DESTROY_INPUT);
if (!cld) goto nada;
/* copying input into buffer is a rank-0 transform: */
cldcpy = X(mkplan_d)(plnr,
X(mkproblem_rdft_0_d)(
X(mktensor_2d)(nbuf, ivs, bufdist,
n, p->sz->dims[0].is, 1),
TAINT(p->I, ivs * nbuf), bufs));
if (!cldcpy) goto nada;
} else {
/* allow destruction of input if problem is in place */
/* cld: input array -> buffers (stride 1, bufdist apart) */
cld = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_d)(
X(mktensor_1d)(n, p->sz->dims[0].is, 1),
X(mktensor_1d)(nbuf, ivs, bufdist),
TAINT(p->I, ivs * nbuf), bufs, p->kind),
0, 0, (p->I == p->O) ? NO_DESTROY_INPUT : 0);
if (!cld) goto nada;
/* copying back from the buffer is a rank-0 transform: */
cldcpy = X(mkplan_d)(plnr,
X(mkproblem_rdft_0_d)(
X(mktensor_2d)(nbuf, bufdist, ovs,
n, 1, p->sz->dims[0].os),
bufs, TAINT(p->O, ovs * nbuf)));
if (!cldcpy) goto nada;
}
/* deallocate buffers, let apply() allocate them for real */
X(ifree)(bufs);
bufs = 0;
/* plan the leftover transforms (cldrest): */
{
/* offsets of the first vector not covered by a full batch */
INT id = ivs * (nbuf * (vl / nbuf));
INT od = ovs * (nbuf * (vl / nbuf));
cldrest = X(mkplan_d)(plnr,
X(mkproblem_rdft_d)(
X(tensor_copy)(p->sz),
X(mktensor_1d)(vl % nbuf, ivs, ovs),
p->I + id, p->O + od, p->kind));
}
if (!cldrest) goto nada;
pln = MKPLAN_RDFT(P, &padt, hc2rp ? apply_hc2r : apply);
pln->cld = cld;
pln->cldcpy = cldcpy;
pln->cldrest = cldrest;
pln->n = n;
pln->vl = vl;
pln->ivs_by_nbuf = ivs * nbuf;
pln->ovs_by_nbuf = ovs * nbuf;
pln->nbuf = nbuf;
pln->bufdist = bufdist;
{
/* ops = (vl / nbuf) * (cld + cldcpy) + cldrest */
opcnt t;
X(ops_add)(&cld->ops, &cldcpy->ops, &t);
X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
}
return &(pln->super.super);
/* failure exit: release the planning buffer and any children
created so far */
nada:
X(ifree0)(bufs);
X(plan_destroy_internal)(cldrest);
X(plan_destroy_internal)(cldcpy);
X(plan_destroy_internal)(cld);
return (plan *) 0;
}
/* Create a buffered-rdft solver bound to entry maxnbuf_ndx of
   maxnbufs[]. */
static solver *mksolver(size_t maxnbuf_ndx)
{
     static const solver_adt adt = { PROBLEM_RDFT, mkplan, 0 };
     S *self = MKSOLVER(S, &adt);

     self->maxnbuf_ndx = maxnbuf_ndx;
     return &self->super;
}
/* Register one buffered-rdft solver per entry of maxnbufs[]. */
void X(rdft_buffered_register)(planner *p)
{
     size_t ndx = 0;

     while (ndx < NELEM(maxnbufs)) {
          REGISTER_SOLVER(p, mksolver(ndx));
          ++ndx;
     }
}

View File

@@ -0,0 +1,375 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* buffering of rdft2. We always buffer the complex array */
#include "rdft/rdft.h"
#include "dft/dft.h"
/* Solver record for the buffered rdft2 solver: the generic solver
header plus the index of the maxnbufs[] entry this instance uses. */
typedef struct {
solver super;
size_t maxnbuf_ndx; /* index into maxnbufs[] */
} S;
/* Values passed as the last argument of X(nbuf) when choosing the
number of buffers; X(rdft2_buffered_register) registers one solver
per entry. */
static const INT maxnbufs[] = { 8, 256 };
/* Plan record for the buffered rdft2 solver (filled in by mkplan). */
typedef struct {
plan_rdft2 super;
/* cld: rdft2 transform between the real array and the buffers;
cldcpy: rank-0 DFT that copies between the buffers and the complex
array; cldrest: the vl % nbuf transforms left after the full batches */
plan *cld, *cldcpy, *cldrest;
/* n: transform size; vl: vector length; nbuf: vectors handled per
batch; bufdist: distance between consecutive buffers */
INT n, vl, nbuf, bufdist;
/* ivs * nbuf and ovs * nbuf: pointer advance per batch */
INT ivs_by_nbuf, ovs_by_nbuf;
/* offsets (0 or 1) of the imaginary and real parts inside the buffer */
INT ioffset, roffset;
} P;
/* Buffered R2HC execution: every full batch of nbuf vectors is
   transformed from (r0, r1) into the scratch area by cld, and cldcpy
   then copies the scratch area to (cr, ci).  Whatever is left after
   the full batches is handled by cldrest. */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft2 *xform = (plan_rdft2 *) ego->cld;
     plan_dft *copy = (plan_dft *) ego->cldcpy;
     plan_rdft2 *rest = (plan_rdft2 *) ego->cldrest;
     const INT vl = ego->vl, nbuf = ego->nbuf;
     const INT istep = ego->ivs_by_nbuf, ostep = ego->ovs_by_nbuf;
     R *scratch = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
     R *sre = scratch + ego->roffset;   /* real parts in the scratch area */
     R *sim = scratch + ego->ioffset;   /* imaginary parts */
     INT done;

     /* one pass per full batch of nbuf vectors */
     for (done = nbuf; done <= vl; done += nbuf) {
          xform->apply((plan *) xform, r0, r1, sre, sim);
          copy->apply((plan *) copy, sre, sim, cr, ci);
          r0 += istep;
          r1 += istep;
          cr += ostep;
          ci += ostep;
     }

     X(ifree)(scratch);

     /* leftover transforms, if any */
     rest->apply((plan *) rest, r0, r1, cr, ci);
}
/* Buffered HC2R execution: cldcpy first copies a batch of (cr, ci)
   inputs into the scratch area, then cld is run with (r0, r1) and the
   scratch area, so that it is the scratch area (not the caller's
   input) that may be destroyed. */
static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft2 *xform = (plan_rdft2 *) ego->cld;
     plan_dft *gather = (plan_dft *) ego->cldcpy;
     plan_rdft2 *rest = (plan_rdft2 *) ego->cldrest;
     const INT vl = ego->vl, nbuf = ego->nbuf;
     const INT istep = ego->ivs_by_nbuf, ostep = ego->ovs_by_nbuf;
     R *scratch = (R *)MALLOC(sizeof(R) * nbuf * ego->bufdist, BUFFERS);
     R *sre = scratch + ego->roffset;   /* real parts in the scratch area */
     R *sim = scratch + ego->ioffset;   /* imaginary parts */
     INT done;

     /* one pass per full batch of nbuf vectors */
     for (done = nbuf; done <= vl; done += nbuf) {
          gather->apply((plan *) gather, cr, ci, sre, sim);
          xform->apply((plan *) xform, r0, r1, sre, sim);
          cr += istep;
          ci += istep;
          r0 += ostep;
          r1 += ostep;
     }

     X(ifree)(scratch);

     /* leftover transforms, if any */
     rest->apply((plan *) rest, r0, r1, cr, ci);
}
/* Propagate the wakefulness change to the child plans, in the order
   cld, cldcpy, cldrest. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[3];
     int k;

     kids[0] = self->cld;
     kids[1] = self->cldcpy;
     kids[2] = self->cldrest;
     for (k = 0; k < 3; ++k)
          X(plan_awake)(kids[k], wakefulness);
}
/* Destroy the child plans, in the order cldrest, cldcpy, cld. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *kids[3];
     int k;

     kids[0] = self->cldrest;
     kids[1] = self->cldcpy;
     kids[2] = self->cld;
     for (k = 0; k < 3; ++k)
          X(plan_destroy_internal)(kids[k]);
}
/* Emit the textual description of this plan through printer p: the
   size fields followed by the three child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;

     p->print(p,
              "(rdft2-buffered-%D%v/%D-%D%(%p%)%(%p%)%(%p%))",
              pln->n,
              pln->nbuf,
              pln->vl,
              pln->bufdist % pln->n,
              pln->cld,
              pln->cldcpy,
              pln->cldrest);
}
static int applicable0(const S *ego, const problem *p_, const planner *plnr)
{
const problem_rdft2 *p = (const problem_rdft2 *) p_;
iodim *d = p->sz->dims;
if (1
&& p->vecsz->rnk <= 1
&& p->sz->rnk == 1
/* we assume even n throughout */
&& (d[0].n % 2) == 0
/* and we only consider these two cases */
&& (p->kind == R2HC || p->kind == HC2R)
) {
INT vl, ivs, ovs;
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
if (X(toobig)(d[0].n) && CONSERVE_MEMORYP(plnr))
return 0;
/* if this solver is redundant, in the sense that a solver
of lower index generates the same plan, then prune this
solver */
if (X(nbuf_redundant)(d[0].n, vl,
ego->maxnbuf_ndx,
maxnbufs, NELEM(maxnbufs)))
return 0;
if (p->r0 != p->cr) {
if (p->kind == HC2R) {
/* Allow HC2R problems only if the input is to be
preserved. This solver sets NO_DESTROY_INPUT,
which prevents infinite loops */
return (NO_DESTROY_INPUTP(plnr));
} else {
/*
In principle, the buffered transforms might be useful
when working out of place. However, in order to
prevent infinite loops in the planner, we require
that the output stride of the buffered transforms be
greater than 2.
*/
return (d[0].os > 2);
}
}
/*
* If the problem is in place, the input/output strides must
* be the same or the whole thing must fit in the buffer.
*/
if (X(rdft2_inplace_strides(p, RNK_MINFTY)))
return 1;
if (/* fits into buffer: */
((p->vecsz->rnk == 0)
||
(X(nbuf)(d[0].n, p->vecsz->dims[0].n,
maxnbufs[ego->maxnbuf_ndx])
== p->vecsz->dims[0].n)))
return 1;
}
return 0;
}
/* Full applicability test: planner-flag policy on top of applicable0().
   Returns 0 when buffering is disabled or applicable0() rejects the
   problem.  Under NO_UGLY it additionally rejects in-place HC2R
   problems that are too big and, for the other kind, problems that
   are out of place or too big. */
static int applicable(const S *ego, const problem *p_, const planner *plnr)
{
     const problem_rdft2 *p;

     if (NO_BUFFERINGP(plnr) || !applicable0(ego, p_, plnr))
          return 0;

     p = (const problem_rdft2 *) p_;

     if (NO_UGLYP(plnr)) {
          if (p->kind == HC2R) {
               /* UGLY if in-place and too big, since the problem
                  could be solved via transpositions */
               if (p->r0 == p->cr && X(toobig)(p->sz->dims[0].n))
                    return 0;
          } else {
               if (p->r0 != p->cr || X(toobig)(p->sz->dims[0].n))
                    return 0;
          }
     }

     return 1;
}
/* Build a buffered rdft2 plan for problem p_.  Returns a null plan
when the solver is not applicable or when any child plan cannot be
created.  Three children are planned: cld (the rdft2 transform between
the real array and a set of nbuf buffers), cldcpy (a rank-0 DFT that
copies between the buffers and the complex array) and cldrest (the
vl % nbuf vectors left over after the full batches).  The scratch
buffer allocated here is used only while planning; the apply functions
allocate their own. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
P *pln;
const S *ego = (const S *)ego_;
plan *cld = (plan *) 0;
plan *cldcpy = (plan *) 0;
plan *cldrest = (plan *) 0;
const problem_rdft2 *p = (const problem_rdft2 *) p_;
R *bufs = (R *) 0;
INT nbuf = 0, bufdist, n, vl;
INT ivs, ovs, ioffset, roffset, id, od;
static const plan_adt padt = {
X(rdft2_solve), awake, print, destroy
};
if (!applicable(ego, p_, plnr))
goto nada;
n = X(tensor_sz)(p->sz);
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
nbuf = X(nbuf)(n, vl, maxnbufs[ego->maxnbuf_ndx]);
bufdist = X(bufdist)(n + 2, vl); /* complex-side rdft2 stores N+2
real numbers */
A(nbuf > 0);
/* attempt to keep real and imaginary part in the same order,
so as to allow optimizations in the copy plan */
roffset = (p->cr - p->ci > 0) ? (INT)1 : (INT)0;
ioffset = 1 - roffset;
/* initial allocation for the purpose of planning */
bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
/* offsets of the first vector not covered by a full batch */
id = ivs * (nbuf * (vl / nbuf));
od = ovs * (nbuf * (vl / nbuf));
if (p->kind == R2HC) {
/* allow destruction of input if problem is in place */
cld = X(mkplan_f_d)(
plnr,
X(mkproblem_rdft2_d)(
X(mktensor_1d)(n, p->sz->dims[0].is, 2),
X(mktensor_1d)(nbuf, ivs, bufdist),
TAINT(p->r0, ivs * nbuf), TAINT(p->r1, ivs * nbuf),
bufs + roffset, bufs + ioffset, p->kind),
0, 0, (p->r0 == p->cr) ? NO_DESTROY_INPUT : 0);
if (!cld) goto nada;
/* copying back from the buffer is a rank-0 DFT: */
cldcpy = X(mkplan_d)(
plnr,
X(mkproblem_dft_d)(
X(mktensor_0d)(),
X(mktensor_2d)(nbuf, bufdist, ovs,
n/2+1, 2, p->sz->dims[0].os),
bufs + roffset, bufs + ioffset,
TAINT(p->cr, ovs * nbuf), TAINT(p->ci, ovs * nbuf) ));
if (!cldcpy) goto nada;
/* planning buffer no longer needed */
X(ifree)(bufs); bufs = 0;
/* leftover transforms: real side offset by id, complex side by od */
cldrest = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(
X(tensor_copy)(p->sz),
X(mktensor_1d)(vl % nbuf, ivs, ovs),
p->r0 + id, p->r1 + id,
p->cr + od, p->ci + od,
p->kind));
if (!cldrest) goto nada;
pln = MKPLAN_RDFT2(P, &padt, apply_r2hc);
} else {
/* allow destruction of buffer */
cld = X(mkplan_f_d)(
plnr,
X(mkproblem_rdft2_d)(
X(mktensor_1d)(n, 2, p->sz->dims[0].os),
X(mktensor_1d)(nbuf, bufdist, ovs),
TAINT(p->r0, ovs * nbuf), TAINT(p->r1, ovs * nbuf),
bufs + roffset, bufs + ioffset, p->kind),
0, 0, NO_DESTROY_INPUT);
if (!cld) goto nada;
/* copying input into buffer is a rank-0 DFT: */
cldcpy = X(mkplan_d)(
plnr,
X(mkproblem_dft_d)(
X(mktensor_0d)(),
X(mktensor_2d)(nbuf, ivs, bufdist,
n/2+1, p->sz->dims[0].is, 2),
TAINT(p->cr, ivs * nbuf), TAINT(p->ci, ivs * nbuf),
bufs + roffset, bufs + ioffset));
if (!cldcpy) goto nada;
/* planning buffer no longer needed */
X(ifree)(bufs); bufs = 0;
/* leftover transforms: real side offset by od, complex side by id */
cldrest = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(
X(tensor_copy)(p->sz),
X(mktensor_1d)(vl % nbuf, ivs, ovs),
p->r0 + od, p->r1 + od,
p->cr + id, p->ci + id,
p->kind));
if (!cldrest) goto nada;
pln = MKPLAN_RDFT2(P, &padt, apply_hc2r);
}
pln->cld = cld;
pln->cldcpy = cldcpy;
pln->cldrest = cldrest;
pln->n = n;
pln->vl = vl;
pln->ivs_by_nbuf = ivs * nbuf;
pln->ovs_by_nbuf = ovs * nbuf;
pln->roffset = roffset;
pln->ioffset = ioffset;
pln->nbuf = nbuf;
pln->bufdist = bufdist;
{
/* ops = (vl / nbuf) * (cld + cldcpy) + cldrest */
opcnt t;
X(ops_add)(&cld->ops, &cldcpy->ops, &t);
X(ops_madd)(vl / nbuf, &t, &cldrest->ops, &pln->super.super.ops);
}
return &(pln->super.super);
/* failure exit: release the planning buffer and any children
created so far */
nada:
X(ifree0)(bufs);
X(plan_destroy_internal)(cldrest);
X(plan_destroy_internal)(cldcpy);
X(plan_destroy_internal)(cld);
return (plan *) 0;
}
/* Create a buffered-rdft2 solver bound to entry maxnbuf_ndx of
   maxnbufs[]. */
static solver *mksolver(size_t maxnbuf_ndx)
{
     static const solver_adt adt = { PROBLEM_RDFT2, mkplan, 0 };
     S *self = MKSOLVER(S, &adt);

     self->maxnbuf_ndx = maxnbuf_ndx;
     return &self->super;
}
/* Register one buffered-rdft2 solver per entry of maxnbufs[]. */
void X(rdft2_buffered_register)(planner *p)
{
     size_t ndx = 0;

     while (ndx < NELEM(maxnbufs)) {
          REGISTER_SOLVER(p, mksolver(ndx));
          ++ndx;
     }
}

View File

@@ -0,0 +1,172 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/*
* This header file must include every file or define every
* type or macro which is required to compile a codelet.
*/
#ifndef __RDFT_CODELET_H__
#define __RDFT_CODELET_H__
#include "kernel/ifftw.h"
/**************************************************************
* types of codelets
**************************************************************/
/* FOOab, with a,b in {0,1}, denotes the FOO transform
where a/b say whether the input/output are shifted by
half a sample/slot. */
/* NOTE: the enumerator order is significant; the *_KINDP macros
defined after this enum classify kinds by numeric range. */
typedef enum {
R2HC00, R2HC01, R2HC10, R2HC11,
HC2R00, HC2R01, HC2R10, HC2R11,
DHT,
REDFT00, REDFT01, REDFT10, REDFT11, /* real-even == DCT's */
RODFT00, RODFT01, RODFT10, RODFT11 /* real-odd == DST's */
} rdft_kind;
/* standard R2HC/HC2R transforms are unshifted */
#define R2HC R2HC00
#define HC2R HC2R00
/* aliases for two shifted variants: R2HCII is R2HC01 (b = 1) and
HC2RIII is HC2R10 (a = 1) */
#define R2HCII R2HC01
#define HC2RIII HC2R10
/* Range predicates over rdft_kind; each relies on the declaration
order of the enumerators. */
/* (k) >= R2HC00 produces a warning under gcc because checking x >= 0
is superfluous for unsigned values...but it is needed because other
compilers (e.g. icc) may define the enum to be a signed int...grrr. */
#define R2HC_KINDP(k) ((k) >= R2HC00 && (k) <= R2HC11) /* uses kr2hc_genus */
#define HC2R_KINDP(k) ((k) >= HC2R00 && (k) <= HC2R11) /* uses khc2r_genus */
/* true for DHT and every enumerator declared after it (REDFT*, RODFT*) */
#define R2R_KINDP(k) ((k) >= DHT) /* uses kr2r_genus */
#define REDFT_KINDP(k) ((k) >= REDFT00 && (k) <= REDFT11)
#define RODFT_KINDP(k) ((k) >= RODFT00 && (k) <= RODFT11)
/* true for any REDFT or RODFT kind (REDFT00 through RODFT11) */
#define REODFT_KINDP(k) ((k) >= REDFT00 && (k) <= RODFT11)
/* codelets with real input (output) and complex output (input) */
typedef struct kr2c_desc_s kr2c_desc;
/* genus record referenced from kr2c_desc: a transform kind plus a vl
value (NOTE(review): presumably the codelet's vector length -- confirm) */
typedef struct {
rdft_kind kind;
INT vl;
} kr2c_genus;
/* descriptor passed to X(kr2c_register) together with the codelet */
struct kr2c_desc_s {
INT n; /* size of transform computed */
const char *nam; /* codelet name string */
opcnt ops; /* operation counts */
const kr2c_genus *genus;
};
/* codelet entry point; judging by the parameter names, R0/R1 address
the real data, Cr/Ci the complex data, rs/csr/csi are their strides,
and vl/ivs/ovs describe a vector loop (TODO confirm against a codelet) */
typedef void (*kr2c) (R *R0, R *R1, R *Cr, R *Ci,
stride rs, stride csr, stride csi,
INT vl, INT ivs, INT ovs);
void X(kr2c_register)(planner *p, kr2c codelet, const kr2c_desc *desc);
/* half-complex to half-complex DIT/DIF codelets: */
typedef struct hc2hc_desc_s hc2hc_desc;
/* genus record referenced from hc2hc_desc: a transform kind plus a vl
value (NOTE(review): presumably the codelet's vector length -- confirm) */
typedef struct {
rdft_kind kind;
INT vl;
} hc2hc_genus;
/* descriptor passed to X(khc2hc_register) together with the codelet */
struct hc2hc_desc_s {
INT radix;
const char *nam; /* codelet name string */
const tw_instr *tw; /* twiddle-factor instructions */
const hc2hc_genus *genus;
opcnt ops; /* operation counts */
};
/* codelet entry point: operates on rioarray/iioarray using twiddle
table W; mb, me and ms presumably give the range and stride of the
loop over m -- TODO confirm */
typedef void (*khc2hc) (R *rioarray, R *iioarray, const R *W,
stride rs, INT mb, INT me, INT ms);
void X(khc2hc_register)(planner *p, khc2hc codelet, const hc2hc_desc *desc);
/* half-complex to rdft2-complex DIT/DIF codelets: */
typedef struct hc2c_desc_s hc2c_desc;
/* selector passed as the last argument of X(khc2c_register) */
typedef enum {
HC2C_VIA_RDFT,
HC2C_VIA_DFT
} hc2c_kind;
/* genus record referenced from hc2c_desc */
typedef struct {
/* predicate taking the pointers, stride rs, range [mb, me) and
stride ms a codelet would be called with, plus the planner; solvers
treat a nonzero result as "codelet usable for this layout" */
int (*okp)(
const R *Rp, const R *Ip, const R *Rm, const R *Im,
INT rs, INT mb, INT me, INT ms,
const planner *plnr);
rdft_kind kind;
INT vl;
} hc2c_genus;
/* descriptor passed to X(khc2c_register) together with the codelet */
struct hc2c_desc_s {
INT radix;
const char *nam; /* codelet name string */
const tw_instr *tw; /* twiddle-factor instructions */
const hc2c_genus *genus;
opcnt ops; /* operation counts */
};
/* codelet entry point: same leading arguments as okp, with the
twiddle table W in place of the planner */
typedef void (*khc2c) (R *Rp, R *Ip, R *Rm, R *Im, const R *W,
stride rs, INT mb, INT me, INT ms);
void X(khc2c_register)(planner *p, khc2c codelet, const hc2c_desc *desc,
hc2c_kind hc2ckind);
/* Solver tables defined elsewhere.  r2cf and r2cb are the two
non-SIMD tables; each of the others is named after a SIMD instruction
set. */
extern const solvtab X(solvtab_rdft_r2cf);
extern const solvtab X(solvtab_rdft_r2cb);
extern const solvtab X(solvtab_rdft_sse2);
extern const solvtab X(solvtab_rdft_avx);
extern const solvtab X(solvtab_rdft_avx_128_fma);
extern const solvtab X(solvtab_rdft_avx2);
extern const solvtab X(solvtab_rdft_avx2_128);
extern const solvtab X(solvtab_rdft_avx512);
extern const solvtab X(solvtab_rdft_kcvi);
extern const solvtab X(solvtab_rdft_altivec);
extern const solvtab X(solvtab_rdft_vsx);
extern const solvtab X(solvtab_rdft_neon);
extern const solvtab X(solvtab_rdft_generic_simd128);
extern const solvtab X(solvtab_rdft_generic_simd256);
/* real-input & output DFT-like codelets (DHT, etc.) */
typedef struct kr2r_desc_s kr2r_desc;
/* genus record referenced from kr2r_desc; unlike the other genus
types it carries only vl (the kind is stored in the descriptor) */
typedef struct {
INT vl;
} kr2r_genus;
/* descriptor passed to X(kr2r_register) together with the codelet */
struct kr2r_desc_s {
INT n; /* size of transform computed */
const char *nam; /* codelet name string */
opcnt ops; /* operation counts */
const kr2r_genus *genus;
rdft_kind kind;
};
/* codelet entry point: reads I, writes O; is/os are their strides and
vl/ivs/ovs presumably describe a vector loop -- TODO confirm */
typedef void (*kr2r) (const R *I, R *O, stride is, stride os,
INT vl, INT ivs, INT ovs);
void X(kr2r_register)(planner *p, kr2r codelet, const kr2r_desc *desc);
extern const solvtab X(solvtab_rdft_r2r);
#endif /* __RDFT_CODELET_H__ */

105
fftw-3.3.10/rdft/conf.c Normal file
View File

@@ -0,0 +1,105 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Registration hooks for the rdft and rdft2 solvers.  This is the
first table X(rdft_conf_standard) executes, before the codelet
tables. */
static const solvtab s =
{
SOLVTAB(X(rdft_indirect_register)),
SOLVTAB(X(rdft_rank0_register)),
SOLVTAB(X(rdft_vrank3_transpose_register)),
SOLVTAB(X(rdft_vrank_geq1_register)),
SOLVTAB(X(rdft_nop_register)),
SOLVTAB(X(rdft_buffered_register)),
SOLVTAB(X(rdft_generic_register)),
SOLVTAB(X(rdft_rank_geq2_register)),
SOLVTAB(X(dft_r2hc_register)),
SOLVTAB(X(rdft_dht_register)),
SOLVTAB(X(dht_r2hc_register)),
SOLVTAB(X(dht_rader_register)),
SOLVTAB(X(rdft2_vrank_geq1_register)),
SOLVTAB(X(rdft2_nop_register)),
SOLVTAB(X(rdft2_rank0_register)),
SOLVTAB(X(rdft2_buffered_register)),
SOLVTAB(X(rdft2_rank_geq2_register)),
SOLVTAB(X(rdft2_rdft_register)),
SOLVTAB(X(hc2hc_generic_register)),
SOLVTAB_END /* end-of-table marker */
};
/* Run the standard rdft solver tables against planner p: first the
   local table s, then the r2cf, r2cb and r2r tables, then each SIMD
   table that was compiled in (HAVE_*).  Every SIMD table except the
   two generic ones is additionally gated on a run-time
   X(have_simd_*)() check. */
void X(rdft_conf_standard)(planner *p)
{
     X(solvtab_exec)(s, p);
     X(solvtab_exec)(X(solvtab_rdft_r2cf), p);
     X(solvtab_exec)(X(solvtab_rdft_r2cb), p);
     X(solvtab_exec)(X(solvtab_rdft_r2r), p);

#if HAVE_SSE2
     if (X(have_simd_sse2)()) {
          X(solvtab_exec)(X(solvtab_rdft_sse2), p);
     }
#endif
#if HAVE_AVX
     if (X(have_simd_avx)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx), p);
     }
#endif
#if HAVE_AVX_128_FMA
     if (X(have_simd_avx_128_fma)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx_128_fma), p);
     }
#endif
#if HAVE_AVX2
     /* two independent AVX2 tables, each with its own run-time check */
     if (X(have_simd_avx2)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx2), p);
     }
     if (X(have_simd_avx2_128)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx2_128), p);
     }
#endif
#if HAVE_AVX512
     if (X(have_simd_avx512)()) {
          X(solvtab_exec)(X(solvtab_rdft_avx512), p);
     }
#endif
#if HAVE_KCVI
     if (X(have_simd_kcvi)()) {
          X(solvtab_exec)(X(solvtab_rdft_kcvi), p);
     }
#endif
#if HAVE_ALTIVEC
     if (X(have_simd_altivec)()) {
          X(solvtab_exec)(X(solvtab_rdft_altivec), p);
     }
#endif
#if HAVE_VSX
     if (X(have_simd_vsx)()) {
          X(solvtab_exec)(X(solvtab_rdft_vsx), p);
     }
#endif
#if HAVE_NEON
     if (X(have_simd_neon)()) {
          X(solvtab_exec)(X(solvtab_rdft_neon), p);
     }
#endif
     /* the generic SIMD tables have no run-time check */
#if HAVE_GENERIC_SIMD128
     X(solvtab_exec)(X(solvtab_rdft_generic_simd128), p);
#endif
#if HAVE_GENERIC_SIMD256
     X(solvtab_exec)(X(solvtab_rdft_generic_simd256), p);
#endif
}

View File

@@ -0,0 +1,404 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ct-hc2c.h"
/* Solver record for the direct hc2c solver. */
typedef struct {
hc2c_solver super;
const hc2c_desc *desc; /* codelet descriptor (radix, genus, twiddle instructions, name) */
int bufferedp; /* nonzero selects the buffered variant */
khc2c k; /* codelet function pointer */
} S;
/* Plan record for the direct hc2c solver. */
typedef struct {
plan_hc2c super;
khc2c k; /* codelet invoked by the apply functions */
plan *cld0, *cldm; /* children for 0th and middle butterflies */
/* r: radix; m: number of rows; v: number of vector-loop iterations;
extra_iter: 1 if the last codelet iteration is run separately, else 0 */
INT r, m, v, extra_iter;
INT ms, vs; /* stride between rows; stride between vector-loop iterations */
stride rs, brs; /* rs: stride handed to the codelet on the user array; brs: the same for the batch buffer */
twid *td; /* twiddle factors, managed by awake() */
const S *slv; /* solver that created this plan */
} P;
/*************************************************************
Nonbuffered code
*************************************************************/
/* Unbuffered execution.  For each of the v vectors: run cld0 in place
   at row 0, run the codelet over rows 1 .. (m+1)/2 - 1 (paired with
   rows counted down from m-1), then run cldm in place at row m/2. */
static void apply(const plan *ego_, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft2 *first = (plan_rdft2 *) ego->cld0;
     plan_rdft2 *mid = (plan_rdft2 *) ego->cldm;
     const INT m = ego->m, ms = ego->ms, vs = ego->vs;
     INT left;

     for (left = ego->v; left > 0; --left) {
          R *mr = cr + (m/2)*ms;   /* middle row, real part */
          R *mi = ci + (m/2)*ms;   /* middle row, imaginary part */

          first->apply((plan *) first, cr, ci, cr, ci);

          ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
                 ego->td->W, ego->rs, 1, (m+1)/2, ms);

          mid->apply((plan *) mid, mr, mi, mr, mi);

          cr += vs;
          ci += vs;
     }
}
/* Unbuffered execution when the plan was created with extra_iter set:
   same structure as apply(), but the codelet range is split into
   1 .. mm-1 and a separate final call (see the comment in the loop). */
static void apply_extra_iter(const plan *ego_, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft2 *first = (plan_rdft2 *) ego->cld0;
     plan_rdft2 *mid = (plan_rdft2 *) ego->cldm;
     const INT m = ego->m, ms = ego->ms, vs = ego->vs;
     const INT mm = (m-1)/2;
     INT left;

     for (left = ego->v; left > 0; --left) {
          R *mr = cr + (m/2)*ms;   /* middle row, real part */
          R *mi = ci + (m/2)*ms;   /* middle row, imaginary part */

          first->apply((plan *) first, cr, ci, cr, ci);

          /* for 4-way SIMD when (m+1)/2-1 is odd: iterate over an
             even vector length MM-1, and then execute the last
             iteration as a 2-vector with vector stride 0.  The
             twiddle factors of the second half of the last iteration
             are bogus, but we only store the results of the first
             half. */
          ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
                 ego->td->W, ego->rs, 1, mm, ms);
          ego->k(cr + mm*ms, ci + mm*ms, cr + (m-mm)*ms, ci + (m-mm)*ms,
                 ego->td->W, ego->rs, mm, mm+2, 0);

          mid->apply((plan *) mid, mr, mi, mr, mi);

          cr += vs;
          ci += vs;
     }
}
/*************************************************************
Buffered code
*************************************************************/
/* Batch size used by the buffered code: radix rounded up to a
   multiple of 4, plus 2.  It should not be 2^k, to avoid
   associativity conflicts. */
static INT compute_batchsize(INT radix)
{
     const INT rounded = (radix + 3) & -4;   /* round up to multiple of 4 */

     return rounded + 2;
}
/* Run the codelet on rows [mb, me) of one vector through the scratch
buffer bufp.  The Rp/Ip rows are copied in going forward from row mb
and the Rm/Im rows going backward (negative strides); the codelet is
then applied to the buffer with row stride 2, and the results are
copied back out with the mirrored strides.  When extra_iter is nonzero
one additional, zero-filled row is appended to the buffer and included
in the codelet range; its results are not copied back. */
static void dobatch(const P *ego, R *Rp, R *Ip, R *Rm, R *Im,
INT mb, INT me, INT extra_iter, R *bufp)
{
INT b = WS(ego->brs, 1);
INT rs = WS(ego->rs, 1);
INT ms = ego->ms;
/* the "minus" half is laid out backward from offset b - 2 */
R *bufm = bufp + b - 2;
INT n = me - mb; /* number of rows in this batch */
/* copy in: user array -> buffer */
X(cpy2d_pair_ci)(Rp + mb * ms, Ip + mb * ms, bufp, bufp + 1,
ego->r / 2, rs, b,
n, ms, 2);
X(cpy2d_pair_ci)(Rm - mb * ms, Im - mb * ms, bufm, bufm + 1,
ego->r / 2, rs, b,
n, -ms, -2);
if (extra_iter) {
/* initialize the extra_iter element to 0. It would be ok
to leave it uninitialized, since we transform uninitialized
data and ignore the result. However, we want to avoid
FP exceptions in case somebody is trapping them. */
A(n < compute_batchsize(ego->r));
X(zero1d_pair)(bufp + 2*n, bufp + 1 + 2*n, ego->r / 2, b);
X(zero1d_pair)(bufm - 2*n, bufm + 1 - 2*n, ego->r / 2, b);
}
/* transform in the buffer */
ego->k(bufp, bufp + 1, bufm, bufm + 1, ego->td->W,
ego->brs, mb, me + extra_iter, 2);
/* copy out: buffer -> user array (only the n real rows) */
X(cpy2d_pair_co)(bufp, bufp + 1, Rp + mb * ms, Ip + mb * ms,
ego->r / 2, b, rs,
n, 2, ms);
X(cpy2d_pair_co)(bufm, bufm + 1, Rm - mb * ms, Im - mb * ms,
ego->r / 2, b, rs,
n, -2, -ms);
}
/* Buffered execution.  For each of the v vectors: run cld0 in place at
   row 0, push rows 1 .. me-1 through dobatch() in chunks of batchsz
   (the final, possibly shorter chunk carries extra_iter), then run
   cldm in place at row me = (m+1)/2.  One scratch buffer is allocated
   up front and shared by all batches. */
static void apply_buf(const plan *ego_, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft2 *first = (plan_rdft2 *) ego->cld0;
     plan_rdft2 *mid = (plan_rdft2 *) ego->cldm;
     const INT ms = ego->ms;
     const INT batchsz = compute_batchsize(ego->r);
     const INT mb = 1, me = (ego->m+1) / 2;
     size_t bufsz = ego->r * batchsz * 2 * sizeof(R);
     R *buf;
     INT left;

     BUF_ALLOC(R *, buf, bufsz);

     for (left = ego->v; left > 0; --left) {
          R *Rm = cr + ego->m * ms;
          R *Im = ci + ego->m * ms;
          INT j = mb;

          first->apply((plan *) first, cr, ci, cr, ci);

          /* full batches */
          while (j + batchsz < me) {
               dobatch(ego, cr, ci, Rm, Im, j, j + batchsz, 0, buf);
               j += batchsz;
          }

          /* last batch */
          dobatch(ego, cr, ci, Rm, Im, j, me, ego->extra_iter, buf);

          mid->apply((plan *) mid,
                     cr + me * ms, ci + me * ms,
                     cr + me * ms, ci + me * ms);

          cr += ego->vs;
          ci += ego->vs;
     }

     BUF_FREE(buf, bufsz);
}
/*************************************************************
common code
*************************************************************/
/* Propagate the wakefulness change to both children, then let
   X(twiddle_awake) update the twiddle table ego->td for size r * m,
   radix r and (m - 1) / 2 + extra_iter rows. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     const tw_instr *instr = self->slv->desc->tw;

     X(plan_awake)(self->cld0, wakefulness);
     X(plan_awake)(self->cldm, wakefulness);

     X(twiddle_awake)(wakefulness, &self->td, instr,
                      self->r * self->m,
                      self->r,
                      (self->m - 1) / 2 + self->extra_iter);
}
/* Release what the plan owns: the two child plans first, then the two
   stride objects. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     /* children */
     X(plan_destroy_internal)(self->cld0);
     X(plan_destroy_internal)(self->cldm);

     /* strides */
     X(stride_destroy)(self->rs);
     X(stride_destroy)(self->brs);
}
/* Emit the textual description of this plan through printer p.  The
   buffered variant uses a different tag and additionally prints the
   batch size. */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const hc2c_desc *desc = pln->slv->desc;

     if (!pln->slv->bufferedp) {
          p->print(p, "(hc2c-direct-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
                   pln->r,
                   X(twiddle_length)(pln->r, desc->tw),
                   pln->extra_iter,
                   pln->v,
                   desc->nam,
                   pln->cld0,
                   pln->cldm);
          return;
     }

     p->print(p, "(hc2c-directbuf/%D-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
              compute_batchsize(pln->r),
              pln->r,
              X(twiddle_length)(pln->r, desc->tw),
              pln->extra_iter,
              pln->v,
              desc->nam,
              pln->cld0,
              pln->cldm);
}
/* Applicability test for the unbuffered variant.  Requires r and kind
to match the codelet descriptor, then asks the genus okp predicate
whether the codelet accepts the layout.  For the first vector it first
tries the full range 1 .. (m+1)/2 (*extra_iter = 0); if that is
rejected it tries the range 1 .. (m-1)/2 plus a separate two-row call
with stride 0 (*extra_iter = 1).  A final okp call repeats the check
with cr/ci advanced by vs, for the remaining vectors.  *extra_iter is
written as a side effect of the comma expressions, so the order of the
operands matters.  Returns nonzero if applicable. */
static int applicable0(const S *ego, rdft_kind kind,
INT r, INT rs,
INT m, INT ms,
INT v, INT vs,
const R *cr, const R *ci,
const planner *plnr,
INT *extra_iter)
{
const hc2c_desc *e = ego->desc;
UNUSED(v);
return (
1
&& r == e->radix
&& kind == e->genus->kind
/* first v-loop iteration */
&& ((*extra_iter = 0,
e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
rs, 1, (m+1)/2, ms, plnr))
||
(*extra_iter = 1,
((e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
rs, 1, (m-1)/2, ms, plnr))
&&
(e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
rs, (m-1)/2, (m-1)/2 + 2, 0, plnr)))))
/* subsequent v-loop iterations */
&& (cr += vs, ci += vs, 1)
&& e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
rs, 1, (m+1)/2 - *extra_iter, ms, plnr)
);
}
/* Applicability test for the buffered variant.
   The caller's cr/ci and all strides are ignored; okp() is instead asked
   about a synthetic layout: cr = null, ci = cr + 1, row stride
   brs = 4 * batchsz and element stride 2.
   okp() must accept a full batch (bounds 1 .. 1+batchsz) and the final
   partial batch of ((m-1)/2) % batchsz iterations, either as is
   (*extra_iter = 0) or with one more iteration (*extra_iter = 1).
   NOTE(review): ci is formed by arithmetic on a null pointer; presumably
   only the addresses' alignment matters to okp() -- confirm. */
static int applicable0_buf(const S *ego, rdft_kind kind,
INT r, INT rs,
INT m, INT ms,
INT v, INT vs,
const R *cr, const R *ci,
const planner *plnr, INT *extra_iter)
{
const hc2c_desc *e = ego->desc;
INT batchsz, brs;
UNUSED(v); UNUSED(rs); UNUSED(ms); UNUSED(vs);
return (
1
&& r == e->radix
&& kind == e->genus->kind
/* ignore cr, ci, use buffer */
&& (cr = (const R *)0, ci = cr + 1,
batchsz = compute_batchsize(r),
brs = 4 * batchsz, 1)
/* full batch */
&& e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
brs, 1, 1+batchsz, 2, plnr)
/* last (partial) batch, without and then with the extra iteration */
&& ((*extra_iter = 0,
e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
brs, 1, 1 + (((m-1)/2) % batchsz), 2, plnr))
||
(*extra_iter = 1,
e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
brs, 1, 1 + 1 + (((m-1)/2) % batchsz), 2, plnr)))
);
}
/* Full applicability test: dispatch to the buffered or unbuffered
   structural check, then reject plans that X(ct_uglyp) considers ugly
   when the planner has NO_UGLY set.  *extra_iter is filled in by the
   structural check. */
static int applicable(const S *ego, rdft_kind kind,
                      INT r, INT rs,
                      INT m, INT ms,
                      INT v, INT vs,
                      R *cr, R *ci,
                      const planner *plnr, INT *extra_iter)
{
     int ok;

     ok = ego->bufferedp
          ? applicable0_buf(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
                            extra_iter)
          : applicable0(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
                        extra_iter);
     if (!ok)
          return 0;

     if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
                                       v, m * r, r))
          return 0;

     return 1;
}
/* Build the twiddle plan for this solver (the hc2c_mkinferior callback
   registered in regone()).
   Returns a new plan, or a null plan pointer when the solver is not
   applicable or when either child plan cannot be created.
   Two rdft2 child plans are created:
     cld0 -- at cr/ci, size tensor (r, rs, rs), with the requested kind;
     cldm -- at cr + imid / ci + imid with imid = (m/2)*ms, of kind
             R2HCII (for R2HC) or HC2RIII otherwise; its size tensor is
             rank-0 when m is odd. */
static plan *mkcldw(const hc2c_solver *ego_, rdft_kind kind,
INT r, INT rs,
INT m, INT ms,
INT v, INT vs,
R *cr, R *ci,
planner *plnr)
{
const S *ego = (const S *) ego_;
P *pln;
const hc2c_desc *e = ego->desc;
plan *cld0 = 0, *cldm = 0;
INT imid = (m / 2) * ms;
INT extra_iter;
static const plan_adt padt = {
0, awake, print, destroy
};
if (!applicable(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
&extra_iter))
return (plan *)0;
cld0 = X(mkplan_d)(
plnr,
X(mkproblem_rdft2_d)(X(mktensor_1d)(r, rs, rs),
X(mktensor_0d)(),
TAINT(cr, vs), TAINT(ci, vs),
TAINT(cr, vs), TAINT(ci, vs),
kind));
if (!cld0) goto nada;
cldm = X(mkplan_d)(
plnr,
X(mkproblem_rdft2_d)(((m % 2) ?
X(mktensor_0d)() : X(mktensor_1d)(r, rs, rs) ),
X(mktensor_0d)(),
TAINT(cr + imid, vs), TAINT(ci + imid, vs),
TAINT(cr + imid, vs), TAINT(ci + imid, vs),
kind == R2HC ? R2HCII : HC2RIII));
if (!cldm) goto nada;
/* pick the apply routine: buffered, or unbuffered with/without the
   extra iteration reported by applicable() */
if (ego->bufferedp)
pln = MKPLAN_HC2C(P, &padt, apply_buf);
else
pln = MKPLAN_HC2C(P, &padt, extra_iter ? apply_extra_iter : apply);
pln->k = ego->k;
pln->td = 0;
pln->r = r; pln->rs = X(mkstride)(r, rs);
pln->m = m; pln->ms = ms;
pln->v = v; pln->vs = vs;
pln->slv = ego;
/* brs is created for both variants; destroy() releases it unconditionally */
pln->brs = X(mkstride)(r, 4 * compute_batchsize(r));
pln->cld0 = cld0;
pln->cldm = cldm;
pln->extra_iter = extra_iter;
/* cost: codelet ops scaled by v * ((m-1)/2) / vl, plus v times each child */
X(ops_zero)(&pln->super.super.ops);
X(ops_madd2)(v * (((m - 1) / 2) / e->genus->vl),
&e->ops, &pln->super.super.ops);
X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
/* the buffered variant is charged 4*r*m*v additional "other" ops */
if (ego->bufferedp)
pln->super.super.ops.other += 4 * r * m * v;
return &(pln->super.super);
nada:
/* failure path: cld0/cldm may still be null here */
X(plan_destroy_internal)(cld0);
X(plan_destroy_internal)(cldm);
return 0;
}
/* Create one hc2c solver for the given codelet, buffered or not, and
   register it with the planner. */
static void regone(planner *plnr, khc2c codelet,
                   const hc2c_desc *desc,
                   hc2c_kind hc2ckind,
                   int bufferedp)
{
     S *solver_obj;

     solver_obj = (S *)X(mksolver_hc2c)(sizeof(S), desc->radix, hc2ckind, mkcldw);
     solver_obj->k = codelet;
     solver_obj->desc = desc;
     solver_obj->bufferedp = bufferedp;

     REGISTER_SOLVER(plnr, &(solver_obj->super.super));
}
/* Register both flavours of the direct hc2c solver for a codelet:
   first the unbuffered one, then the buffered one. */
void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet,
                              const hc2c_desc *desc,
                              hc2c_kind hc2ckind)
{
     int bufferedp;

     for (bufferedp = 0; bufferedp < 2; ++bufferedp)
          regone(plnr, codelet, desc, hc2ckind, bufferedp);
}

296
fftw-3.3.10/rdft/ct-hc2c.c Normal file
View File

@@ -0,0 +1,296 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ct-hc2c.h"
#include "dft/dft.h"
/* Plan for an rdft2 problem solved by one child transform plus a
   twiddle (hc2c) step. */
typedef struct {
plan_rdft2 super;
plan *cld; /* child transform; applied as plan_rdft or plan_dft depending on the apply routine */
plan *cldw; /* twiddle plan produced by the solver's mkcldw callback */
INT r; /* radix chosen in mkplan() */
} P;
/* DIT via an rdft child: run the child from r0 into cr, then apply the
   twiddle plan on (cr, ci).  r1 is not used. */
static void apply_dit(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_rdft *child = (plan_rdft *) self->cld;
     plan_hc2c *twiddle = (plan_hc2c *) self->cldw;

     UNUSED(r1);

     child->apply(self->cld, r0, cr);
     twiddle->apply(self->cldw, cr, ci);
}
/* DIF via an rdft child: apply the twiddle plan on (cr, ci) first, then
   run the child from cr into r0.  r1 is not used. */
static void apply_dif(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_hc2c *twiddle = (plan_hc2c *) self->cldw;
     plan_rdft *child = (plan_rdft *) self->cld;

     UNUSED(r1);

     twiddle->apply(self->cldw, cr, ci);
     child->apply(self->cld, cr, r0);
}
/* DIT via a dft child: run the child from (r0, r1) into (cr, ci), then
   apply the twiddle plan on (cr, ci). */
static void apply_dit_dft(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_dft *child = (plan_dft *) self->cld;
     plan_hc2c *twiddle = (plan_hc2c *) self->cldw;

     child->apply(self->cld, r0, r1, cr, ci);
     twiddle->apply(self->cldw, cr, ci);
}
/* DIF via a dft child: apply the twiddle plan on (cr, ci), then run the
   child with the argument order (ci, cr, r1, r0). */
static void apply_dif_dft(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_hc2c *twiddle = (plan_hc2c *) self->cldw;
     plan_dft *child = (plan_dft *) self->cld;

     twiddle->apply(self->cldw, cr, ci);
     child->apply(self->cld, ci, cr, r1, r0);
}
/* Pass the wakefulness change on to the child and twiddle plans. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld, wakefulness);
     X(plan_awake)(self->cldw, wakefulness);
}
/* Destroy the twiddle plan, then the child plan. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cldw);
     X(plan_destroy_internal)(self->cld);
}
/* Print the plan as "rdft2-ct-dit" or "rdft2-ct-dif" (decided by which
   apply routine is installed), followed by the radix and both subplans. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const char *variant = "dif";

     if (self->super.apply == apply_dit ||
         self->super.apply == apply_dit_dft)
          variant = "dit";

     p->print(p, "(rdft2-ct-%s/%D%(%p%)%(%p%))",
              variant, self->r, self->cldw, self->cld);
}
/* Structural applicability: a rank-1 transform with vector rank <= 1,
   of kind R2HC or an HC2R whose input may be destroyed (or is in place),
   for which X(choose_radix) yields a positive radix smaller than n. */
static int applicable0(const hc2c_solver *ego, const problem *p_, planner *plnr)
{
     const problem_rdft2 *p = (const problem_rdft2 *) p_;
     INT n, r;

     if (p->sz->rnk != 1 || p->vecsz->rnk > 1)
          return 0;

     if (p->kind != R2HC) {
          /* the only other accepted kind is HC2R, which is solved by
             DIF and therefore destroys its input */
          if (p->kind != HC2R)
               return 0;
          if (p->r0 != p->cr && NO_DESTROY_INPUTP(plnr))
               return 0;
     }

     n = p->sz->dims[0].n;
     r = X(choose_radix)(ego->r, n);
     if (r <= 0)
          return 0;

     return n > r;
}
/* Applicability including planner policy: on top of applicable0(), a
   problem with a nonempty vector loop is accepted only when the planner
   does not have NO_VRECURSE set. */
static int hc2c_applicable(const hc2c_solver *ego, const problem *p_,
                           planner *plnr)
{
     const problem_rdft2 *p;

     if (!applicable0(ego, p_, plnr))
          return 0;

     p = (const problem_rdft2 *) p_;
     if (p->vecsz->rnk == 0)
          return 1;

     return !NO_VRECURSEP(plnr);
}
/* Create an rdft2 plan that splits a size-n transform into radix r and
   m = n / r.
   R2HC is built as DIT (child transform first, twiddle step second) and
   HC2R as DIF (twiddle step first, child second).  Depending on the
   solver's hc2ckind, the child is either an rdft problem or a dft
   problem.
   Returns a null plan pointer if the solver is not applicable or if any
   subplan cannot be created. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const hc2c_solver *ego = (const hc2c_solver *) ego_;
const problem_rdft2 *p;
P *pln = 0;
plan *cld = 0, *cldw = 0;
INT n, r, m, v, ivs, ovs;
iodim *d;
static const plan_adt padt = {
X(rdft2_solve), awake, print, destroy
};
if (!hc2c_applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_rdft2 *) p_;
d = p->sz->dims;
n = d[0].n;
r = X(choose_radix)(ego->r, n);
/* the radix is asserted to be even; r/2 is used for the child's vector loop */
A((r % 2) == 0);
m = n / r;
X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);
switch (p->kind) {
case R2HC:
/* twiddle plan works on the output arrays, using output strides */
cldw = ego->mkcldw(ego, R2HC,
r, m * d[0].os,
m, d[0].os,
v, ovs,
p->cr, p->ci, plnr);
if (!cldw) goto nada;
switch (ego->hc2ckind) {
case HC2C_VIA_RDFT:
/* child: size-m R2HC transforms from r0 to cr; the leading
   vector dimension of length 2 steps by r1 - r0 / ci - cr */
cld = X(mkplan_d)(
plnr,
X(mkproblem_rdft_1_d)(
X(mktensor_1d)(m, (r/2)*d[0].is, d[0].os),
X(mktensor_3d)(
2, p->r1 - p->r0, p->ci - p->cr,
r / 2, d[0].is, m * d[0].os,
v, ivs, ovs),
p->r0, p->cr, R2HC)
);
if (!cld) goto nada;
pln = MKPLAN_RDFT2(P, &padt, apply_dit);
break;
case HC2C_VIA_DFT:
/* child: size-m dft problems with (r0, r1) as input and (cr, ci) as output */
cld = X(mkplan_d)(
plnr,
X(mkproblem_dft_d)(
X(mktensor_1d)(m, (r/2)*d[0].is, d[0].os),
X(mktensor_2d)(
r / 2, d[0].is, m * d[0].os,
v, ivs, ovs),
p->r0, p->r1, p->cr, p->ci)
);
if (!cld) goto nada;
pln = MKPLAN_RDFT2(P, &padt, apply_dit_dft);
break;
}
break;
case HC2R:
/* twiddle plan works on the input arrays, using input strides */
cldw = ego->mkcldw(ego, HC2R,
r, m * d[0].is,
m, d[0].is,
v, ivs,
p->cr, p->ci, plnr);
if (!cldw) goto nada;
switch (ego->hc2ckind) {
case HC2C_VIA_RDFT:
/* child: size-m HC2R transforms from cr to r0 */
cld = X(mkplan_d)(
plnr,
X(mkproblem_rdft_1_d)(
X(mktensor_1d)(m, d[0].is, (r/2)*d[0].os),
X(mktensor_3d)(
2, p->ci - p->cr, p->r1 - p->r0,
r / 2, m * d[0].is, d[0].os,
v, ivs, ovs),
p->cr, p->r0, HC2R)
);
if (!cld) goto nada;
pln = MKPLAN_RDFT2(P, &padt, apply_dif);
break;
case HC2C_VIA_DFT:
/* child: dft problem with the argument order (ci, cr) -> (r1, r0),
   matching the call in apply_dif_dft() */
cld = X(mkplan_d)(
plnr,
X(mkproblem_dft_d)(
X(mktensor_1d)(m, d[0].is, (r/2)*d[0].os),
X(mktensor_2d)(
r / 2, m * d[0].is, d[0].os,
v, ivs, ovs),
p->ci, p->cr, p->r1, p->r0)
);
if (!cld) goto nada;
pln = MKPLAN_RDFT2(P, &padt, apply_dif_dft);
break;
}
break;
default:
/* other kinds are rejected by applicable0() */
A(0);
}
pln->cld = cld;
pln->cldw = cldw;
pln->r = r;
X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);
/* inherit could_prune_now_p attribute from cldw */
pln->super.super.could_prune_now_p = cldw->could_prune_now_p;
return &(pln->super.super);
nada:
/* failure path: cldw/cld may still be null here */
X(plan_destroy_internal)(cldw);
X(plan_destroy_internal)(cld);
return (plan *) 0;
}
/* Allocate an hc2c solver of `size` bytes and fill in its radix, kind
   and twiddle-plan factory. */
hc2c_solver *X(mksolver_hc2c)(size_t size, INT r,
                              hc2c_kind hc2ckind,
                              hc2c_mkinferior mkcldw)
{
     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
     hc2c_solver *result;

     result = (hc2c_solver *)X(mksolver)(size, &sadt);
     result->r = r;
     result->hc2ckind = hc2ckind;
     result->mkcldw = mkcldw;

     return result;
}
/* Allocate a plan of `size` bytes via X(mkplan), install the hc2c apply
   routine, and return it as a generic plan pointer. */
plan *X(mkplan_hc2c)(size_t size, const plan_adt *adt, hc2capply apply)
{
     plan_hc2c *pln = (plan_hc2c *) X(mkplan)(size, adt);

     pln->apply = apply;
     return &(pln->super);
}

View File

@@ -0,0 +1,57 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Apply routine of an hc2c (twiddle) plan; it is called with the cr and
   ci pointers only. */
typedef void (*hc2capply) (const plan *ego, R *cr, R *ci);
typedef struct hc2c_solver_s hc2c_solver;
/* Factory callback stored in an hc2c_solver: creates the twiddle plan
   for radix r (stride rs), m (stride ms) and vector loop v (stride vs)
   on the arrays cr/ci. */
typedef plan *(*hc2c_mkinferior)(const hc2c_solver *ego, rdft_kind kind,
INT r, INT rs,
INT m, INT ms,
INT v, INT vs,
R *cr, R *ci,
planner *plnr);
/* Common prefix of every hc2c plan: the generic plan followed by its
   apply routine. */
typedef struct {
plan super;
hc2capply apply;
} plan_hc2c;
/* Allocates a plan of `size` bytes and installs `apply` in it. */
extern plan *X(mkplan_hc2c)(size_t size, const plan_adt *adt,
hc2capply apply);
/* Typed convenience wrapper around X(mkplan_hc2c). */
#define MKPLAN_HC2C(type, adt, apply) \
(type *)X(mkplan_hc2c)(sizeof(type), adt, apply)
/* Common prefix of every hc2c solver. */
struct hc2c_solver_s {
solver super;
INT r; /* radix request, passed to X(choose_radix) by the solver's mkplan */
hc2c_mkinferior mkcldw; /* creates the twiddle plan */
hc2c_kind hc2ckind; /* selects the rdft-based or dft-based child problem */
};
/* Allocates an hc2c solver of `size` bytes with the given fields. */
hc2c_solver *X(mksolver_hc2c)(size_t size, INT r,
hc2c_kind hc2ckind,
hc2c_mkinferior mkcldw);
/* Registers the direct (codelet-based) hc2c solvers for one codelet. */
void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet,
const hc2c_desc *desc,
hc2c_kind hc2ckind);

194
fftw-3.3.10/rdft/dft-r2hc.c Normal file
View File

@@ -0,0 +1,194 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Compute the complex DFT by combining R2HC RDFTs on the real
and imaginary parts. This could be useful for people just wanting
to link to the real codelets and not the complex ones. It could
also even be faster than the complex algorithms for split (as opposed
to interleaved) real/imag complex data. */
#include "rdft/rdft.h"
#include "dft/dft.h"
/* Solver: carries no state beyond the generic solver. */
typedef struct {
solver super;
} S;
/* Plan: one child R2HC plan plus the data needed for post-processing. */
typedef struct {
plan_dft super;
plan *cld; /* child rdft plan, applied to ri + ishift / ro + oshift */
INT ishift, oshift; /* pointer offsets computed in mkplan() when negative input strides are flipped */
INT os; /* output stride of the transform dimension (0 for rank-0 problems) */
INT n; /* transform size (1 for rank-0 problems) */
} P;
/* Run the child R2HC plan (which covers both the real and imaginary
   arrays), then combine the two halfcomplex outputs into the complex DFT
   in place in ro/io.  ii is not referenced directly. */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     plan_rdft *child = (plan_rdft *) self->cld;
     const INT n = self->n;

     UNUSED(ii);

     /* transform vector of real & imag parts: */
     child->apply((plan *) child, ri + self->ishift, ro + self->oshift);

     if (n > 1) {
          const INT os = self->os;
          const INT half = (n + 1) / 2;
          INT k;

          for (k = 1; k < half; ++k) {
               const INT lo = os * k;
               const INT hi = os * (n - k);
               E re_lo, im_lo, re_hi, im_hi;

               re_lo = ro[lo];
               im_lo = io[lo];
               re_hi = ro[hi];
               im_hi = io[hi];

               ro[lo] = re_lo - im_hi;
               io[lo] = im_lo + re_hi;
               ro[hi] = re_lo + im_hi;
               io[hi] = im_lo - re_hi;
          }
     }
}
/* Forward the wakefulness change to the child plan. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld, wakefulness);
}
/* Destroy the child plan. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cld);
}
/* Print the plan name with its size, followed by the child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;

     p->print(p, "(dft-r2hc-%D%(%p%))", self->n, self->cld);
}
/* Structural applicability: either a rank-1 transform without a vector
   loop, or a rank-0 problem whose vector rank is finite. */
static int applicable0(const problem *p_)
{
     const problem_dft *p = (const problem_dft *) p_;

     if (p->sz->rnk == 1 && p->vecsz->rnk == 0)
          return 1;

     return (p->sz->rnk == 0 && FINITE_RNK(p->vecsz->rnk));
}
/* Nonzero when the distance between r and i is at least n * |s|. */
static int splitp(R *r, R *i, INT n, INT s)
{
     const INT extent = n * (s > 0 ? s : 0-s);

     if (r > i)
          return (r - i) >= extent;
     return (i - r) >= extent;
}
/* Applicability including planner policy: rank-0 problems and split
   (non-interleaved) rank-1 arrays are always accepted; anything else
   only when the planner does not have NO_DFT_R2HC set. */
static int applicable(const problem *p_, const planner *plnr)
{
     const problem_dft *p;

     if (!applicable0(p_))
          return 0;

     p = (const problem_dft *) p_;

     /* rank-0 problems are always OK */
     if (p->sz->rnk == 0)
          return 1;

     /* this solver is ok for split arrays */
     if (p->sz->rnk == 1
         && splitp(p->ri, p->ii, p->sz->dims[0].n, p->sz->dims[0].is)
         && splitp(p->ro, p->io, p->sz->dims[0].n, p->sz->dims[0].os))
          return 1;

     return !(NO_DFT_R2HCP(plnr));
}
/* Create a dft plan that runs one R2HC child over a vector of two
   arrays (real part and imaginary part) and post-processes the result in
   apply().
   Returns a null plan pointer if the solver is not applicable or the
   child plan cannot be created. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
P *pln;
const problem_dft *p;
plan *cld;
INT ishift = 0, oshift = 0;
static const plan_adt padt = {
X(dft_solve), awake, print, destroy
};
UNUSED(ego_);
if (!applicable(p_, plnr))
return (plan *)0;
p = (const problem_dft *) p_;
{
/* length-2 vector dimension stepping from the real to the imaginary
   array, prepended to the problem's own vector loop */
tensor *ri_vec = X(mktensor_1d)(2, p->ii - p->ri, p->io - p->ro);
tensor *cld_vec = X(tensor_append)(ri_vec, p->vecsz);
int i;
for (i = 0; i < cld_vec->rnk; ++i) { /* make all istrides > 0 */
if (cld_vec->dims[i].is < 0) {
INT nm1 = cld_vec->dims[i].n - 1;
/* both strides of the dimension are negated in place (`*= -1`) and
   the pointer shifts are adjusted by (n-1) times the new stride */
ishift -= nm1 * (cld_vec->dims[i].is *= -1);
oshift -= nm1 * (cld_vec->dims[i].os *= -1);
}
}
cld = X(mkplan_d)(plnr,
X(mkproblem_rdft_1)(p->sz, cld_vec,
p->ri + ishift,
p->ro + oshift, R2HC));
X(tensor_destroy2)(ri_vec, cld_vec);
}
if (!cld) return (plan *)0;
pln = MKPLAN_DFT(P, &padt, apply);
if (p->sz->rnk == 0) {
/* rank-0: apply() skips post-processing because n == 1 */
pln->n = 1;
pln->os = 0;
}
else {
pln->n = p->sz->dims[0].n;
pln->os = p->sz->dims[0].os;
}
pln->ishift = ishift;
pln->oshift = oshift;
pln->cld = cld;
/* cost: child ops plus the post-processing loop of (n-1)/2 iterations */
pln->super.super.ops = cld->ops;
pln->super.super.ops.other += 8 * ((pln->n - 1)/2);
pln->super.super.ops.add += 4 * ((pln->n - 1)/2);
pln->super.super.ops.other += 1; /* estimator hack for nop plans */
return &(pln->super.super);
}
/* constructor: allocate the (stateless) solver for PROBLEM_DFT */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_DFT, mkplan, 0 };
     S *result = MKSOLVER(S, &sadt);

     return &(result->super);
}
/* Register the dft-r2hc solver with planner p. */
void X(dft_r2hc_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}

144
fftw-3.3.10/rdft/dht-r2hc.c Normal file
View File

@@ -0,0 +1,144 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Solve a DHT problem (Discrete Hartley Transform) via post-processing
of an R2HC problem. */
#include "rdft/rdft.h"
/* Solver: carries no state beyond the generic solver. */
typedef struct {
solver super;
} S;
/* Plan: one child R2HC plan plus the output layout used for
   post-processing. */
typedef struct {
plan_rdft super;
plan *cld; /* child R2HC plan */
INT os; /* output stride */
INT n; /* transform size */
} P;
/* Run the child R2HC plan from I to O, then replace each pair
   (O[i], O[n-i]), for 1 <= i < n - i, by its sum and difference; which
   element receives which depends on FFT_SIGN. */
static void apply(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     plan_rdft *child = (plan_rdft *) self->cld;
     const INT os = self->os;
     const INT n = self->n;
     INT k;

     child->apply((plan *) child, I, O);

     for (k = 1; k < n - k; ++k) {
          const INT lo = os * k;
          const INT hi = os * (n - k);
          E x, y;

          x = O[lo];
          y = O[hi];
#if FFT_SIGN == -1
          O[lo] = x - y;
          O[hi] = x + y;
#else
          O[lo] = x + y;
          O[hi] = x - y;
#endif
     }
}
/* Forward the wakefulness change to the child plan. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld, wakefulness);
}
/* Destroy the child plan. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cld);
}
/* Print the plan name with its size, followed by the child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;

     p->print(p, "(dht-r2hc-%D%(%p%))", self->n, self->cld);
}
/* Structural applicability: a rank-1 DHT without a vector loop, and
   only while the planner does not have NO_DHT_R2HC set. */
static int applicable0(const problem *p_, const planner *plnr)
{
     const problem_rdft *p = (const problem_rdft *) p_;

     if (NO_DHT_R2HCP(plnr))
          return 0;
     if (p->sz->rnk != 1 || p->vecsz->rnk != 0)
          return 0;

     return p->kind[0] == DHT;
}
/* Applicability including planner policy: the solver is rejected
   whenever the planner has NO_SLOW set. */
static int applicable(const solver *ego, const problem *p, const planner *plnr)
{
     UNUSED(ego);

     if (NO_SLOWP(plnr))
          return 0;

     return applicable0(p, plnr);
}
/* Create a DHT plan consisting of an R2HC child on the same arrays plus
   the post-processing loop in apply().  Returns a null plan pointer if
   the solver is not applicable or the child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };
     const problem_rdft *p = (const problem_rdft *) p_;
     plan *child;
     P *pln;
     INT npairs;

     if (!applicable(ego_, p_, plnr))
          return (plan *)0;

     /* NO_DHT_R2HC stops infinite loops with rdft-dht.c */
     child = X(mkplan_f_d)(plnr,
                           X(mkproblem_rdft_1)(p->sz, p->vecsz,
                                               p->I, p->O, R2HC),
                           NO_DHT_R2HC, 0, 0);
     if (!child)
          return (plan *)0;

     pln = MKPLAN_RDFT(P, &padt, apply);
     pln->n = p->sz->dims[0].n;
     pln->os = p->sz->dims[0].os;
     pln->cld = child;

     /* cost: child ops plus (n-1)/2 post-processing pairs */
     npairs = (pln->n - 1)/2;
     pln->super.super.ops = child->ops;
     pln->super.super.ops.other += 4 * npairs;
     pln->super.super.ops.add += 2 * npairs;

     return &(pln->super.super);
}
/* constructor: allocate the (stateless) solver for PROBLEM_RDFT */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *result = MKSOLVER(S, &sadt);

     return &(result->super);
}
/* Register the dht-r2hc solver with planner p. */
void X(dht_r2hc_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}

View File

@@ -0,0 +1,386 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/*
* Compute DHTs of prime sizes using Rader's trick: turn them
* into convolutions of size n - 1, which we then perform via a pair
* of FFTs. (We can then do prime real FFTs via rdft-dht.c.)
*
* Optionally (determined by the "pad" field of the solver), we can
* perform the (cyclic) convolution by zero-padding to a size
* >= 2*(n-1) - 1. This is advantageous if n-1 has large prime factors.
*
*/
/* Solver: `pad` selects the zero-padded convolution variant. */
typedef struct {
solver super;
int pad; /* nonzero: zero-pad the convolution (see mkplan) */
} S;
/* Plan state for one prime-size DHT. */
typedef struct {
plan_rdft super;
plan *cld1, *cld2; /* in-place transforms of the work buffer: forward, then "inverse" */
R *omega; /* transformed convolution kernel; null while the plan is asleep */
INT n, npad, g, ginv; /* prime size, convolution length, generator mod n and its inverse */
INT is, os; /* input and output strides */
plan *cld_omega; /* plan used by mkomega() to transform the kernel */
} P;
/* File-wide list of omega arrays, looked up, extended and released via
   the X(rader_tl_*) helpers in mkomega() and free_omega(). */
static rader_tl *omegas = 0;
/***************************************************************************/
/* If R2HC_ONLY_CONV is 1, we use a trick to perform the convolution
purely in terms of R2HC transforms, as opposed to R2HC followed by HC2R.
This requires a few more operations, but allows us to share the same
plan/codelets for both Rader children. */
#define R2HC_ONLY_CONV 1
/* Compute one DHT of prime size n from I (stride is) into O (stride os).
   Steps: permute the input by powers of the generator g into a work
   buffer (zero-padded to npad), transform the buffer in place with cld1,
   multiply pointwise by omega in halfcomplex layout, add I[0] to the DC
   term, transform again with cld2, and scatter the result to O by powers
   of ginv.  The work buffer is allocated and freed on every call. */
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
INT n = ego->n; /* prime */
INT npad = ego->npad; /* == n - 1 for unpadded Rader; always even */
INT is = ego->is, os;
INT k, gpower, g;
R *buf, *omega;
R r0;
buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);
/* First, permute the input, storing in buf: */
g = ego->g;
for (gpower = 1, k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
buf[k] = I[gpower * is];
}
/* gpower == g^(n-1) mod n == 1 */;
A(n - 1 <= npad);
for (k = n - 1; k < npad; ++k) /* optionally, zero-pad convolution */
buf[k] = 0;
os = ego->os;
/* compute RDFT of buf, storing in buf (i.e., in-place): */
{
plan_rdft *cld = (plan_rdft *) ego->cld1;
cld->apply((plan *) cld, buf, buf);
}
/* set output DC component: */
O[0] = (r0 = I[0]) + buf[0];
/* now, multiply by omega: */
omega = ego->omega;
buf[0] *= omega[0];
/* complex product of the pairs (k, npad - k): real parts at index k,
   imaginary parts at index npad - k */
for (k = 1; k < npad/2; ++k) {
E rB, iB, rW, iW, a, b;
rW = omega[k];
iW = omega[npad - k];
rB = buf[k];
iB = buf[npad - k];
a = rW * rB - iW * iB;
b = rW * iB + iW * rB;
#if R2HC_ONLY_CONV
buf[k] = a + b;
buf[npad - k] = a - b;
#else
buf[k] = a;
buf[npad - k] = b;
#endif
}
/* Nyquist component: */
A(k + k == npad); /* since npad is even */
buf[k] *= omega[k];
/* this will add input[0] to all of the outputs after the ifft */
buf[0] += r0;
/* inverse FFT: */
{
plan_rdft *cld = (plan_rdft *) ego->cld2;
cld->apply((plan *) cld, buf, buf);
}
/* do inverse permutation to unshuffle the output: */
A(gpower == 1);
#if R2HC_ONLY_CONV
/* cld2 is also an R2HC transform here, so each output is recombined
   from the pair buf[k], buf[npad - k] */
O[os] = buf[0];
gpower = g = ego->ginv;
A(npad == n - 1 || npad/2 >= n - 1);
if (npad == n - 1) {
for (k = 1; k < npad/2; ++k, gpower = MULMOD(gpower, g, n)) {
O[gpower * os] = buf[k] + buf[npad - k];
}
O[gpower * os] = buf[k];
++k, gpower = MULMOD(gpower, g, n);
for (; k < npad; ++k, gpower = MULMOD(gpower, g, n)) {
O[gpower * os] = buf[npad - k] - buf[k];
}
}
else {
for (k = 1; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
O[gpower * os] = buf[k] + buf[npad - k];
}
}
#else
g = ego->ginv;
for (k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
O[gpower * os] = buf[k];
}
#endif
A(gpower == 1);
X(ifree)(buf);
}
/* Return the transformed convolution kernel for (n, npad, ginv).
   The `omegas` list is consulted first under the key (n, npad + 1, ginv);
   on a miss the array is built, transformed in place with plan p_, and
   inserted into the list under the same key. */
static R *mkomega(enum wakefulness wakefulness,
plan *p_, INT n, INT npad, INT ginv)
{
plan_rdft *p = (plan_rdft *) p_;
R *omega;
INT i, gpower;
trigreal scale;
triggen *t;
if ((omega = X(rader_tl_find)(n, npad + 1, ginv, omegas)))
return omega;
omega = (R *)MALLOC(sizeof(R) * npad, TWIDDLES);
scale = npad; /* normalization for convolution */
t = X(mktriggen)(wakefulness, n);
/* omega[i] = (w[0] + w[1]) / scale, with w produced by cexpl() for
   index ginv^i mod n */
for (i = 0, gpower = 1; i < n-1; ++i, gpower = MULMOD(gpower, ginv, n)) {
trigreal w[2];
t->cexpl(t, gpower, w);
omega[i] = (w[0] + w[1]) / scale;
}
X(triggen_destroy)(t);
A(gpower == 1);
A(npad == n - 1 || npad >= 2*(n - 1) - 1);
/* zero the padding, if any */
for (; i < npad; ++i)
omega[i] = K(0.0);
/* padded case: copy entries 1 .. n-2 to the tail of the array as well */
if (npad > n - 1)
for (i = 1; i < n-1; ++i)
omega[npad - i] = omega[n - 1 - i];
p->apply(p_, omega, omega);
X(rader_tl_insert)(n, npad + 1, ginv, omega, &omegas);
return omega;
}
/* Hand an omega array obtained from mkomega() back to the `omegas` list
   via X(rader_tl_delete). */
static void free_omega(R *omega)
{
X(rader_tl_delete)(omega, &omegas);
}
/***************************************************************************/
/* Forward the wakefulness change to the three child plans.  Going to
   sleep releases omega; waking up recomputes the generator, its inverse
   and omega. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld1, wakefulness);
     X(plan_awake)(self->cld2, wakefulness);
     X(plan_awake)(self->cld_omega, wakefulness);

     if (wakefulness == SLEEPY) {
          free_omega(self->omega);
          self->omega = 0;
          return;
     }

     self->g = X(find_generator)(self->n);
     self->ginv = X(power_mod)(self->g, self->n - 2, self->n);
     A(MULMOD(self->g, self->ginv, self->n) == 1);

     A(!self->omega);
     self->omega = mkomega(wakefulness,
                           self->cld_omega, self->n, self->npad, self->ginv);
}
/* Destroy the three child plans (omega plan first). */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cld_omega);
     X(plan_destroy_internal)(self->cld2);
     X(plan_destroy_internal)(self->cld1);
}
/* Print the plan header and cld1; cld2 and cld_omega are printed only
   when they differ from the plans already shown. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const int show_cld2 = (self->cld2 != self->cld1);
     const int show_omega = (self->cld_omega != self->cld1
                             && self->cld_omega != self->cld2);

     p->print(p, "(dht-rader-%D/%D%ois=%oos=%(%p%)",
              self->n, self->npad, self->is, self->os, self->cld1);
     if (show_cld2)
          p->print(p, "%(%p%)", self->cld2);
     if (show_omega)
          p->print(p, "%(%p%)", self->cld_omega);
     p->putchr(p, ')');
}
static int applicable(const solver *ego, const problem *p_, const planner *plnr)
{
const problem_rdft *p = (const problem_rdft *) p_;
UNUSED(ego);
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
&& p->kind[0] == DHT
&& X(is_prime)(p->sz->dims[0].n)
&& p->sz->dims[0].n > 2
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
/* proclaim the solver SLOW if p-1 is not easily
factorizable. Unlike in the complex case where
Bluestein can solve the problem, in the DHT case we
may have no other choice */
&& CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
);
}
/* Smallest even size >= minsz whose only prime factors are 2, 3 and 5. */
static INT choose_transform_size(INT minsz)
{
     static const INT primes[] = { 2, 3, 5, 0 };

     for (;;) {
          if (X(factors_into)(minsz, primes) && !(minsz % 2))
               return minsz;
          ++minsz;
     }
}
/* Create a Rader DHT plan for prime size n.
   Three in-place child plans of size npad are planned on a temporary
   buffer: cld1 (R2HC), cld2 (R2HC or HC2R depending on R2HC_ONLY_CONV)
   and cld_omega (R2HC, planned with the ESTIMATE flag).  npad is n - 1,
   or, for the padded solver, the size chosen by choose_transform_size().
   Returns a null plan pointer if the solver is not applicable or any
   child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_rdft *p = (const problem_rdft *) p_;
P *pln;
INT n, npad;
INT is, os;
plan *cld1 = (plan *) 0;
plan *cld2 = (plan *) 0;
plan *cld_omega = (plan *) 0;
R *buf = (R *) 0;
problem *cldp;
static const plan_adt padt = {
X(rdft_solve), awake, print, destroy
};
if (!applicable(ego_, p_, plnr))
return (plan *) 0;
n = p->sz->dims[0].n;
is = p->sz->dims[0].is;
os = p->sz->dims[0].os;
if (ego->pad)
npad = choose_transform_size(2 * (n - 1) - 1);
else
npad = n - 1;
/* initial allocation for the purpose of planning */
buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);
cld1 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_1_d)(X(mktensor_1d)(npad, 1, 1),
X(mktensor_1d)(1, 0, 0),
buf, buf,
R2HC),
NO_SLOW, 0, 0);
if (!cld1) goto nada;
cldp =
X(mkproblem_rdft_1_d)(
X(mktensor_1d)(npad, 1, 1),
X(mktensor_1d)(1, 0, 0),
buf, buf,
#if R2HC_ONLY_CONV
R2HC
#else
HC2R
#endif
);
if (!(cld2 = X(mkplan_f_d)(plnr, cldp, NO_SLOW, 0, 0)))
goto nada;
/* plan for omega */
cld_omega = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_1_d)(
X(mktensor_1d)(npad, 1, 1),
X(mktensor_1d)(1, 0, 0),
buf, buf, R2HC),
NO_SLOW, ESTIMATE, 0);
if (!cld_omega) goto nada;
/* deallocate buffers; let awake() or apply() allocate them for real */
X(ifree)(buf);
buf = 0;
pln = MKPLAN_RDFT(P, &padt, apply);
pln->cld1 = cld1;
pln->cld2 = cld2;
pln->cld_omega = cld_omega;
/* omega, g and ginv are filled in by awake() */
pln->omega = 0;
pln->n = n;
pln->npad = npad;
pln->is = is;
pln->os = os;
/* cost: both transforms plus the permutation and pointwise-multiply
   loops of apply(); cld_omega is not counted */
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
pln->super.super.ops.other += (npad/2-1)*6 + npad + n + (n-1) * ego->pad;
pln->super.super.ops.add += (npad/2-1)*2 + 2 + (n-1) * ego->pad;
pln->super.super.ops.mul += (npad/2-1)*4 + 2 + ego->pad;
#if R2HC_ONLY_CONV
pln->super.super.ops.other += n-2 - ego->pad;
pln->super.super.ops.add += (npad/2-1)*2 + (n-2) - ego->pad;
#endif
return &(pln->super.super);
nada:
/* failure path: buf and the child plans may still be null here */
X(ifree0)(buf);
X(plan_destroy_internal)(cld_omega);
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cld1);
return 0;
}
/* constructors */
/* Allocate a Rader DHT solver; `pad` selects the zero-padded variant. */
static solver *mksolver(int pad)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *result = MKSOLVER(S, &sadt);

     result->pad = pad;
     return &(result->super);
}
/* Register the unpadded solver first, then the padded one. */
void X(dht_rader_register)(planner *p)
{
     int pad;

     for (pad = 0; pad < 2; ++pad)
          REGISTER_SOLVER(p, mksolver(pad));
}

View File

@@ -0,0 +1,341 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* direct RDFT solver, using r2c codelets */
#include "rdft/rdft.h"
/* Solver: one r2c codelet, in its buffered or unbuffered flavour. */
typedef struct {
solver super;
const kr2c_desc *desc; /* codelet descriptor (size n, genus, name) */
kr2c k; /* the codelet itself */
int bufferedp; /* nonzero for the buffered variant */
} S;
/* Plan state. */
typedef struct {
plan_rdft super;
stride rs, csr, csi; /* strides passed to the codelet when it works on user arrays */
stride brs, bcsr, bcsi; /* strides passed to the codelet when it works on the batch buffer */
INT n, vl, rs0, ivs, ovs, ioffset, bioffset; /* size, vector length, offsets and vector strides used by the apply routines */
kr2c k; /* codelet to call */
const S *slv; /* owning solver, consulted by print() */
} P;
/*************************************************************
Nonbuffered code
*************************************************************/
/* Unbuffered R2HC: call the codelet with I as the real side and O as
   the halfcomplex side. */
static void apply_r2hc(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     ASSERT_ALIGNED_DOUBLE;

     self->k(I, I + self->rs0,
             O, O + self->ioffset,
             self->rs, self->csr, self->csi,
             self->vl, self->ivs, self->ovs);
}
/* Unbuffered HC2R: call the codelet with O as the real side and I as
   the halfcomplex side. */
static void apply_hc2r(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     ASSERT_ALIGNED_DOUBLE;

     self->k(O, O + self->rs0,
             I, I + self->ioffset,
             self->rs, self->csr, self->csi,
             self->vl, self->ivs, self->ovs);
}
/*************************************************************
Buffered code
*************************************************************/
/* should not be 2^k to avoid associativity conflicts */
/* Batch size for the buffered variant: radix rounded up to a multiple
   of 4, plus 2. */
static INT compute_batchsize(INT radix)
{
     /* round up to multiple of 4 */
     INT rounded = radix + 3;
     rounded &= -4;

     return rounded + 2;
}
/* Buffered R2HC for one batch of batchsz transforms.
   The input is first copied into buf.  If the output's complex stride is
   smaller in magnitude than its vector stride, the codelet writes
   straight into O; otherwise it transforms within buf and the result is
   copied out to O.
   WS(ego->bcsr, 1) is reused ("hack") as the element spacing inside
   buf. */
static void dobatch_r2hc(const P *ego, R *I, R *O, R *buf, INT batchsz)
{
X(cpy2d_ci)(I, buf,
ego->n, ego->rs0, WS(ego->bcsr /* hack */, 1),
batchsz, ego->ivs, 1, 1);
if (IABS(WS(ego->csr, 1)) < IABS(ego->ovs)) {
/* transform directly to output */
ego->k(buf, buf + WS(ego->bcsr /* hack */, 1),
O, O + ego->ioffset,
ego->brs, ego->csr, ego->csi,
batchsz, 1, ego->ovs);
} else {
/* transform to buffer and copy back */
ego->k(buf, buf + WS(ego->bcsr /* hack */, 1),
buf, buf + ego->bioffset,
ego->brs, ego->bcsr, ego->bcsi,
batchsz, 1, 1);
X(cpy2d_co)(buf, O,
ego->n, WS(ego->bcsr, 1), WS(ego->csr, 1),
batchsz, 1, ego->ovs, 1);
}
}
/* Buffered HC2R for one batch of batchsz transforms.
   If the input's complex stride is smaller in magnitude than its vector
   stride, the codelet reads straight from I and writes the real result
   into buf; otherwise the input is copied into buf and transformed in
   place.  In both cases the real result is then copied from buf to O.
   WS(ego->bcsr, 1) is reused ("hack") as the element spacing inside
   buf. */
static void dobatch_hc2r(const P *ego, R *I, R *O, R *buf, INT batchsz)
{
if (IABS(WS(ego->csr, 1)) < IABS(ego->ivs)) {
/* transform directly from input */
ego->k(buf, buf + WS(ego->bcsr /* hack */, 1),
I, I + ego->ioffset,
ego->brs, ego->csr, ego->csi,
batchsz, ego->ivs, 1);
} else {
/* copy into buffer and transform in place */
X(cpy2d_ci)(I, buf,
ego->n, WS(ego->csr, 1), WS(ego->bcsr, 1),
batchsz, ego->ivs, 1, 1);
ego->k(buf, buf + WS(ego->bcsr /* hack */, 1),
buf, buf + ego->bioffset,
ego->brs, ego->bcsr, ego->bcsi,
batchsz, 1, 1);
}
X(cpy2d_co)(buf, O,
ego->n, WS(ego->bcsr /* hack */, 1), ego->rs0,
batchsz, 1, ego->ovs, 1);
}
/* Process the vl transforms in batches: full batches of
   compute_batchsize(n) while more than one batch remains, then one final
   call for the rest.  The batch buffer lives only for the duration of
   this call. */
static void iterate(const P *ego, R *I, R *O,
                    void (*dobatch)(const P *ego, R *I, R *O,
                                    R *buf, INT batchsz))
{
     R *buf;
     INT vl = ego->vl;
     INT n = ego->n;
     INT done;
     INT batchsz = compute_batchsize(n);
     size_t bufsz = n * batchsz * sizeof(R);

     BUF_ALLOC(R *, buf, bufsz);

     done = 0;
     while (done < vl - batchsz) {
          dobatch(ego, I, O, buf, batchsz);
          I += batchsz * ego->ivs;
          O += batchsz * ego->ovs;
          done += batchsz;
     }
     /* remainder (between 1 and batchsz transforms when vl >= 1) */
     dobatch(ego, I, O, buf, vl - done);

     BUF_FREE(buf, bufsz);
}
/* Buffered R2HC entry point: batch loop with dobatch_r2hc. */
static void apply_buf_r2hc(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;

     iterate(self, I, O, dobatch_r2hc);
}
/* buffered apply routine for HC2R-type plans: batch through dobatch_hc2r */
static void apply_buf_hc2r(const plan *ego_, R *I, R *O)
{
     const P *pln = (const P *) ego_;

     iterate(pln, I, O, dobatch_hc2r);
}
/* release every stride object that mkplan created for this plan */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     /* strides for the user arrays */
     X(stride_destroy)(pln->rs);
     X(stride_destroy)(pln->csr);
     X(stride_destroy)(pln->csi);

     /* strides for the batch buffer */
     X(stride_destroy)(pln->brs);
     X(stride_destroy)(pln->bcsr);
     X(stride_destroy)(pln->bcsi);
}
/*
 * Print a description of the plan.  The buffered variant additionally
 * reports WS(bcsr, 1), which mkplan sets from the batch size.
 */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const S *slv = pln->slv;

     if (!slv->bufferedp) {
          p->print(p, "(rdft-%s-direct-r2c-%D%v \"%s\")",
                   X(rdft_kind_str)(slv->desc->genus->kind), pln->n,
                   pln->vl, slv->desc->nam);
          return;
     }

     p->print(p, "(rdft-%s-directbuf/%D-r2c-%D%v \"%s\")",
              X(rdft_kind_str)(slv->desc->genus->kind),
              /* hack */ WS(pln->bcsr, 1), pln->n,
              pln->vl, slv->desc->nam);
}
/*
 * Offset passed to the codelet for the second complex-side pointer:
 * s * sz for the plain R2HC/HC2R kinds, s * (sz - 1) for all others.
 */
static INT ioffset(rdft_kind kind, INT sz, INT s)
{
     if (kind == R2HC || kind == HC2R)
          return s * sz;
     return s * (sz - 1);
}
/*
 * Applicability test for the unbuffered solver.  Returns nonzero iff
 * the problem is a rank-1 transform of the codelet's size and kind
 * with at most one vector dimension, and it is either out-of-place, a
 * single transform, or in-place with matching strides.
 */
static int applicable(const solver *ego_, const problem *p_)
{
     const S *ego = (const S *) ego_;
     const kr2c_desc *desc = ego->desc;
     const problem_rdft *p = (const problem_rdft *) p_;
     INT vl, ivs, ovs;

     if (p->sz->rnk != 1 || p->vecsz->rnk > 1)
          return 0;
     if (p->sz->dims[0].n != desc->n)
          return 0;
     if (p->kind[0] != desc->genus->kind)
          return 0;

     /* check strides etc */
     if (!X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs))
          return 0;

     return (/* can operate out-of-place */
             p->I != p->O
             /* computing one transform */
             || vl == 1
             /* can operate in-place as long as strides are the same */
             || X(tensor_inplace_strides2)(p->sz, p->vecsz));
}
/*
 * Applicability test for the buffered solver.  Same shape checks as
 * the unbuffered test, but an in-place problem with mismatched strides
 * is still accepted when all vl transforms fit in one batch.
 */
static int applicable_buf(const solver *ego_, const problem *p_)
{
     const S *ego = (const S *) ego_;
     const kr2c_desc *desc = ego->desc;
     const problem_rdft *p = (const problem_rdft *) p_;
     INT vl, ivs, ovs, batchsz;

     if (p->sz->rnk != 1 || p->vecsz->rnk > 1)
          return 0;
     if (p->sz->dims[0].n != desc->n)
          return 0;
     if (p->kind[0] != desc->genus->kind)
          return 0;

     /* check strides etc */
     if (!X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs))
          return 0;

     batchsz = compute_batchsize(desc->n);

     return (/* can operate out-of-place */
             p->I != p->O
             /* can operate in-place as long as strides are the same */
             || X(tensor_inplace_strides2)(p->sz, p->vecsz)
             /* can do it if the problem fits in the buffer, no matter
                what the strides are */
             || vl <= batchsz);
}
/*
 * Create a plan for problem p_ with this solver, or return a null plan
 * if the relevant applicability test (buffered or unbuffered) fails.
 * The planner argument is unused.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
P *pln;
const problem_rdft *p;
iodim *d;
INT rs, cs, b, n;
static const plan_adt padt = {
X(rdft_solve), X(null_awake), print, destroy
};
UNUSED(plnr);
if (ego->bufferedp) {
if (!applicable_buf(ego_, p_))
return (plan *)0;
} else {
if (!applicable(ego_, p_))
return (plan *)0;
}
p = (const problem_rdft *) p_;
/* R2HC kinds: rs comes from the input stride and cs from the output
   stride; all other kinds use the reverse assignment */
if (R2HC_KINDP(p->kind[0])) {
rs = p->sz->dims[0].is; cs = p->sz->dims[0].os;
pln = MKPLAN_RDFT(P, &padt,
ego->bufferedp ? apply_buf_r2hc : apply_r2hc);
} else {
rs = p->sz->dims[0].os; cs = p->sz->dims[0].is;
pln = MKPLAN_RDFT(P, &padt,
ego->bufferedp ? apply_buf_hc2r : apply_hc2r);
}
d = p->sz->dims;
n = d[0].n;
pln->k = ego->k;
pln->n = n;
/* strides and offset for the user arrays */
pln->rs0 = rs;
pln->rs = X(mkstride)(n, 2 * rs);
pln->csr = X(mkstride)(n, cs);
pln->csi = X(mkstride)(n, -cs);
pln->ioffset = ioffset(p->kind[0], n, cs);
/* strides and offset for the batch buffer, derived from the batch size b */
b = compute_batchsize(n);
pln->brs = X(mkstride)(n, 2 * b);
pln->bcsr = X(mkstride)(n, b);
pln->bcsi = X(mkstride)(n, -b);
pln->bioffset = ioffset(p->kind[0], n, b);
X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
pln->slv = ego;
/* cost estimate: codelet ops scaled by vl / genus->vl */
X(ops_zero)(&pln->super.super.ops);
X(ops_madd2)(pln->vl / ego->desc->genus->vl,
&ego->desc->ops,
&pln->super.super.ops);
/* the buffered variant is charged 2 * n * vl additional "other" ops */
if (ego->bufferedp)
pln->super.super.ops.other += 2 * n * pln->vl;
pln->super.super.could_prune_now_p = !ego->bufferedp;
return &(pln->super.super);
}
/*
 * constructor: allocate a solver wrapping codelet k and its descriptor;
 * bufferedp selects the buffered variant
 */
static solver *mksolver(kr2c k, const kr2c_desc *desc, int bufferedp)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->bufferedp = bufferedp;
     self->desc = desc;
     self->k = k;
     return &(self->super);
}
/* create the unbuffered direct r2c solver for codelet k */
solver *X(mksolver_rdft_r2c_direct)(kr2c k, const kr2c_desc *desc)
{
     return mksolver(k, desc, /* bufferedp */ 0);
}
/* create the buffered direct r2c solver for codelet k */
solver *X(mksolver_rdft_r2c_directbuf)(kr2c k, const kr2c_desc *desc)
{
     return mksolver(k, desc, /* bufferedp */ 1);
}

View File

@@ -0,0 +1,145 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* direct RDFT solver, using r2r codelets */
#include "rdft/rdft.h"
/* solver: wraps one r2r codelet together with its descriptor */
typedef struct {
solver super;
const kr2r_desc *desc; /* codelet descriptor (n, kind, nam, ops, genus are read from it) */
kr2r k; /* the codelet itself */
} S;
/* plan: everything apply() needs to invoke the codelet */
typedef struct {
plan_rdft super;
INT vl, ivs, ovs; /* filled in by tensor_tornk1 from the problem's vecsz */
stride is, os; /* built by mkstride from the transform dimension's is/os */
kr2r k; /* codelet, copied from the solver */
const S *slv; /* owning solver, used by print() */
} P;
/* execute the plan: a single codelet call covers all vl transforms */
static void apply(const plan *ego_, R *I, R *O)
{
     const P *pln = (const P *) ego_;
     ASSERT_ALIGNED_DOUBLE;

     pln->k(I, O,
            pln->is, pln->os,
            pln->vl, pln->ivs, pln->ovs);
}
/* release the two stride objects created by mkplan */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(stride_destroy)(pln->is);
     X(stride_destroy)(pln->os);
}
/* print the plan: transform kind, codelet size, vector length, codelet name */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const kr2r_desc *d = pln->slv->desc;

     p->print(p, "(rdft-%s-direct-r2r-%D%v \"%s\")",
              X(rdft_kind_str)(d->kind), d->n,
              pln->vl, d->nam);
}
/*
 * Returns nonzero iff the problem is a rank-1 transform of the
 * codelet's size and kind with at most one vector dimension, and it is
 * either out-of-place, a single transform, or in-place with matching
 * strides.
 */
static int applicable(const solver *ego_, const problem *p_)
{
     const S *ego = (const S *) ego_;
     const problem_rdft *p = (const problem_rdft *) p_;
     INT vl;
     INT ivs, ovs;

     if (p->sz->rnk != 1 || p->vecsz->rnk > 1)
          return 0;
     if (p->sz->dims[0].n != ego->desc->n)
          return 0;
     if (p->kind[0] != ego->desc->kind)
          return 0;

     /* check strides etc */
     if (!X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs))
          return 0;

     return (/* can operate out-of-place */
             p->I != p->O
             /* computing one transform */
             || vl == 1
             /* can operate in-place as long as strides are the same */
             || X(tensor_inplace_strides2)(p->sz, p->vecsz));
}
/*
 * Create a plan for problem p_ with this solver's codelet, or return a
 * null plan if the solver is not applicable.  The planner argument is
 * unused.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
P *pln;
const problem_rdft *p;
iodim *d;
static const plan_adt padt = {
X(rdft_solve), X(null_awake), print, destroy
};
UNUSED(plnr);
if (!applicable(ego_, p_))
return (plan *)0;
p = (const problem_rdft *) p_;
pln = MKPLAN_RDFT(P, &padt, apply);
d = p->sz->dims;
pln->k = ego->k;
/* strides of the single transform dimension */
pln->is = X(mkstride)(d->n, d->is);
pln->os = X(mkstride)(d->n, d->os);
/* vector loop parameters */
X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
pln->slv = ego;
/* cost estimate: codelet ops scaled by vl / genus->vl */
X(ops_zero)(&pln->super.super.ops);
X(ops_madd2)(pln->vl / ego->desc->genus->vl,
&ego->desc->ops,
&pln->super.super.ops);
pln->super.super.could_prune_now_p = 1;
return &(pln->super.super);
}
/* constructor: allocate a direct r2r solver for codelet k and its descriptor */
solver *X(mksolver_rdft_r2r_direct)(kr2r k, const kr2r_desc *desc)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->desc = desc;
     self->k = k;
     return &(self->super);
}

171
fftw-3.3.10/rdft/direct2.c Normal file
View File

@@ -0,0 +1,171 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* direct RDFT2 R2HC/HC2R solver, if we have a codelet */
#include "rdft/rdft.h"
/* solver: wraps one r2c codelet together with its descriptor */
typedef struct {
solver super;
const kr2c_desc *desc; /* codelet descriptor (n, nam, ops, genus are read from it) */
kr2c k; /* the codelet itself */
} S;
/* plan: everything the apply routines need to invoke the codelet */
typedef struct {
plan_rdft2 super;
stride rs, cs; /* built by mkstride in mkplan; cs is passed twice to the codelet */
INT vl; /* vector loop parameters, filled in by tensor_tornk1 */
INT ivs, ovs;
kr2c k; /* codelet, copied from the solver */
const S *slv; /* owning solver, used by print() */
INT ilast; /* (n/2) * os for even n, 0 for odd n; apply_r2hc zeroes ci[ilast] */
} P;
/* execute the plan: a single codelet call covers all vl transforms */
static void apply(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *pln = (const P *) ego_;
     ASSERT_ALIGNED_DOUBLE;

     pln->k(r0, r1, cr, ci,
            pln->rs, pln->cs, pln->cs,
            pln->vl, pln->ivs, pln->ovs);
}
/*
 * Execute an R2HC plan: run the codelet over all vl transforms, then
 * store zero into ci[0] and ci[ilast] of every output vector (ilast is
 * the Nyquist-frequency offset, if any, and 0 otherwise).
 */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     INT vl = ego->vl, ovs = ego->ovs;
     INT ilast = ego->ilast;
     INT left;
     ASSERT_ALIGNED_DOUBLE;

     ego->k(r0, r1, cr, ci,
            ego->rs, ego->cs, ego->cs,
            vl, ego->ivs, ovs);

     for (left = vl; left > 0; --left) {
          ci[ilast] = 0;
          ci[0] = 0;
          ci += ovs;
     }
}
/* release the two stride objects created by mkplan */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(stride_destroy)(pln->rs);
     X(stride_destroy)(pln->cs);
}
/* print the plan: transform kind, codelet size, vector length, codelet name */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const kr2c_desc *d = pln->slv->desc;

     p->print(p, "(rdft2-%s-direct-%D%v \"%s\")",
              X(rdft_kind_str)(d->genus->kind), d->n,
              pln->vl, d->nam);
}
/*
 * Returns nonzero iff the rdft2 problem is a rank-1 transform of the
 * codelet's size and kind with at most one vector dimension, and it is
 * either out-of-place, a single transform, or in-place with matching
 * strides.
 */
static int applicable(const solver *ego_, const problem *p_)
{
     const S *ego = (const S *) ego_;
     const kr2c_desc *desc = ego->desc;
     const problem_rdft2 *p = (const problem_rdft2 *) p_;
     INT vl;
     INT ivs, ovs;

     if (p->sz->rnk != 1 || p->vecsz->rnk > 1)
          return 0;
     if (p->sz->dims[0].n != desc->n)
          return 0;
     if (p->kind != desc->genus->kind)
          return 0;

     /* check strides etc */
     if (!X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs))
          return 0;

     return (/* can operate out-of-place */
             p->r0 != p->cr
             /*
              * can compute one transform in-place, no matter
              * what the strides are.
              */
             || p->vecsz->rnk == 0
             /* can operate in-place as long as strides are the same */
             || X(rdft2_inplace_strides)(p, RNK_MINFTY));
}
/*
 * Create a plan for rdft2 problem p_ with this solver's codelet, or
 * return a null plan if the solver is not applicable.  The planner
 * argument is unused.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
P *pln;
const problem_rdft2 *p;
iodim *d;
int r2hc_kindp;
static const plan_adt padt = {
X(rdft2_solve), X(null_awake), print, destroy
};
UNUSED(plnr);
if (!applicable(ego_, p_))
return (plan *)0;
p = (const problem_rdft2 *) p_;
r2hc_kindp = R2HC_KINDP(p->kind);
A(r2hc_kindp || HC2R_KINDP(p->kind));
/* only the plain R2HC kind uses the apply routine that zeroes ci[0] and ci[ilast] */
pln = MKPLAN_RDFT2(P, &padt, p->kind == R2HC ? apply_r2hc : apply);
d = p->sz->dims;
pln->k = ego->k;
/* R2HC kinds: rs from the input stride, cs from the output stride;
   HC2R kinds: the reverse */
pln->rs = X(mkstride)(d->n, r2hc_kindp ? d->is : d->os);
pln->cs = X(mkstride)(d->n, r2hc_kindp ? d->os : d->is);
X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
/* Nyquist freq., if any */
pln->ilast = (d->n % 2) ? 0 : (d->n/2) * d->os;
pln->slv = ego;
/* cost estimate: codelet ops scaled by vl / genus->vl */
X(ops_zero)(&pln->super.super.ops);
X(ops_madd2)(pln->vl / ego->desc->genus->vl,
&ego->desc->ops,
&pln->super.super.ops);
if (p->kind == R2HC)
pln->super.super.ops.other += 2 * pln->vl; /* + 2 stores */
pln->super.super.could_prune_now_p = 1;
return &(pln->super.super);
}
/* constructor: allocate a direct rdft2 solver for codelet k and its descriptor */
solver *X(mksolver_rdft2_direct)(kr2c k, const kr2c_desc *desc)
{
     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->desc = desc;
     self->k = k;
     return &(self->super);
}

232
fftw-3.3.10/rdft/generic.c Normal file
View File

@@ -0,0 +1,232 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* solver: one instance per transform kind (R2HC and HC2R are registered) */
typedef struct {
solver super;
rdft_kind kind; /* the only problem kind this solver instance accepts */
} S;
/* plan for the generic odd-prime-size solver */
typedef struct {
plan_rdft super;
twid *td; /* twiddle table, managed by awake() */
INT n, is, os; /* transform size and input/output strides */
rdft_kind kind; /* copied from the solver; used by print() */
} P;
/***************************************************************************/
/*
 * Two interleaved dot products.  For each k with 1 <= k and 2k < n,
 * x[2k-1] * w[2k-2] is accumulated on top of x[0] and stored in *or0,
 * and x[2k] * w[2k-1] is accumulated from zero and stored in *oi1.
 */
static void cdot_r2hc(INT n, const E *x, const R *w, R *or0, R *oi1)
{
     E acc_r = x[0];
     E acc_i = 0;
     INT k;

     for (k = 1; k + k < n; ++k) {
          const E *xk = x + (2 * k - 1);
          const R *wk = w + (2 * k - 2);

          acc_r += xk[0] * wk[0];
          acc_i += xk[1] * wk[1];
     }

     *or0 = acc_r;
     *oi1 = acc_i;
}
/*
 * Fill o[] for the r2hc dot products: o[0] = xr[0], followed by one
 * (sum, difference) pair for each k with 2k < n, built from xr[k * xs]
 * and xr[(n - k) * xs].  The sign of the difference depends on
 * FFT_SIGN.  *pr receives xr[0] plus all the pair sums.
 */
static void hartley_r2hc(INT n, const R *xr, INT xs, E *o, R *pr)
{
     E total = xr[0];
     INT k;

     o[0] = total;
     for (k = 1; k + k < n; ++k) {
          R lo = xr[k * xs];
          R hi = xr[(n - k) * xs];
          E *pair = o + (2 * k - 1);
          E sum = lo + hi;

          pair[0] = sum;
          total += sum;
#if FFT_SIGN == -1
          pair[1] = hi - lo;
#else
          pair[1] = lo - hi;
#endif
     }

     *pr = total;
}
/*
 * R2HC apply routine.  hartley_r2hc fills a temporary buffer of n
 * values and writes O[0]; each remaining output pair O[k * os],
 * O[(n - k) * os] is then one cdot_r2hc call against the next n - 1
 * twiddle entries.
 */
static void apply_r2hc(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     INT n = ego->n, is = ego->is, os = ego->os;
     const R *W = ego->td->W;
     E *buf;
     size_t bufsz = n * sizeof(E);
     INT k = 1;

     BUF_ALLOC(E *, buf, bufsz);

     hartley_r2hc(n, I, is, buf, O);

     while (k + k < n) {
          cdot_r2hc(n, buf, W, O + k * os, O + (n - k) * os);
          W += n - 1;
          ++k;
     }

     BUF_FREE(buf, bufsz);
}
/*
 * Two interleaved dot products, accumulated exactly as in the r2hc
 * variant, but the outputs are the sum and difference of the two
 * accumulators.  Which output gets the sum depends on FFT_SIGN.
 */
static void cdot_hc2r(INT n, const E *x, const R *w, R *or0, R *or1)
{
     E acc_r = x[0];
     E acc_i = 0;
     INT k;

     for (k = 1; k + k < n; ++k) {
          const E *xk = x + (2 * k - 1);
          const R *wk = w + (2 * k - 2);

          acc_r += xk[0] * wk[0];
          acc_i += xk[1] * wk[1];
     }

#if FFT_SIGN == -1
     *or0 = acc_r - acc_i;
     *or1 = acc_r + acc_i;
#else
     *or0 = acc_r + acc_i;
     *or1 = acc_r - acc_i;
#endif
}
/*
 * Fill o[] for the hc2r dot products: o[0] = x[0], followed by the
 * pair (2 * x[k * xs], 2 * x[(n - k) * xs]) for each k with 2k < n.
 * *pr receives x[0] plus the first element of every pair.
 */
static void hartley_hc2r(INT n, const R *x, INT xs, E *o, R *pr)
{
     E total = x[0];
     INT k;

     o[0] = total;
     for (k = 1; k + k < n; ++k) {
          E *pair = o + (2 * k - 1);
          E twice = x[k * xs] + x[k * xs];

          pair[0] = twice;
          total += twice;
          pair[1] = x[(n - k) * xs] + x[(n - k) * xs];
     }

     *pr = total;
}
/*
 * HC2R apply routine.  hartley_hc2r fills a temporary buffer of n
 * values and writes O[0]; each remaining output pair O[k * os],
 * O[(n - k) * os] is then one cdot_hc2r call against the next n - 1
 * twiddle entries.
 */
static void apply_hc2r(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     INT n = ego->n, is = ego->is, os = ego->os;
     const R *W = ego->td->W;
     E *buf;
     size_t bufsz = n * sizeof(E);
     INT k = 1;

     BUF_ALLOC(E *, buf, bufsz);

     hartley_hc2r(n, I, is, buf, O);

     while (k + k < n) {
          cdot_hc2r(n, buf, W, O + k * os, O + (n - k) * os);
          W += n - 1;
          ++k;
     }

     BUF_FREE(buf, bufsz);
}
/***************************************************************************/
/* create or release the plan's twiddle table according to wakefulness */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     static const tw_instr half_tw[] = {
          { TW_HALF, 1, 0 },
          { TW_NEXT, 1, 0 }
     };
     P *pln = (P *) ego_;
     INT n = pln->n;

     X(twiddle_awake)(wakefulness, &pln->td, half_tw, n, n, (n - 1) / 2);
}
/* print the plan: direction ("r2hc" or "hc2r") and size */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const char *dir = "hc2r";

     if (pln->kind == R2HC)
          dir = "r2hc";

     p->print(p, "(rdft-generic-%s-%D)", dir, pln->n);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_rdft *p = (const problem_rdft *) p_;
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
&& (p->sz->dims[0].n % 2) == 1
&& CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
&& X(is_prime)(p->sz->dims[0].n)
&& p->kind[0] == ego->kind
);
}
/*
 * Create a generic plan for problem p_, or return a null plan if the
 * solver is not applicable.  The twiddle table is not built here: td
 * starts out null and awake() calls twiddle_awake on it.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *)ego_;
const problem_rdft *p;
P *pln;
INT n;
static const plan_adt padt = {
X(rdft_solve), awake, print, X(plan_null_destroy)
};
if (!applicable(ego, p_, plnr))
return (plan *)0;
p = (const problem_rdft *) p_;
pln = MKPLAN_RDFT(P, &padt,
R2HC_KINDP(p->kind[0]) ? apply_r2hc : apply_hc2r);
pln->n = n = p->sz->dims[0].n;
pln->is = p->sz->dims[0].is;
pln->os = p->sz->dims[0].os;
pln->td = 0;
pln->kind = ego->kind;
/* cost estimate as a function of n */
pln->super.super.ops.add = (n-1) * 2.5;
pln->super.super.ops.mul = 0;
pln->super.super.ops.fma = 0.5 * (n-1) * (n-1) ;
#if 0 /* these are nice pipelined sequential loads and should cost nothing */
pln->super.super.ops.other = (n-1)*(2 + 1 + (n-1)); /* approximate */
#endif
return &(pln->super.super);
}
/* allocate a generic solver that accepts problems of the given kind */
static solver *mksolver(rdft_kind kind)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->kind = kind;
     return &(self->super);
}
/* register one generic solver per supported kind, R2HC first and then HC2R */
void X(rdft_generic_register)(planner *p)
{
     static const rdft_kind kinds[] = { R2HC, HC2R };
     int i;

     for (i = 0; i < 2; ++i)
          REGISTER_SOLVER(p, mksolver(kinds[i]));
}

View File

@@ -0,0 +1,279 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/hc2hc.h"
/* solver: wraps one hc2hc codelet; registered in buffered and unbuffered variants */
typedef struct {
hc2hc_solver super;
const hc2hc_desc *desc; /* codelet descriptor (radix, genus, tw, nam, ops are read from it) */
khc2hc k; /* the codelet itself */
int bufferedp; /* nonzero selects apply_buf instead of apply */
} S;
/* plan for one twiddle step executed by a direct hc2hc codelet */
typedef struct {
plan_hc2hc super;
khc2hc k; /* codelet, copied from the solver */
plan *cld0, *cldm; /* children for 0th and middle butterflies */
INT r, m, v; /* radix, m, and number of vectors looped over in apply */
INT ms, vs, mb, me; /* strides for m and v; [mb, me) is the index range given to the codelet */
stride rs, brs; /* radix stride for the user array and for the batch buffer */
twid *td; /* twiddle table, managed by awake() */
const S *slv; /* owning solver */
} P;
/*************************************************************
Nonbuffered code
*************************************************************/
/*
 * Unbuffered apply routine.  For each of the v vectors: run the child
 * plan for the 0th butterfly, call the codelet in place on the index
 * range [mb, me), then run the child plan for the middle butterfly.
 */
static void apply(const plan *ego_, R *IO)
{
     const P *ego = (const P *) ego_;
     plan_rdft *cld0 = (plan_rdft *) ego->cld0;
     plan_rdft *cldm = (plan_rdft *) ego->cldm;
     INT m = ego->m, ms = ego->ms;
     INT mb = ego->mb, me = ego->me;
     INT lo = ms * mb;           /* offset of the ascending pointer */
     INT hi = (m - mb) * ms;     /* offset of the descending pointer */
     INT mid = (m/2) * ms;       /* offset of the middle butterfly */
     INT left;

     for (left = ego->v; left > 0; --left, IO += ego->vs) {
          cld0->apply((plan *) cld0, IO, IO);

          ego->k(IO + lo, IO + hi,
                 ego->td->W, ego->rs, mb, me, ms);

          cldm->apply((plan *) cldm, IO + mid, IO + mid);
     }
}
/*************************************************************
Buffered code
*************************************************************/
/*
 * Batch size for the buffered code: radix rounded up to a multiple of
 * 4, plus 2.  The result should not be 2^k, to avoid associativity
 * conflicts.
 */
static INT compute_batchsize(INT radix)
{
     INT rounded = (radix + 3) & -4;   /* next multiple of 4 */

     return rounded + 2;
}
/*
 * Run the codelet on the index range [mb, me) through the scratch
 * buffer.  The two halves (starting at IOp + mb * ms and at
 * IOm - mb * ms) are copied into bufp and bufm, the codelet runs on
 * the buffer with the buffer stride brs, and the two halves are
 * copied back to where they came from.
 */
static void dobatch(const P *ego, R *IOp, R *IOm,
INT mb, INT me, R *bufp)
{
INT b = WS(ego->brs, 1);
INT rs = WS(ego->rs, 1);
INT r = ego->r;
INT ms = ego->ms;
/* the second half is addressed from offset b - 1 in the buffer */
R *bufm = bufp + b - 1;
/* copy in: second call uses the negated strides (-ms, -1) */
X(cpy2d_ci)(IOp + mb * ms, bufp, r, rs, b, me - mb, ms, 1, 1);
X(cpy2d_ci)(IOm - mb * ms, bufm, r, rs, b, me - mb, -ms, -1, 1);
ego->k(bufp, bufm, ego->td->W, ego->brs, mb, me, 1);
/* copy out: mirror image of the two copies above */
X(cpy2d_co)(bufp, IOp + mb * ms, r, b, rs, me - mb, 1, ms, 1);
X(cpy2d_co)(bufm, IOm - mb * ms, r, b, rs, me - mb, -1, -ms, 1);
}
/*
 * Buffered apply routine.  For each of the v vectors: run the child
 * plan for the 0th butterfly, feed the index range [mb, me) to
 * dobatch in pieces of at most compute_batchsize(r), then run the
 * child plan for the middle butterfly.  One scratch buffer of
 * r * batchsz * 2 reals is shared by all batches.
 */
static void apply_buf(const plan *ego_, R *IO)
{
     const P *ego = (const P *) ego_;
     plan_rdft *cld0 = (plan_rdft *) ego->cld0;
     plan_rdft *cldm = (plan_rdft *) ego->cldm;
     INT m = ego->m, r = ego->r, ms = ego->ms;
     INT mb = ego->mb, me = ego->me;
     INT batchsz = compute_batchsize(r);
     INT mid = ms * (m/2);
     INT left, j;
     R *buf;
     size_t bufsz = r * batchsz * 2 * sizeof(R);

     BUF_ALLOC(R *, buf, bufsz);

     for (left = ego->v; left > 0; --left, IO += ego->vs) {
          R *IOp = IO;
          R *IOm = IO + m * ms;

          cld0->apply((plan *) cld0, IO, IO);

          /* full batches */
          j = mb;
          while (j + batchsz < me) {
               dobatch(ego, IOp, IOm, j, j + batchsz, buf);
               j += batchsz;
          }

          /* remainder */
          dobatch(ego, IOp, IOm, j, me, buf);

          cldm->apply((plan *) cldm, IO + mid, IO + mid);
     }

     BUF_FREE(buf, bufsz);
}
/* propagate wakefulness to both child plans and to the twiddle table */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;
     INT r = pln->r, m = pln->m;

     X(plan_awake)(pln->cld0, wakefulness);
     X(plan_awake)(pln->cldm, wakefulness);
     X(twiddle_awake)(wakefulness, &pln->td, pln->slv->desc->tw,
                      r * m, r, (m - 1) / 2);
}
/* destroy both child plans and release both stride objects */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     /* children */
     X(plan_destroy_internal)(pln->cld0);
     X(plan_destroy_internal)(pln->cldm);

     /* strides */
     X(stride_destroy)(pln->rs);
     X(stride_destroy)(pln->brs);
}
/*
 * Print the plan and its two children.  The buffered variant also
 * reports the batch size.
 */
static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     const S *slv = ego->slv;
     const hc2hc_desc *e = slv->desc;

     if (slv->bufferedp) {
          INT batchsz = compute_batchsize(ego->r);

          p->print(p, "(hc2hc-directbuf/%D-%D/%D%v \"%s\"%(%p%)%(%p%))",
                   batchsz, ego->r, X(twiddle_length)(ego->r, e->tw),
                   ego->v, e->nam, ego->cld0, ego->cldm);
          return;
     }

     p->print(p, "(hc2hc-direct-%D/%D%v \"%s\"%(%p%)%(%p%))",
              ego->r, X(twiddle_length)(ego->r, e->tw), ego->v, e->nam,
              ego->cld0, ego->cldm);
}
/* the codelet fits iff both the radix and the transform kind match its descriptor */
static int applicable0(const S *ego, rdft_kind kind, INT r)
{
     const hc2hc_desc *e = ego->desc;

     if (r != e->radix)
          return 0;
     return kind == e->genus->kind;
}
/*
 * Full applicability test: the codelet must fit, and when the planner
 * has NO_UGLY set the configuration must not be rejected by ct_uglyp
 * (first argument 512 for the buffered solver, 16 for the unbuffered
 * one).
 */
static int applicable(const S *ego, rdft_kind kind, INT r, INT m, INT v,
                      const planner *plnr)
{
     INT threshold;

     if (!applicable0(ego, kind, r))
          return 0;

     if (!NO_UGLYP(plnr))
          return 1;

     threshold = ego->bufferedp ? (INT)512 : (INT)16;
     return !X(ct_uglyp)(threshold, v, m * r, r);
}
/* nonzero when 2 * (mstart + mcount) == m + 2; mkcldw then gives the
   "middle" child a real size-r problem and excludes one index from me */
#define CLDMP(m, mstart, mcount) (2 * ((mstart) + (mcount)) == (m) + 2)
/* nonzero when the chunk starts at index 0; mkcldw then gives the
   "0th" child a real size-r problem and excludes index 0 from mb */
#define CLD0P(mstart) ((mstart) == 0)
/*
 * Create the twiddle-step plan for the chunk [mstart, mstart + mcount)
 * of a radix-r decomposition, or return a null plan if the solver is
 * not applicable or a child plan cannot be created.
 */
static plan *mkcldw(const hc2hc_solver *ego_,
rdft_kind kind, INT r, INT m, INT ms, INT v, INT vs,
INT mstart, INT mcount,
R *IO, planner *plnr)
{
const S *ego = (const S *) ego_;
P *pln;
const hc2hc_desc *e = ego->desc;
plan *cld0 = 0, *cldm = 0;
INT imid = (m / 2) * ms;
INT rs = m * ms;
static const plan_adt padt = {
0, awake, print, destroy
};
if (!applicable(ego, kind, r, m, v, plnr))
return (plan *)0;
/* child for the 0th butterfly: a size-r transform of the same kind
   when the chunk starts at 0, otherwise a 0-d tensor problem */
cld0 = X(mkplan_d)(
plnr,
X(mkproblem_rdft_1_d)((CLD0P(mstart) ?
X(mktensor_1d)(r, rs, rs) : X(mktensor_0d)()),
X(mktensor_0d)(),
TAINT(IO, vs), TAINT(IO, vs),
kind));
if (!cld0) goto nada;
/* child for the middle butterfly, at offset imid: a size-r R2HCII or
   HC2RIII transform when CLDMP holds, otherwise a 0-d tensor problem */
cldm = X(mkplan_d)(
plnr,
X(mkproblem_rdft_1_d)((CLDMP(m, mstart, mcount) ?
X(mktensor_1d)(r, rs, rs) : X(mktensor_0d)()),
X(mktensor_0d)(),
TAINT(IO + imid, vs), TAINT(IO + imid, vs),
kind == R2HC ? R2HCII : HC2RIII));
if (!cldm) goto nada;
pln = MKPLAN_HC2HC(P, &padt, ego->bufferedp ? apply_buf : apply);
pln->k = ego->k;
pln->td = 0;
pln->r = r; pln->rs = X(mkstride)(r, rs);
pln->m = m; pln->ms = ms;
pln->v = v; pln->vs = vs;
pln->slv = ego;
pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
pln->cld0 = cld0;
pln->cldm = cldm;
/* the codelet range excludes the indices handled by the children */
pln->mb = mstart + CLD0P(mstart);
pln->me = mstart + mcount - CLDMP(m, mstart, mcount);
/* cost estimate: codelet ops plus both children, per vector */
X(ops_zero)(&pln->super.super.ops);
X(ops_madd2)(v * ((pln->me - pln->mb) / e->genus->vl),
&e->ops, &pln->super.super.ops);
X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
/* the buffered variant is charged extra "other" ops */
if (ego->bufferedp)
pln->super.super.ops.other += 4 * r * (pln->me - pln->mb) * v;
pln->super.super.could_prune_now_p =
(!ego->bufferedp && r >= 5 && r < 64 && m >= r);
return &(pln->super.super);
nada:
/* failure path: destroy whichever children were created */
X(plan_destroy_internal)(cld0);
X(plan_destroy_internal)(cldm);
return 0;
}
static void regone(planner *plnr, khc2hc codelet, const hc2hc_desc *desc,
int bufferedp)
{
S *slv = (S *)X(mksolver_hc2hc)(sizeof(S), desc->radix, mkcldw);
slv->k = codelet;
slv->desc = desc;
slv->bufferedp = bufferedp;
REGISTER_SOLVER(plnr, &(slv->super.super));
if (X(mksolver_hc2hc_hook)) {
slv = (S *)X(mksolver_hc2hc_hook)(sizeof(S), desc->radix, mkcldw);
slv->k = codelet;
slv->desc = desc;
slv->bufferedp = bufferedp;
REGISTER_SOLVER(plnr, &(slv->super.super));
}
}
/* register the codelet twice: first unbuffered, then buffered */
void X(regsolver_hc2hc_direct)(planner *plnr, khc2hc codelet,
                               const hc2hc_desc *desc)
{
     int bufferedp;

     for (bufferedp = 0; bufferedp <= 1; ++bufferedp)
          regone(plnr, codelet, desc, bufferedp);
}

View File

@@ -0,0 +1,322 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* express a hc2hc problem in terms of rdft + multiplication by
twiddle factors */
#include "rdft/hc2hc.h"
/* this solver adds no fields of its own to the common hc2hc solver */
typedef hc2hc_solver S;
/* plan for a generic twiddle step expressed as rdft + twiddle multiplication */
typedef struct {
plan_hc2hc super;
INT r, m, s, vl, vs, mstart1, mcount1; /* radix, m, stride, vector loop, and the chunk [mstart1, mstart1 + mcount1) */
plan *cld0; /* child for the 0th (DC) transforms */
plan *cld; /* child for the twiddle transforms */
twid *td; /* twiddle table, managed by awake() via mktwiddle */
} P;
/**************************************************************/
/* create or release the plan's twiddle table according to wakefulness */
static void mktwiddle(P *ego, enum wakefulness wakefulness)
{
     static const tw_instr tw[] = { { TW_HALF, 0, 0 }, { TW_NEXT, 1, 0 } };
     INT r = ego->r, m = ego->m;

     /* note that R and M are swapped, to allow for sequential
        access both to data and twiddles */
     X(twiddle_awake)(wakefulness, &ego->td, tw, r * m, m, r);
}
/*
 * Multiply the data in place by the twiddle factors.  For each of the
 * vl vectors and each k in [1, r), the mcount1 pairs (*pr, *pi) are
 * treated as complex numbers and multiplied by (W[0], sign * W[1]),
 * with W read sequentially from the twiddle table.
 */
static void bytwiddle(const P *ego, R *IO, R sign)
{
INT i, j, k;
INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
INT ms = m * s;
INT mstart1 = ego->mstart1, mcount1 = ego->mcount1;
/* twiddle entries to skip after each k so that W moves (m - 1) per k */
INT wrem = 2 * ((m-1)/2 - mcount1);
for (i = 0; i < vl; ++i, IO += vs) {
const R *W = ego->td->W;
A(m % 2 == 1);
/* start at k = 1, skipping the first (m - 1) + 2 * (mstart1 - 1) entries */
for (k = 1, W += (m - 1) + 2*(mstart1-1); k < r; ++k) {
/* pr := IO + (j + mstart1) * s + k * ms */
R *pr = IO + mstart1 * s + k * ms;
/* pi := IO + (m - j - mstart1) * s + k * ms */
R *pi = IO - mstart1 * s + (k + 1) * ms;
for (j = 0; j < mcount1; ++j, pr += s, pi -= s) {
E xr = *pr;
E xi = *pi;
E wr = W[0];
E wi = sign * W[1];
/* complex product (xr + i xi) * (wr + i wi) */
*pr = xr * wr - xi * wi;
*pi = xi * wr + xr * wi;
W += 2;
}
W += wrem;
}
}
}
/*
 * For each k with 2k < r and each j in [jstart, jend), exchange the
 * element at (m - j) * s + k * ms with the element at
 * (m - j) * s + (r - 1 - k) * ms.
 */
static void swapri(R *IO, INT r, INT m, INT s, INT jstart, INT jend)
{
INT k;
INT ms = m * s;
INT js = jstart * s;
for (k = 0; k + k < r; ++k) {
/* pr := IO + (m - j) * s + k * ms */
R *pr = IO + (k + 1) * ms - js;
/* pi := IO + (m - j) * s + (r - 1 - k) * ms */
R *pi = IO + (r - k) * ms - js;
INT j;
for (j = jstart; j < jend; j += 1, pr -= s, pi -= s) {
R t = *pr;
*pr = *pi;
*pi = t;
}
}
}
/*
 * Final reordering step of apply_dit.  For each of the vl vectors,
 * every row pair (k, r - k) with 1 <= k and 2k < r has its four
 * elements at j * s and ms - j * s recombined by sums and differences
 * for j in [mstart1, mend1); swapri is then applied to the vector.
 */
static void reorder_dit(const P *ego, R *IO)
{
INT i, k;
INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
INT ms = m * s;
INT mstart1 = ego->mstart1, mend1 = mstart1 + ego->mcount1;
for (i = 0; i < vl; ++i, IO += vs) {
for (k = 1; k + k < r; ++k) {
R *p0 = IO + k * ms;
R *p1 = IO + (r - k) * ms;
INT j;
for (j = mstart1; j < mend1; ++j) {
E rp, ip, im, rm;
/* read all four inputs before overwriting any of them */
rp = p0[j * s];
im = p1[ms - j * s];
rm = p1[j * s];
ip = p0[ms - j * s];
p0[j * s] = rp - im;
p1[ms - j * s] = rp + im;
p1[j * s] = rm - ip;
p0[ms - j * s] = ip + rm;
}
}
swapri(IO, r, m, s, mstart1, mend1);
}
}
/*
 * First reordering step of apply_dif.  For each of the vl vectors,
 * swapri is applied first; every row pair (k, r - k) with 1 <= k and
 * 2k < r then has its four elements at j * s and ms - j * s halved
 * and recombined by sums and differences for j in [mstart1, mend1).
 */
static void reorder_dif(const P *ego, R *IO)
{
INT i, k;
INT r = ego->r, m = ego->m, s = ego->s, vl = ego->vl, vs = ego->vs;
INT ms = m * s;
INT mstart1 = ego->mstart1, mend1 = mstart1 + ego->mcount1;
for (i = 0; i < vl; ++i, IO += vs) {
swapri(IO, r, m, s, mstart1, mend1);
for (k = 1; k + k < r; ++k) {
R *p0 = IO + k * ms;
R *p1 = IO + (r - k) * ms;
const R half = K(0.5);
INT j;
for (j = mstart1; j < mend1; ++j) {
E rp, ip, im, rm;
/* read (and halve) all four inputs before overwriting any of them */
rp = half * p0[j * s];
im = half * p1[ms - j * s];
rm = half * p1[j * s];
ip = half * p0[ms - j * s];
p0[j * s] = rp + im;
p1[ms - j * s] = im - rp;
p1[j * s] = rm + ip;
p0[ms - j * s] = ip - rm;
}
}
}
}
/*
 * The generic twiddle step handles only plain R2HC/HC2R problems with
 * both m and r odd, and only when the planner does not have NO_SLOW
 * set.
 */
static int applicable(rdft_kind kind, INT r, INT m, const planner *plnr)
{
     if (kind != R2HC && kind != HC2R)
          return 0;
     if (!(m % 2) || !(r % 2))
          return 0;
     return !NO_SLOWP(plnr);
}
/**************************************************************/
/*
 * DIT apply routine: multiply by twiddles with sign -1, run both child
 * plans in place, then reorder the result.
 */
static void apply_dit(const plan *ego_, R *IO)
{
     const P *ego = (const P *) ego_;
     plan_rdft *cld0 = (plan_rdft *) ego->cld0;
     plan_rdft *cld = (plan_rdft *) ego->cld;
     R *first = IO + ego->mstart1 * ego->s;

     bytwiddle(ego, IO, K(-1.0));

     cld0->apply(ego->cld0, IO, IO);
     cld->apply(ego->cld, first, first);

     reorder_dit(ego, IO);
}
/*
 * DIF apply routine: reorder the input, run both child plans in place,
 * then multiply by twiddles with sign +1.
 */
static void apply_dif(const plan *ego_, R *IO)
{
     const P *ego = (const P *) ego_;
     plan_rdft *cld0 = (plan_rdft *) ego->cld0;
     plan_rdft *cld = (plan_rdft *) ego->cld;
     R *first = IO + ego->mstart1 * ego->s;

     reorder_dif(ego, IO);

     cld0->apply(ego->cld0, IO, IO);
     cld->apply(ego->cld, first, first);

     bytwiddle(ego, IO, K(1.0));
}
/* propagate wakefulness to both child plans and to the twiddle table */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *pln = (P *) ego_;

     X(plan_awake)(pln->cld0, wakefulness);
     X(plan_awake)(pln->cld, wakefulness);
     mktwiddle(pln, wakefulness);
}
/* destroy both child plans: the twiddle child first, then the DC child */
static void destroy(plan *ego_)
{
     P *pln = (P *) ego_;

     X(plan_destroy_internal)(pln->cld);
     X(plan_destroy_internal)(pln->cld0);
}
/* print the plan: variant (dit/dif), r, m, vector length, and both children */
static void print(const plan *ego_, printer *p)
{
     const P *pln = (const P *) ego_;
     const char *variant = "dif";

     if (pln->super.apply == apply_dit)
          variant = "dit";

     p->print(p, "(hc2hc-generic-%s-%D-%D%v%(%p%)%(%p%))",
              variant,
              pln->r, pln->m, pln->vl, pln->cld0, pln->cld);
}
/*
 * Create the generic twiddle-step plan for the chunk
 * [mstart, mstart + mcount), or return a null plan if the step is not
 * applicable or a child plan cannot be created.  The solver argument
 * is unused.
 */
static plan *mkcldw(const hc2hc_solver *ego_,
rdft_kind kind, INT r, INT m, INT s, INT vl, INT vs,
INT mstart, INT mcount,
R *IO, planner *plnr)
{
P *pln;
plan *cld0 = 0, *cld = 0;
INT mstart1, mcount1, mstride;
static const plan_adt padt = {
0, awake, print, destroy
};
UNUSED(ego_);
A(mstart >= 0 && mcount > 0 && mstart + mcount <= (m+2)/2);
if (!applicable(kind, r, m, plnr))
return (plan *)0;
A(m % 2);
/* index 0 is handled by cld0, so drop it from the twiddled chunk */
mstart1 = mstart + (mstart == 0);
mcount1 = mcount - (mstart == 0);
mstride = m - (mstart + mcount - 1) - mstart1;
/* 0th (DC) transform (vl of these), if mstart == 0 */
cld0 = X(mkplan_d)(plnr,
X(mkproblem_rdft_1_d)(
mstart == 0 ? X(mktensor_1d)(r, m * s, m * s)
: X(mktensor_0d)(),
X(mktensor_1d)(vl, vs, vs),
IO, IO, kind)
);
if (!cld0) goto nada;
/* twiddle transforms: there are 2 x mcount1 x vl of these
(where 2 corresponds to the real and imaginary parts) ...
the 2 x mcount1 loops are combined if mstart=0 and mcount=(m+2)/2. */
cld = X(mkplan_d)(plnr,
X(mkproblem_rdft_1_d)(
X(mktensor_1d)(r, m * s, m * s),
X(mktensor_3d)(2, mstride * s, mstride * s,
mcount1, s, s,
vl, vs, vs),
IO + s * mstart1, IO + s * mstart1, kind)
);
if (!cld) goto nada;
/* R2HC is executed as DIT, every other accepted kind as DIF */
pln = MKPLAN_HC2HC(P, &padt, (kind == R2HC) ? apply_dit : apply_dif);
pln->cld = cld;
pln->cld0 = cld0;
pln->r = r;
pln->m = m;
pln->s = s;
pln->vl = vl;
pln->vs = vs;
pln->td = 0;
pln->mstart1 = mstart1;
pln->mcount1 = mcount1;
{
/* cost estimate: start from cld's ops and add per-element costs
   scaled by n0 */
double n0 = 0.5 * (r - 1) * (2 * mcount1) * vl;
pln->super.super.ops = cld->ops;
pln->super.super.ops.mul += (kind == R2HC ? 5.0 : 7.0) * n0;
pln->super.super.ops.add += 4.0 * n0;
pln->super.super.ops.other += 11.0 * n0;
}
return &(pln->super.super);
nada:
/* failure path: destroy whichever children were created */
X(plan_destroy_internal)(cld);
X(plan_destroy_internal)(cld0);
return (plan *) 0;
}
/*
 * Register the generic hc2hc solver for radix r.  A second solver,
 * built through mksolver_hc2hc_hook, is registered as well when that
 * hook is set.
 */
static void regsolver(planner *plnr, INT r)
{
     S *plain = (S *)X(mksolver_hc2hc)(sizeof(S), r, mkcldw);

     REGISTER_SOLVER(plnr, &(plain->super));

     if (X(mksolver_hc2hc_hook)) {
          S *hooked = (S *)X(mksolver_hc2hc_hook)(sizeof(S), r, mkcldw);

          REGISTER_SOLVER(plnr, &(hooked->super));
     }
}
/* register the generic hc2hc solver, passing 0 as its radix argument */
void X(hc2hc_generic_register)(planner *p)
{
     const INT radix = 0;

     regsolver(p, radix);
}

214
fftw-3.3.10/rdft/hc2hc.c Normal file
View File

@@ -0,0 +1,214 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/hc2hc.h"
/* optional alternate hc2hc solver constructor; null unless something
   installs it */
hc2hc_solver *(*X(mksolver_hc2hc_hook))(size_t, INT, hc2hc_mkinferior) = 0;
/* plan combining an rdft child with a twiddle (hc2hc) child */
typedef struct {
plan_rdft super;
plan *cld; /* rdft child, applied from I to O */
plan *cldw; /* hc2hc child, applied in place */
INT r; /* radix, reported by print() */
} P;
/* DIT: run the rdft child from I to O, then the twiddle child in place on O */
static void apply_dit(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     plan_rdft *cld = (plan_rdft *) ego->cld;
     plan_hc2hc *cldw = (plan_hc2hc *) ego->cldw;

     cld->apply(ego->cld, I, O);
     cldw->apply(ego->cldw, O);
}
/* DIF: run the twiddle child in place on I, then the rdft child from I to O */
static void apply_dif(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     plan_hc2hc *cldw = (plan_hc2hc *) ego->cldw;
     plan_rdft *cld = (plan_rdft *) ego->cld;

     cldw->apply(ego->cldw, I);
     cld->apply(ego->cld, I, O);
}
/* Forward the wakefulness change to both child plans (cld, then cldw). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
    P *self = (P *) ego_;

    X(plan_awake)(self->cld, wakefulness);
    X(plan_awake)(self->cldw, wakefulness);
}
/* Release both child plans (cldw first, then cld). */
static void destroy(plan *ego_)
{
    P *self = (P *) ego_;

    X(plan_destroy_internal)(self->cldw);
    X(plan_destroy_internal)(self->cld);
}
/* Print the plan as "(rdft-ct-dit/r ...)" or "(rdft-ct-dif/r ...)",
   followed by the cldw and cld child plans. */
static void print(const plan *ego_, printer *p)
{
    const P *self = (const P *) ego_;
    const char *variant = (self->super.apply == apply_dit) ? "dit" : "dif";

    p->print(p, "(rdft-ct-%s/%D%(%p%)%(%p%))",
             variant, self->r, self->cldw, self->cld);
}
/* Basic applicability test.  Returns 1 only for a rank-1 transform with
   vector rank <= 1 whose kind is R2HC, or HC2R when the input may be
   overwritten, and for which X(choose_radix) yields a positive radix
   strictly smaller than the transform size.  Returns 0 otherwise. */
static int applicable0(const hc2hc_solver *ego, const problem *p_, planner *plnr)
{
    const problem_rdft *p = (const problem_rdft *) p_;
    INT r;

    if (p->sz->rnk != 1)
        return 0;
    if (!(p->vecsz->rnk <= 1))
        return 0;

    if (p->kind[0] == R2HC) {
        /* R2HC is solved by DIT: no further condition */
    } else if (p->kind[0] == HC2R) {
        /* HC2R is solved by DIF, which destroys the input */
        if (!(p->I == p->O || !NO_DESTROY_INPUTP(plnr)))
            return 0;
    } else {
        return 0;
    }

    r = X(choose_radix)(ego->r, p->sz->dims[0].n);
    if (!(r > 0))
        return 0;

    return p->sz->dims[0].n > r;
}
/* Full applicability test: applicable0 must hold, and in addition either
   the problem has vector rank 0 or the planner does not set NO_VRECURSEP. */
int X(hc2hc_applicable)(const hc2hc_solver *ego, const problem *p_, planner *plnr)
{
    if (applicable0(ego, p_, plnr)) {
        const problem_rdft *p = (const problem_rdft *) p_;

        if (p->vecsz->rnk == 0)
            return 1;
        if (!NO_VRECURSEP(plnr))
            return 1;
    }
    return 0;
}
/* Build a plan for a rank-1 R2HC or HC2R problem of size n = r * m, where r
   is returned by X(choose_radix).  Two children are created: cldw through
   the solver's mkcldw callback, and cld through the planner for the
   remaining sub-problem.  R2HC plans use apply_dit, HC2R plans apply_dif.
   Returns a null plan when NO_NONTHREADEDP is set, when the solver is not
   applicable, or when either child cannot be created. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
    const hc2hc_solver *ego = (const hc2hc_solver *) ego_;
    const problem_rdft *p;
    P *pln = 0;
    plan *cld = 0, *cldw = 0;
    INT n, r, m, v, ivs, ovs;
    iodim *d;

    static const plan_adt padt = {
        X(rdft_solve), awake, print, destroy
    };

    if (NO_NONTHREADEDP(plnr) || !X(hc2hc_applicable)(ego, p_, plnr))
        return (plan *) 0;

    p = (const problem_rdft *) p_;
    d = p->sz->dims;
    n = d[0].n;
    r = X(choose_radix)(ego->r, n);
    m = n / r;

    X(tensor_tornk1)(p->vecsz, &v, &ivs, &ovs);

    switch (p->kind[0]) {
    case R2HC:
        /* cldw operates on the output array */
        cldw = ego->mkcldw(ego,
                           R2HC, r, m, d[0].os, v, ovs, 0, (m+2)/2,
                           p->O, plnr);
        if (!cldw) goto nada;

        cld = X(mkplan_d)(plnr,
                          X(mkproblem_rdft_d)(
                              X(mktensor_1d)(m, r * d[0].is, d[0].os),
                              X(mktensor_2d)(r, d[0].is, m * d[0].os,
                                             v, ivs, ovs),
                              p->I, p->O, p->kind)
            );
        if (!cld) goto nada;

        pln = MKPLAN_RDFT(P, &padt, apply_dit);
        break;

    case HC2R:
        /* cldw operates on the input array */
        cldw = ego->mkcldw(ego,
                           HC2R, r, m, d[0].is, v, ivs, 0, (m+2)/2,
                           p->I, plnr);
        if (!cldw) goto nada;

        cld = X(mkplan_d)(plnr,
                          X(mkproblem_rdft_d)(
                              X(mktensor_1d)(m, d[0].is, r * d[0].os),
                              X(mktensor_2d)(r, m * d[0].is, d[0].os,
                                             v, ivs, ovs),
                              p->I, p->O, p->kind)
            );
        if (!cld) goto nada;

        pln = MKPLAN_RDFT(P, &padt, apply_dif);
        break;

    default:
        /* X(hc2hc_applicable) only admits R2HC and HC2R, so this is not
           expected to be reached.  If A() is compiled out (assumed to be a
           debug-only assertion), bail out rather than fall through and
           dereference the still-null pln below. */
        A(0);
        goto nada;
    }

    pln->cld = cld;
    pln->cldw = cldw;
    pln->r = r;
    X(ops_add)(&cld->ops, &cldw->ops, &pln->super.super.ops);

    /* inherit could_prune_now_p attribute from cldw */
    pln->super.super.could_prune_now_p = cldw->could_prune_now_p;

    return &(pln->super.super);

 nada:
    X(plan_destroy_internal)(cldw);
    X(plan_destroy_internal)(cld);
    return (plan *) 0;
}
/* Allocate an hc2hc solver of `size` bytes for RDFT problems and store the
   radix r and the mkcldw callback in it. */
hc2hc_solver *X(mksolver_hc2hc)(size_t size, INT r, hc2hc_mkinferior mkcldw)
{
    static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
    hc2hc_solver *result = (hc2hc_solver *)X(mksolver)(size, &sadt);

    result->mkcldw = mkcldw;
    result->r = r;
    return result;
}
/* Allocate a plan of `size` bytes with the given adt, set its hc2hc apply
   function, and return it as a generic plan pointer. */
plan *X(mkplan_hc2hc)(size_t size, const plan_adt *adt, hc2hcapply apply)
{
    plan_hc2hc *result = (plan_hc2hc *) X(mkplan)(size, adt);

    result->apply = apply;
    return &(result->super);
}

54
fftw-3.3.10/rdft/hc2hc.h Normal file
View File

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Apply function of an hc2hc plan: takes the plan and a single buffer IO. */
typedef void (*hc2hcapply) (const plan *ego, R *IO);
typedef struct hc2hc_solver_s hc2hc_solver;
/* Callback stored in an hc2hc solver (field mkcldw) that creates the
   hc2hc child plan for the given kind, radix r, size m and strides. */
typedef plan *(*hc2hc_mkinferior)(const hc2hc_solver *ego,
rdft_kind kind, INT r, INT m, INT s,
INT vl, INT vs, INT mstart, INT mcount,
R *IO, planner *plnr);
/* An hc2hc plan: a generic plan plus its apply function. */
typedef struct {
plan super;
hc2hcapply apply;
} plan_hc2hc;
extern plan *X(mkplan_hc2hc)(size_t size, const plan_adt *adt,
hc2hcapply apply);
/* Convenience wrapper: create an hc2hc plan of sizeof(type) bytes and
   cast the result to type *. */
#define MKPLAN_HC2HC(type, adt, apply) \
(type *)X(mkplan_hc2hc)(sizeof(type), adt, apply)
/* An hc2hc solver: a generic solver plus a radix and the child-plan
   factory callback. */
struct hc2hc_solver_s {
solver super;
INT r;
hc2hc_mkinferior mkcldw;
};
hc2hc_solver *X(mksolver_hc2hc)(size_t size, INT r, hc2hc_mkinferior mkcldw);
/* Optional alternative solver factory; may be null. */
extern hc2hc_solver *(*X(mksolver_hc2hc_hook))(size_t, INT, hc2hc_mkinferior);
void X(regsolver_hc2hc_direct)(planner *plnr, khc2hc codelet,
const hc2hc_desc *desc);
int X(hc2hc_applicable)(const hc2hc_solver *, const problem *, planner *);

234
fftw-3.3.10/rdft/indirect.c Normal file
View File

@@ -0,0 +1,234 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* solvers/plans for vectors of small RDFT's that cannot be done
in-place directly. Use a rank-0 plan to rearrange the data
before or after the transform. Can also change an out-of-place
plan into a copy + in-place (where the in-place transform
is e.g. unit stride). */
/* FIXME: merge with rank-geq2.c(?), since this is just a special case
of a rank split where the first/second transform has rank 0. */
#include "rdft/rdft.h"
/* Type of a function that builds the child problem from the original
   RDFT problem. */
typedef problem *(*mkcld_t) (const problem_rdft *p);
/* Describes one variant of the indirect solver. */
typedef struct {
/* apply function installed in plans of this variant */
rdftapply apply;
/* builds the child transform problem for this variant */
problem *(*mkcld)(const problem_rdft *p);
/* name used when printing the plan */
const char *nam;
} ndrct_adt;
/* Solver: a generic solver plus the variant descriptor it uses. */
typedef struct {
solver super;
const ndrct_adt *adt;
} S;
/* Plan: a base RDFT plan with two children and a back-pointer to the
   solver that created it (used by print to get the variant name). */
typedef struct {
plan_rdft super;
/* cldcpy is the rank-0 rearrangement plan; cld is the transform plan */
plan *cldcpy, *cld;
const S *slv;
} P;
/*-----------------------------------------------------------------------*/
/* first rearrange, then transform */
/* Rearrange first, then transform: run the copy plan from I to O, then run
   the transform plan in place on O. */
static void apply_before(const plan *ego_, R *I, R *O)
{
    const P *self = (const P *) ego_;
    plan_rdft *copy_plan = (plan_rdft *) self->cldcpy;
    plan_rdft *xform_plan = (plan_rdft *) self->cld;

    copy_plan->apply(self->cldcpy, I, O);
    xform_plan->apply(self->cld, O, O);
}
static problem *mkcld_before(const problem_rdft *p)
{
return X(mkproblem_rdft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_OS),
X(tensor_copy_inplace)(p->vecsz, INPLACE_OS),
p->O, p->O, p->kind);
}
/* Variant descriptor: rearrange the data first, then transform. */
static const ndrct_adt adt_before =
{
apply_before, mkcld_before, "rdft-indirect-before"
};
/*-----------------------------------------------------------------------*/
/* first transform, then rearrange */
/* Transform first, then rearrange: run the transform plan in place on I
   (overwriting the input), then run the copy plan from I to O. */
static void apply_after(const plan *ego_, R *I, R *O)
{
    const P *self = (const P *) ego_;
    plan_rdft *xform_plan = (plan_rdft *) self->cld;
    plan_rdft *copy_plan = (plan_rdft *) self->cldcpy;

    xform_plan->apply(self->cld, I, I);
    copy_plan->apply(self->cldcpy, I, O);
}
static problem *mkcld_after(const problem_rdft *p)
{
return X(mkproblem_rdft_d)(X(tensor_copy_inplace)(p->sz, INPLACE_IS),
X(tensor_copy_inplace)(p->vecsz, INPLACE_IS),
p->I, p->I, p->kind);
}
/* Variant descriptor: transform first, then rearrange the data. */
static const ndrct_adt adt_after =
{
apply_after, mkcld_after, "rdft-indirect-after"
};
/*-----------------------------------------------------------------------*/
/* Release both child plans (cld first, then cldcpy). */
static void destroy(plan *ego_)
{
    P *self = (P *) ego_;

    X(plan_destroy_internal)(self->cld);
    X(plan_destroy_internal)(self->cldcpy);
}
/* Forward the wakefulness change to both children (cldcpy, then cld). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
    P *self = (P *) ego_;

    X(plan_awake)(self->cldcpy, wakefulness);
    X(plan_awake)(self->cld, wakefulness);
}
/* Print the variant name followed by the cld and cldcpy child plans. */
static void print(const plan *ego_, printer *p)
{
    const P *self = (const P *) ego_;

    p->print(p, "(%s%(%p%)%(%p%))",
             self->slv->adt->nam, self->cld, self->cldcpy);
}
/* Basic applicability test.  The problem must have finite vector rank and
   a nontrivial transform (rank > 0), and must fall into one of three cases:
   in-place with strides that need rearranging; out-of-place from stride
   <= 2 to a bigger stride (apply_after only, input may be destroyed); or
   out-of-place from a bigger stride to stride <= 2 (apply_before only). */
static int applicable0(const solver *ego_, const problem *p_,
                       const planner *plnr)
{
    const S *ego = (const S *) ego_;
    const problem_rdft *p = (const problem_rdft *) p_;

    if (!FINITE_RNK(p->vecsz->rnk))
        return 0;

    /* problem must be a nontrivial transform, not just a copy */
    if (!(p->sz->rnk > 0))
        return 0;

    /* in-place & requiring some rearrangement of the data */
    if (p->I == p->O
        && !(X(tensor_inplace_strides2)(p->sz, p->vecsz)))
        return 1;

    /* out of place, stride 1/2 to bigger stride, for apply_after */
    if (p->I != p->O && ego->adt->apply == apply_after
        && !NO_DESTROY_INPUTP(plnr)
        && X(tensor_min_istride)(p->sz) <= 2
        && X(tensor_min_ostride)(p->sz) > 2)
        return 1;

    /* out of place, bigger stride to stride 1/2, for apply_before */
    if (p->I != p->O && ego->adt->apply == apply_before
        && X(tensor_min_ostride)(p->sz) <= 2
        && X(tensor_min_istride)(p->sz) > 2)
        return 1;

    return 0;
}
/* Full applicability test: applicable0 must hold, and when the planner
   sets NO_INDIRECT_OP_P only in-place problems are accepted. */
static int applicable(const solver *ego_, const problem *p_,
                      const planner *plnr)
{
    if (applicable0(ego_, p_, plnr)) {
        if (NO_INDIRECT_OP_P(plnr)) {
            const problem_rdft *p = (const problem_rdft *)p_;

            if (p->I != p->O)
                return 0;
        }
        return 1;
    }
    return 0;
}
/* Build an indirect plan: a rank-0 copy plan over the combined
   vecsz + sz tensor from I to O, plus the variant's in-place transform
   plan (planned with NO_BUFFERING).  Returns a null plan when the solver
   is not applicable or either child cannot be created. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
    static const plan_adt padt = {
        X(rdft_solve), awake, print, destroy
    };
    const problem_rdft *prob = (const problem_rdft *) p_;
    const S *slv = (const S *) ego_;
    plan *xform_plan = 0, *copy_plan = 0;
    P *result;

    if (!applicable(ego_, p_, plnr))
        return (plan *) 0;

    /* the copy plan is created first, then the transform plan */
    copy_plan = X(mkplan_d)(plnr,
                            X(mkproblem_rdft_0_d)(
                                X(tensor_append)(prob->vecsz, prob->sz),
                                prob->I, prob->O));
    if (copy_plan) {
        xform_plan = X(mkplan_f_d)(plnr, slv->adt->mkcld(prob),
                                   NO_BUFFERING, 0, 0);
        if (xform_plan) {
            result = MKPLAN_RDFT(P, &padt, slv->adt->apply);
            result->cld = xform_plan;
            result->cldcpy = copy_plan;
            result->slv = slv;
            X(ops_add)(&xform_plan->ops, &copy_plan->ops,
                       &result->super.super.ops);
            return &(result->super.super);
        }
    }

    /* failure: release whatever was created */
    X(plan_destroy_internal)(xform_plan);
    X(plan_destroy_internal)(copy_plan);
    return (plan *)0;
}
/* Create an indirect solver bound to the given variant descriptor. */
static solver *mksolver(const ndrct_adt *adt)
{
    static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
    S *result = MKSOLVER(S, &sadt);

    result->adt = adt;
    return &(result->super);
}
/* Register one indirect solver per variant: "before" first, then "after". */
void X(rdft_indirect_register)(planner *p)
{
    static const ndrct_adt *const adts[] = {
        &adt_before, &adt_after
    };
    const unsigned count = sizeof(adts) / sizeof(adts[0]);
    unsigned idx;

    for (idx = 0; idx != count; ++idx)
        REGISTER_SOLVER(p, mksolver(adts[idx]));
}

28
fftw-3.3.10/rdft/khc2c.c Normal file
View File

@@ -0,0 +1,28 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ct-hc2c.h"
/* Register an hc2c codelet with the planner by forwarding all arguments
   to X(regsolver_hc2c_direct). */
void X(khc2c_register)(planner *p, khc2c codelet, const hc2c_desc *desc,
                       hc2c_kind hc2ckind)
{
    X(regsolver_hc2c_direct)(p, codelet, desc, hc2ckind);
}

27
fftw-3.3.10/rdft/khc2hc.c Normal file
View File

@@ -0,0 +1,27 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/hc2hc.h"
/* Register an hc2hc codelet with the planner by forwarding all arguments
   to X(regsolver_hc2hc_direct). */
void X(khc2hc_register)(planner *p, khc2hc codelet, const hc2hc_desc *desc)
{
    X(regsolver_hc2hc_direct)(p, codelet, desc);
}

29
fftw-3.3.10/rdft/kr2c.c Normal file
View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Register an r2c codelet with the planner.  Three solvers are created
   from it, in this order: rdft r2c direct, rdft r2c directbuf, and
   rdft2 direct. */
void X(kr2c_register)(planner *p, kr2c codelet, const kr2c_desc *desc)
{
    REGISTER_SOLVER(p, X(mksolver_rdft_r2c_direct)(codelet, desc));

    REGISTER_SOLVER(p, X(mksolver_rdft_r2c_directbuf)(codelet, desc));

    REGISTER_SOLVER(p, X(mksolver_rdft2_direct)(codelet, desc));
}

27
fftw-3.3.10/rdft/kr2r.c Normal file
View File

@@ -0,0 +1,27 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Register an r2r codelet with the planner as a direct rdft r2r solver. */
void X(kr2r_register)(planner *p, kr2r codelet, const kr2r_desc *desc)
{
    REGISTER_SOLVER(p, X(mksolver_rdft_r2r_direct)(codelet, desc));
}

82
fftw-3.3.10/rdft/nop.c Normal file
View File

@@ -0,0 +1,82 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for vrank -infty RDFTs (nothing to do) */
#include "rdft/rdft.h"
/* No-op apply: nothing is read or written; the arguments are only marked
   as unused. */
static void apply(const plan *ego_, R *I, R *O)
{
    UNUSED(ego_);
    UNUSED(I);
    UNUSED(O);
    /* intentionally empty */
}
/* A problem is a no-op when its vector rank is RNK_MINFTY, or when it is a
   rank-0 in-place rdft with finite vector rank and in-place strides. */
static int applicable(const solver *ego_, const problem *p_)
{
    const problem_rdft *p = (const problem_rdft *) p_;

    UNUSED(ego_);

    /* case 1 : -infty vector rank */
    if (p->vecsz->rnk == RNK_MINFTY)
        return 1;

    /* case 2 : rank-0 in-place rdft */
    if (p->sz->rnk != 0)
        return 0;
    if (!FINITE_RNK(p->vecsz->rnk))
        return 0;
    if (!(p->O == p->I))
        return 0;
    return X(tensor_inplace_strides)(p->vecsz) ? 1 : 0;
}
/* Print the fixed plan description "(rdft-nop)". */
static void print(const plan *ego, printer *p)
{
    UNUSED(ego);

    p->print(p, "(rdft-nop)");
}
/* Create a no-op RDFT plan with zero operation counts, or return a null
   plan when the problem is not applicable.  The planner is not used. */
static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
{
    static const plan_adt padt = {
        X(rdft_solve), X(null_awake), print, X(plan_null_destroy)
    };

    UNUSED(plnr);

    if (applicable(ego, p)) {
        plan_rdft *result = MKPLAN_RDFT(plan_rdft, &padt, apply);

        X(ops_zero)(&result->super.ops);
        return &(result->super);
    }
    return (plan *) 0;
}
/* Create the no-op solver for RDFT problems. */
static solver *mksolver(void)
{
    static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };

    return MKSOLVER(solver, &sadt);
}
/* Register the RDFT no-op solver with the planner. */
void X(rdft_nop_register)(planner *p)
{
    REGISTER_SOLVER(p, mksolver());
}

90
fftw-3.3.10/rdft/nop2.c Normal file
View File

@@ -0,0 +1,90 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for vrank -infty RDFT2s (nothing to do), as well as in-place
rank-0 HC2R. Note that in-place rank-0 R2HC is *not* a no-op, because
we have to set the imaginary parts of the output to zero. */
#include "rdft/rdft.h"
/* No-op apply: nothing is read or written; the arguments are only marked
   as unused. */
static void apply(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
    UNUSED(ego_);
    UNUSED(r0);
    UNUSED(r1);
    UNUSED(cr);
    UNUSED(ci);
    /* intentionally empty */
}
/* A problem is a no-op when its vector rank is RNK_MINFTY, or when it is a
   rank-0 in-place (r0 == cr) problem of a kind other than R2HC with finite
   vector rank and in-place strides. */
static int applicable(const solver *ego_, const problem *p_)
{
    const problem_rdft2 *p = (const problem_rdft2 *) p_;

    UNUSED(ego_);

    /* case 1 : -infty vector rank */
    if (p->vecsz->rnk == RNK_MINFTY)
        return 1;

    /* case 2 : rank-0 in-place rdft, except that R2HC is not a no-op
       because it sets the imaginary part to 0 */
    if (!(p->kind != R2HC))
        return 0;
    if (p->sz->rnk != 0)
        return 0;
    if (!FINITE_RNK(p->vecsz->rnk))
        return 0;
    if (!(p->r0 == p->cr))
        return 0;
    return X(rdft2_inplace_strides)(p, RNK_MINFTY) ? 1 : 0;
}
/* Print the fixed plan description "(rdft2-nop)". */
static void print(const plan *ego, printer *p)
{
    UNUSED(ego);

    p->print(p, "(rdft2-nop)");
}
/* Create a no-op RDFT2 plan with zero operation counts, or return a null
   plan when the problem is not applicable.  The planner is not used. */
static plan *mkplan(const solver *ego, const problem *p, planner *plnr)
{
    static const plan_adt padt = {
        X(rdft2_solve), X(null_awake), print, X(plan_null_destroy)
    };

    UNUSED(plnr);

    if (applicable(ego, p)) {
        plan_rdft2 *result = MKPLAN_RDFT2(plan_rdft2, &padt, apply);

        X(ops_zero)(&result->super.ops);
        return &(result->super);
    }
    return (plan *) 0;
}
/* Create the no-op solver for RDFT2 problems. */
static solver *mksolver(void)
{
    static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };

    return MKSOLVER(solver, &sadt);
}
/* Register the RDFT2 no-op solver with the planner. */
void X(rdft2_nop_register)(planner *p)
{
    REGISTER_SOLVER(p, mksolver());
}

32
fftw-3.3.10/rdft/plan.c Normal file
View File

@@ -0,0 +1,32 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Allocate a plan of `size` bytes with the given adt, set its RDFT apply
   function, and return it as a generic plan pointer. */
plan *X(mkplan_rdft)(size_t size, const plan_adt *adt, rdftapply apply)
{
    plan_rdft *result = (plan_rdft *) X(mkplan)(size, adt);

    result->apply = apply;
    return &(result->super);
}

32
fftw-3.3.10/rdft/plan2.c Normal file
View File

@@ -0,0 +1,32 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Allocate a plan of `size` bytes with the given adt, set its RDFT2 apply
   function, and return it as a generic plan pointer. */
plan *X(mkplan_rdft2)(size_t size, const plan_adt *adt, rdft2apply apply)
{
    plan_rdft2 *result = (plan_rdft2 *) X(mkplan)(size, adt);

    result->apply = apply;
    return &(result->super);
}

238
fftw-3.3.10/rdft/problem.c Normal file
View File

@@ -0,0 +1,238 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
#include <stddef.h>
/* Free an RDFT problem: the separately allocated kind array (only when no
   struct-hack layout is in use), both tensors, and the problem itself. */
static void destroy(problem *ego_)
{
    problem_rdft *prob = (problem_rdft *) ego_;

#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
    X(ifree0)(prob->kind);
#endif
    X(tensor_destroy2)(prob->vecsz, prob->sz);
    X(ifree)(ego_);
}
/* Feed the first rnk entries of the kind array into the md5 state,
   in order. */
static void kind_hash(md5 *m, const rdft_kind *kind, int rnk)
{
    int idx = 0;

    while (idx < rnk) {
        X(md5int)(m, kind[idx]);
        ++idx;
    }
}
/* Hash an RDFT problem: the tag "rdft", the in-place flag, the transform
   kinds, the alignment of I and O, and both tensors, in that order. */
static void hash(const problem *p_, md5 *m)
{
    const problem_rdft *prob = (const problem_rdft *) p_;

    X(md5puts)(m, "rdft");
    X(md5int)(m, prob->I == prob->O);

    kind_hash(m, prob->kind, prob->sz->rnk);

    X(md5int)(m, X(ialignment_of)(prob->I));
    X(md5int)(m, X(ialignment_of)(prob->O));

    X(tensor_md5)(m, prob->sz);
    X(tensor_md5)(m, prob->vecsz);
}
/* Recursively store zero at every location of array I addressed by the
   rnk dimensions in dims (using input strides).  Rank RNK_MINFTY touches
   nothing; rank 0 zeroes the single element I[0]. */
static void recur(const iodim *dims, int rnk, R *I)
{
    INT i, n, is;

    if (rnk == RNK_MINFTY)
        return;

    if (rnk == 0) {
        I[0] = K(0.0);
        return;
    }

    if (!(rnk > 0))
        return;

    n = dims[0].n;
    is = dims[0].is;

    if (rnk != 1) {
        /* general case: recurse over the remaining dimensions */
        for (i = 0; i < n; ++i)
            recur(dims + 1, rnk - 1, I + i * is);
        return;
    }

    /* rank 1: this case is redundant but faster */
    for (i = 0; i < n; ++i)
        I[i * is] = K(0.0);
}
/* Zero every element of I addressed by tensor sz (input strides). */
void X(rdft_zerotens)(tensor *sz, R *I)
{
    const iodim *dims = sz->dims;
    int rnk = sz->rnk;

    recur(dims, rnk, I);
}
/* Size of each name slot in the table below, including the terminator. */
#define KSTR_LEN 8
/* Return the name of an rdft_kind by indexing a static table with the
   kind's integer value.  The assertion checks that kind is within the
   table. */
const char *X(rdft_kind_str)(rdft_kind kind)
{
static const char kstr[][KSTR_LEN] = {
"r2hc", "r2hc01", "r2hc10", "r2hc11",
"hc2r", "hc2r01", "hc2r10", "hc2r11",
"dht",
"redft00", "redft01", "redft10", "redft11",
"rodft00", "rodft01", "rodft10", "rodft11"
};
/* sizeof(kstr) / KSTR_LEN is the number of table entries */
A(kind >= 0 && kind < sizeof(kstr) / KSTR_LEN);
return kstr[kind];
}
/* Print an RDFT problem: the alignment of I, the offset O - I, both
   tensors, then each transform kind as an integer, and a closing
   parenthesis. */
static void print(const problem *ego_, printer *p)
{
    const problem_rdft *prob = (const problem_rdft *) ego_;
    int idx = 0;

    p->print(p, "(rdft %d %D %T %T",
             X(ialignment_of)(prob->I),
             (INT)(prob->O - prob->I),
             prob->sz,
             prob->vecsz);

    while (idx < prob->sz->rnk) {
        p->print(p, " %d", (int)prob->kind[idx]);
        ++idx;
    }

    p->print(p, ")");
}
/* Zero the input array of the problem over the combined vecsz + sz
   tensor.  The temporary combined tensor is destroyed afterwards. */
static void zero(const problem *ego_)
{
    const problem_rdft *prob = (const problem_rdft *) ego_;
    tensor *combined = X(tensor_append)(prob->vecsz, prob->sz);

    X(rdft_zerotens)(combined, UNTAINT(prob->I));
    X(tensor_destroy)(combined);
}
/* Operations table for RDFT problems, wiring up the hash, zero, print and
   destroy functions defined above. */
static const problem_adt padt =
{
PROBLEM_RDFT,
hash,
zero,
print,
destroy
};
/* Size-1 dimensions that are not REDFT/RODFT are no-ops and can be
   dropped.  REDFT/RODFT unit dimensions often carry factors of 2.0 and
   the like from normalization and phases (in principle such constant
   factors from different dimensions could be combined).  Returns nonzero
   when the dimension must be kept. */
static int nontrivial(const iodim *d, rdft_kind kind)
{
    if (d->n > 1)
        return 1;
    if (kind == R2HC11)
        return 1;
    if (kind == HC2R11)
        return 1;
    return (REODFT_KINDP(kind) && kind != REDFT01 && kind != RODFT01);
}
/* Create an RDFT problem from the transform tensor sz, the vector tensor
   vecsz, the arrays I and O, and one transform kind per dimension of sz.
   The input tensors are not destroyed.  Trivial dimensions are dropped,
   the remaining ones are sorted together with their kinds, and vecsz is
   compressed.  Returns an unsolvable problem when I == O but the tensors
   do not address the same locations in place. */
problem *X(mkproblem_rdft)(const tensor *sz, const tensor *vecsz,
R *I, R *O, const rdft_kind *kind)
{
problem_rdft *ego;
int rnk = sz->rnk;
int i;
A(X(tensor_kosherp)(sz));
A(X(tensor_kosherp)(vecsz));
A(FINITE_RNK(sz->rnk));
/* same underlying array: use one joined pointer for both */
if (UNTAINT(I) == UNTAINT(O))
I = O = JOIN_TAINT(I, O);
if (I == O && !X(tensor_inplace_locations)(sz, vecsz))
return X(mkproblem_unsolvable)();
/* first pass: count the nontrivial dimensions */
for (i = rnk = 0; i < sz->rnk; ++i) {
A(sz->dims[i].n > 0);
if (nontrivial(sz->dims + i, kind[i]))
++rnk;
}
/* allocate the problem; where the kind array lives depends on which
   struct-hack variant (if any) is configured */
#if defined(STRUCT_HACK_KR)
ego = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft)
+ sizeof(rdft_kind)
* (rnk > 0 ? rnk - 1u : 0u), &padt);
#elif defined(STRUCT_HACK_C99)
ego = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft)
+ sizeof(rdft_kind) * (unsigned)rnk, &padt);
#else
ego = (problem_rdft *) X(mkproblem)(sizeof(problem_rdft), &padt);
ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * (unsigned)rnk, PROBLEMS);
#endif
/* do compression and sorting as in X(tensor_compress), but take
transform kind into account (sigh) */
ego->sz = X(mktensor)(rnk);
/* second pass: copy the nontrivial dimensions and their kinds */
for (i = rnk = 0; i < sz->rnk; ++i) {
if (nontrivial(sz->dims + i, kind[i])) {
ego->kind[rnk] = kind[i];
ego->sz->dims[rnk++] = sz->dims[i];
}
}
/* sort dimensions by X(dimcmp), swapping the kinds along with them */
for (i = 0; i + 1 < rnk; ++i) {
int j;
for (j = i + 1; j < rnk; ++j)
if (X(dimcmp)(ego->sz->dims + i, ego->sz->dims + j) > 0) {
iodim dswap;
rdft_kind kswap;
dswap = ego->sz->dims[i];
ego->sz->dims[i] = ego->sz->dims[j];
ego->sz->dims[j] = dswap;
kswap = ego->kind[i];
ego->kind[i] = ego->kind[j];
ego->kind[j] = kswap;
}
}
/* canonicalize size-2 REDFT00 / DHT / HC2R dimensions to R2HC */
for (i = 0; i < rnk; ++i)
if (ego->sz->dims[i].n == 2 && (ego->kind[i] == REDFT00
|| ego->kind[i] == DHT
|| ego->kind[i] == HC2R))
ego->kind[i] = R2HC; /* size-2 transforms are equivalent */
ego->vecsz = X(tensor_compress_contiguous)(vecsz);
ego->I = I;
ego->O = O;
A(FINITE_RNK(ego->sz->rnk));
return &(ego->super);
}
/* Like X(mkproblem_rdft), but takes ownership of the two input tensors
   and destroys them after the problem has been created. */
problem *X(mkproblem_rdft_d)(tensor *sz, tensor *vecsz,
                             R *I, R *O, const rdft_kind *kind)
{
    problem *result = X(mkproblem_rdft)(sz, vecsz, I, O, kind);

    X(tensor_destroy2)(vecsz, sz);
    return result;
}
/* Variant of X(mkproblem_rdft) for rnk <= 1 only: takes a single scalar
   kind instead of an array of kinds.  The input tensors are not
   destroyed. */
problem *X(mkproblem_rdft_1)(const tensor *sz, const tensor *vecsz,
                             R *I, R *O, rdft_kind kind)
{
    const rdft_kind *kinds = &kind;

    A(sz->rnk <= 1);
    return X(mkproblem_rdft)(sz, vecsz, I, O, kinds);
}
/* Variant of X(mkproblem_rdft_d) for rnk <= 1 only: takes a single scalar
   kind instead of an array of kinds.  The input tensors are destroyed. */
problem *X(mkproblem_rdft_1_d)(tensor *sz, tensor *vecsz,
                               R *I, R *O, rdft_kind kind)
{
    const rdft_kind *kinds = &kind;

    A(sz->rnk <= 1);
    return X(mkproblem_rdft_d)(sz, vecsz, I, O, kinds);
}
/* Create a zero-dimensional RDFT problem over vecsz.  A fresh rank-0
   transform tensor is used and a null kind array is passed, since there
   are no transform dimensions.  vecsz is destroyed. */
problem *X(mkproblem_rdft_0_d)(tensor *vecsz, R *I, R *O)
{
    const rdft_kind *no_kinds = (const rdft_kind *)0;

    return X(mkproblem_rdft_d)(X(mktensor_0d)(), vecsz, I, O, no_kinds);
}

224
fftw-3.3.10/rdft/problem2.c Normal file
View File

@@ -0,0 +1,224 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "dft/dft.h"
#include "rdft/rdft.h"
#include <stddef.h>
/* Free an RDFT2 problem: both tensors, then the problem itself. */
static void destroy(problem *ego_)
{
    problem_rdft2 *prob = (problem_rdft2 *) ego_;

    X(tensor_destroy2)(prob->vecsz, prob->sz);
    X(ifree)(ego_);
}
/* Hash an RDFT2 problem: the tag "rdft2", the in-place flag (r0 == cr),
   the offsets r1 - r0 and ci - cr, the alignment of all four pointers,
   the kind, and both tensors, in that order. */
static void hash(const problem *p_, md5 *m)
{
    const problem_rdft2 *prob = (const problem_rdft2 *) p_;

    X(md5puts)(m, "rdft2");
    X(md5int)(m, prob->r0 == prob->cr);

    X(md5INT)(m, prob->r1 - prob->r0);
    X(md5INT)(m, prob->ci - prob->cr);

    X(md5int)(m, X(ialignment_of)(prob->r0));
    X(md5int)(m, X(ialignment_of)(prob->r1));
    X(md5int)(m, X(ialignment_of)(prob->cr));
    X(md5int)(m, X(ialignment_of)(prob->ci));

    X(md5int)(m, prob->kind);

    X(tensor_md5)(m, prob->sz);
    X(tensor_md5)(m, prob->vecsz);
}
/* Print an RDFT2 problem: the in-place flag (cr == r0), the kind as an
   integer, and both tensors. */
static void print(const problem *ego_, printer *p)
{
    const problem_rdft2 *prob = (const problem_rdft2 *) ego_;
    const int in_place = (int)(prob->cr == prob->r0);
    const int kind_code = (int)(prob->kind);

    p->print(p, "(rdft2 %d %d %T %T)",
             in_place, kind_code, prob->sz, prob->vecsz);
}
/* Recursively zero the real data addressed by dims (input strides), where
   the innermost dimension alternates between the two arrays I0 and I1.
   Rank RNK_MINFTY touches nothing; rank 0 zeroes I0[0] only. */
static void recur(const iodim *dims, int rnk, R *I0, R *I1)
{
    INT i, n, is;

    if (rnk == RNK_MINFTY)
        return;

    if (rnk == 0) {
        I0[0] = K(0.0);
        return;
    }

    if (!(rnk > 0))
        return;

    n = dims[0].n;
    is = dims[0].is;

    if (rnk != 1) {
        /* outer dimensions: recurse with both pointers advanced alike */
        for (i = 0; i < n; ++i)
            recur(dims + 1, rnk - 1, I0 + i * is, I1 + i * is);
        return;
    }

    /* innermost dimension: zero elements in (I0, I1) pairs */
    for (i = 0; i < n - 1; i += 2) {
        *I0 = *I1 = K(0.0);
        I0 += is;
        I1 += is;
    }

    /* odd n: one last element remains in I0 */
    if (i < n)
        *I0 = K(0.0);
}
/* Recurse over the vector dimensions vdims (input strides); once they are
   exhausted, zero one transform's worth of data with recur.  Vector rank
   RNK_MINFTY touches nothing. */
static void vrecur(const iodim *vdims, int vrnk,
                   const iodim *dims, int rnk, R *I0, R *I1)
{
    INT i, n, is;

    if (vrnk == RNK_MINFTY)
        return;

    if (vrnk == 0) {
        recur(dims, rnk, I0, I1);
        return;
    }

    if (!(vrnk > 0))
        return;

    n = vdims[0].n;
    is = vdims[0].is;

    for (i = 0; i < n; ++i)
        vrecur(vdims + 1, vrnk - 1,
               dims, rnk, I0 + i * is, I1 + i * is);
}
/* Number of complex elements corresponding to real_n real elements:
   real_n / 2 + 1 for R2HC and HC2R, (real_n + 1) / 2 for R2HCII and
   HC2RIII.  Any other kind trips the assertion and yields 0. */
INT X(rdft2_complex_n)(INT real_n, rdft_kind kind)
{
    if (kind == R2HC || kind == HC2R)
        return (real_n / 2) + 1;

    if (kind == R2HCII || kind == HC2RIII)
        return (real_n + 1) / 2;

    /* can't happen */
    A(0);
    return 0;
}
/* Zero the input of an RDFT2 problem.  For R2HC-type kinds the real
   arrays r0/r1 are zeroed; otherwise the complex arrays cr/ci are zeroed
   over a tensor whose last transform dimension is reduced to the complex
   length. */
static void zero(const problem *ego_)
{
    const problem_rdft2 *prob = (const problem_rdft2 *) ego_;
    tensor *csz, *combined;
    int rnk;

    if (R2HC_KINDP(prob->kind)) {
        /* FIXME: can we avoid the double recursion somehow? */
        vrecur(prob->vecsz->dims, prob->vecsz->rnk,
               prob->sz->dims, prob->sz->rnk,
               UNTAINT(prob->r0), UNTAINT(prob->r1));
        return;
    }

    csz = X(tensor_copy)(prob->sz);
    rnk = csz->rnk;
    if (rnk > 0) {
        /* ~half as many complex outputs */
        csz->dims[rnk-1].n =
            X(rdft2_complex_n)(csz->dims[rnk-1].n, prob->kind);
    }

    combined = X(tensor_append)(prob->vecsz, csz);
    X(tensor_destroy)(csz);

    X(dft_zerotens)(combined, UNTAINT(prob->cr), UNTAINT(prob->ci));
    X(tensor_destroy)(combined);
}
/* Method table attached to every rdft2 problem created by
   X(mkproblem_rdft2): the problem-kind tag followed by the hash, zero,
   print and destroy routines. */
static const problem_adt padt =
{
PROBLEM_RDFT2,
hash,
zero,
print,
destroy
};
/* Create an rdft2 problem.  sz and vecsz are not consumed: compressed
   copies are stored in the problem.  Returns an unsolvable problem when
   r0 aliases ci. */
problem *X(mkproblem_rdft2)(const tensor *sz, const tensor *vecsz,
                            R *r0, R *r1, R *cr, R *ci,
                            rdft_kind kind)
{
     problem_rdft2 *prb;

     A(kind == R2HC || kind == R2HCII || kind == HC2R || kind == HC2RIII);
     A(X(tensor_kosherp)(sz));
     A(X(tensor_kosherp)(vecsz));
     A(FINITE_RNK(sz->rnk));

     /* require in-place problems to use r0 == cr */
     if (UNTAINT(r0) == UNTAINT(ci))
          return X(mkproblem_unsolvable)();

     /* FIXME: should check UNTAINT(r1) == UNTAINT(cr) but
        only if odd elements exist, which requires compressing the
        tensors first */

     if (UNTAINT(r0) == UNTAINT(cr)) {
          R *joined = JOIN_TAINT(r0, cr);
          r0 = joined;
          cr = joined;
     }

     prb = (problem_rdft2 *)X(mkproblem)(sizeof(problem_rdft2), &padt);

     if (sz->rnk <= 1) {
          prb->sz = X(tensor_compress)(sz);
     } else {
          /* have to compress rnk-1 dims separately, ugh */
          const int last = sz->rnk - 1;
          tensor *outer = X(tensor_copy_except)(sz, last);
          tensor *inner = X(tensor_copy_sub)(sz, last, 1);
          tensor *outer_c = X(tensor_compress)(outer);

          if (outer_c->rnk > 0)
               prb->sz = X(tensor_append)(outer_c, inner);
          else
               prb->sz = X(tensor_compress)(inner);

          X(tensor_destroy2)(outer, inner);
          X(tensor_destroy)(outer_c);
     }

     prb->vecsz = X(tensor_compress_contiguous)(vecsz);
     prb->r0 = r0;
     prb->r1 = r1;
     prb->cr = cr;
     prb->ci = ci;
     prb->kind = kind;

     A(FINITE_RNK(prb->sz->rnk));
     return &(prb->super);
}
/* Variant of X(mkproblem_rdft2) that takes ownership of sz and vecsz:
   both tensors are destroyed before returning. */
problem *X(mkproblem_rdft2_d)(tensor *sz, tensor *vecsz,
                              R *r0, R *r1, R *cr, R *ci, rdft_kind kind)
{
     problem *result;

     result = X(mkproblem_rdft2)(sz, vecsz, r0, r1, cr, ci, kind);
     X(tensor_destroy2)(vecsz, sz);
     return result;
}
/* Same as X(mkproblem_rdft2_d), but the caller supplies a single real
   pointer r0.  r1 is derived as r0 plus the real-side stride of the last
   transform dimension, and that stride is doubled in sz (which is
   modified and then destroyed).  Used by the API. */
problem *X(mkproblem_rdft2_d_3pointers)(tensor *sz, tensor *vecsz,
                                        R *r0, R *cr, R *ci, rdft_kind kind)
{
     R *r1 = r0;
     const int rnk = sz->rnk;

     if (rnk != 0) {
          if (R2HC_KINDP(kind)) {
               /* real data is the input: split along the input stride */
               r1 = r0 + sz->dims[rnk-1].is;
               sz->dims[rnk-1].is *= 2;
          } else {
               /* real data is the output: split along the output stride */
               r1 = r0 + sz->dims[rnk-1].os;
               sz->dims[rnk-1].os *= 2;
          }
     }

     return X(mkproblem_rdft2_d)(sz, vecsz, r0, r1, cr, ci, kind);
}

View File

@@ -0,0 +1,238 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for RDFT2 of rank >= 2 (multidimensional) */
#include "rdft/rdft.h"
#include "dft/dft.h"
/* Solver for rank >= 2 rdft2 problems.  The three extra fields are
   filled in by mksolver() and handed to X(pickdim) in picksplit(). */
typedef struct {
solver super;
int spltrnk; /* this solver's split choice (one entry of buddies) */
const int *buddies; /* all split choices registered together */
size_t nbuddies; /* number of entries in buddies */
} S;
typedef struct {
plan_dft super;
plan *cldr, *cldc;
const S *solver;
} P;
/* Forward transform: run the rdft2 child first, then the complex DFT
   child in place on (cr, ci). */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_rdft2 *real_step = (plan_rdft2 *) self->cldr;
     plan_dft *cplx_step = (plan_dft *) self->cldc;

     real_step->apply((plan *) real_step, r0, r1, cr, ci);
     cplx_step->apply((plan *) cplx_step, cr, ci, cr, ci);
}
/* Backward transform: run the complex DFT child in place with real and
   imaginary pointers swapped, then the rdft2 child. */
static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_dft *cplx_step = (plan_dft *) self->cldc;
     plan_rdft2 *real_step = (plan_rdft2 *) self->cldr;

     cplx_step->apply((plan *) cplx_step, ci, cr, ci, cr);
     real_step->apply((plan *) real_step, r0, r1, cr, ci);
}
/* Forward the wakefulness change to both child plans (cldr, then cldc). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[2];
     int k;

     kids[0] = self->cldr;
     kids[1] = self->cldc;
     for (k = 0; k < 2; ++k)
          X(plan_awake)(kids[k], wakefulness);
}
/* Release both child plans (cldr, then cldc). */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *real_child = self->cldr;
     plan *cplx_child = self->cldc;

     X(plan_destroy_internal)(real_child);
     X(plan_destroy_internal)(cplx_child);
}
/* Print the plan name, the solver's split choice and both children. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const int split = self->solver->spltrnk;

     p->print(p, "(rdft2-rank>=2/%d%(%p%)%(%p%))",
              split, self->cldr, self->cldc);
}
/* Ask X(pickdim) for a split dimension and store the resulting split
   rank in *rp.  Returns 1 only if a dimension was picked and the split
   rank is strictly below sz->rnk. */
static int picksplit(const S *ego, const tensor *sz, int *rp)
{
     int picked;

     A(sz->rnk > 1); /* cannot split rnk <= 1 */

     picked = X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies,
                         sz, 1, rp);
     if (picked) {
          *rp += 1; /* convert from dim. index to rank */
          return (*rp < sz->rnk); /* split must reduce rank */
     }
     return 0;
}
/* Structural applicability test: finite ranks, kind R2HC or HC2R,
   transform rank >= 2, a valid split (stored in *rp), and either an
   in-place problem or an out-of-place one that is allowed to run
   (HC2R destroys its input). */
static int applicable0(const solver *ego_, const problem *p_, int *rp,
                       const planner *plnr)
{
     const problem_rdft2 *p = (const problem_rdft2 *) p_;
     const S *ego = (const S *)ego_;

     if (!FINITE_RNK(p->sz->rnk) || !FINITE_RNK(p->vecsz->rnk))
          return 0;

     /* FIXME: multidimensional R2HCII ? */
     if (p->kind != R2HC && p->kind != HC2R)
          return 0;

     if (p->sz->rnk < 2)
          return 0;

     if (!picksplit(ego, p->sz, rp))
          return 0;

     /* FIXME: what are sufficient conditions for inplace? */
     if (p->r0 == p->cr)
          return 1;

     /* can work out-of-place, but HC2R destroys input */
     return (p->kind == R2HC || !NO_DESTROY_INPUTP(plnr));
}
/* TODO: revise this. */
/* Full applicability test: applicable0() plus the planner-flag
   restrictions (NO_RANK_SPLITS, NO_UGLY). */
static int applicable(const solver *ego_, const problem *p_,
                      const planner *plnr, int *rp)
{
     const S *ego = (const S *)ego_;
     const problem_rdft2 *p;

     if (!applicable0(ego_, p_, rp, plnr))
          return 0;

     /* with NO_RANK_SPLITS only the first buddy may be used */
     if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0]))
          return 0;

     if (!NO_UGLYP(plnr))
          return 1;

     /* Heuristic: if the vector stride is greater than the transform
        size, don't use (prefer to do the vector loop first with a
        vrank-geq1 plan). */
     p = (const problem_rdft2 *) p_;
     if (p->vecsz->rnk <= 0)
          return 1;

     return (X(tensor_min_stride)(p->vecsz)
             <= X(rdft2_tensor_max_index)(p->sz, p->kind));
}
/* Build a plan for a rank >= 2 rdft2 problem from two children: a
   lower-rank rdft2 plan (cldr) over sz2 and an in-place complex DFT plan
   (cldc) over sz1.  Returns 0 when the solver is not applicable or when
   either child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_rdft2 *p;
P *pln;
plan *cldr = 0, *cldc = 0;
tensor *sz1, *sz2, *vecszi, *sz2i;
int spltrnk;
inplace_kind k;
problem *cldp;
static const plan_adt padt = {
X(rdft2_solve), awake, print, destroy
};
if (!applicable(ego_, p_, plnr, &spltrnk))
return (plan *) 0;
p = (const problem_rdft2 *) p_;
/* split the transform dimensions at spltrnk into sz1 and sz2 */
X(tensor_split)(p->sz, &sz1, spltrnk, &sz2);
/* the complex child works in place on (cr, ci); those arrays are the
   output side for R2HC and the input side for HC2R */
k = p->kind == R2HC ? INPLACE_OS : INPLACE_IS;
vecszi = X(tensor_copy_inplace)(p->vecsz, k);
sz2i = X(tensor_copy_inplace)(sz2, k);
/* complex data is ~half of real */
sz2i->dims[sz2i->rnk - 1].n = sz2i->dims[sz2i->rnk - 1].n/2 + 1;
/* rdft2 child: transform over sz2, looping over vecsz and sz1 */
cldr = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(X(tensor_copy)(sz2),
X(tensor_append)(p->vecsz, sz1),
p->r0, p->r1,
p->cr, p->ci, p->kind));
if (!cldr) goto nada;
/* complex child: in-place DFT over sz1, looping over vecsz and the
   halved sz2 */
if (p->kind == R2HC)
cldp = X(mkproblem_dft_d)(X(tensor_copy_inplace)(sz1, k),
X(tensor_append)(vecszi, sz2i),
p->cr, p->ci, p->cr, p->ci);
else /* HC2R must swap re/im parts to get IDFT */
cldp = X(mkproblem_dft_d)(X(tensor_copy_inplace)(sz1, k),
X(tensor_append)(vecszi, sz2i),
p->ci, p->cr, p->ci, p->cr);
cldc = X(mkplan_d)(plnr, cldp);
if (!cldc) goto nada;
pln = MKPLAN_RDFT2(P, &padt, p->kind == R2HC ? apply_r2hc : apply_hc2r);
pln->cldr = cldr;
pln->cldc = cldc;
pln->solver = ego;
/* the plan's cost is the sum of the two children's costs */
X(ops_add)(&cldr->ops, &cldc->ops, &pln->super.super.ops);
X(tensor_destroy4)(sz2i, vecszi, sz2, sz1);
return &(pln->super.super);
nada:
/* failure path: a child may still be 0 here, so
   X(plan_destroy_internal) is presumably null-tolerant -- TODO confirm */
X(plan_destroy_internal)(cldr);
X(plan_destroy_internal)(cldc);
X(tensor_destroy4)(sz2i, vecszi, sz2, sz1);
return (plan *) 0;
}
/* Allocate a solver for split choice spltrnk; buddies/nbuddies is the
   shared list of all split choices (stored by reference, not copied). */
static solver *mksolver(int spltrnk, const int *buddies, size_t nbuddies)
{
     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->nbuddies = nbuddies;
     self->buddies = buddies;
     self->spltrnk = spltrnk;
     return &(self->super);
}
/* Register one solver per entry of the buddies table with planner p. */
void X(rdft2_rank_geq2_register)(planner *p)
{
     static const int buddies[] = { 1, 0, -2 };
     size_t k = 0;

     while (k < NELEM(buddies)) {
          REGISTER_SOLVER(p, mksolver(buddies[k], buddies, NELEM(buddies)));
          ++k;
     }

     /* FIXME: Should we try more buddies? See also dft/rank-geq2. */
}

View File

@@ -0,0 +1,207 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for RDFT of rank >= 2 (multidimensional) */
/* FIXME: this solver cannot strictly be applied to multidimensional
DHTs, since the latter are not separable...up to rnk-1 additional
post-processing passes may be required. See also:
R. N. Bracewell, O. Buneman, H. Hao, and J. Villasenor, "Fast
two-dimensional Hartley transform," Proc. IEEE 74, 1282-1283 (1986).
H. Hao and R. N. Bracewell, "A three-dimensional DFT algorithm
using the fast Hartley transform," Proc. IEEE 75(2), 264-266 (1987).
*/
#include "rdft/rdft.h"
/* Solver for rank >= 2 rdft problems.  The three extra fields are
   filled in by mksolver() and handed to X(pickdim) in picksplit(). */
typedef struct {
solver super;
int spltrnk; /* this solver's split choice (one entry of buddies) */
const int *buddies; /* all split choices registered together */
size_t nbuddies; /* number of entries in buddies */
} S;
/* Plan for rank >= 2 rdft problems: two lower-rank child plans applied
   one after the other by apply(). */
typedef struct {
plan_rdft super;
plan *cld1, *cld2; /* cld1 runs I -> O, cld2 then runs in place on O */
const S *solver; /* solver that created this plan (used by print) */
} P;
/* Multi-dimensional RDFT as two passes of lower-rank RDFTs: the first
   child maps I to O, the second child then works in place on O. */
static void apply(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     plan_rdft *first = (plan_rdft *) self->cld1;
     plan_rdft *second;

     first->apply(self->cld1, I, O);

     second = (plan_rdft *) self->cld2;
     second->apply(self->cld2, O, O);
}
/* Forward the wakefulness change to both child plans (cld1, then cld2). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *kids[2];
     int k;

     kids[0] = self->cld1;
     kids[1] = self->cld2;
     for (k = 0; k < 2; ++k)
          X(plan_awake)(kids[k], wakefulness);
}
/* Release both child plans (cld2 first, then cld1). */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *second = self->cld2;
     plan *first = self->cld1;

     X(plan_destroy_internal)(second);
     X(plan_destroy_internal)(first);
}
/* Print the plan name, the solver's split choice and both children. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const int split = self->solver->spltrnk;

     p->print(p, "(rdft-rank>=2/%d%(%p%)%(%p%))",
              split, self->cld1, self->cld2);
}
/* Ask X(pickdim) for a split dimension and store the resulting split
   rank in *rp.  Returns 1 only if a dimension was picked and the split
   rank is strictly below sz->rnk. */
static int picksplit(const S *ego, const tensor *sz, int *rp)
{
     int picked;

     A(sz->rnk > 1); /* cannot split rnk <= 1 */

     picked = X(pickdim)(ego->spltrnk, ego->buddies, ego->nbuddies,
                         sz, 1, rp);
     if (picked) {
          *rp += 1; /* convert from dim. index to rank */
          return (*rp < sz->rnk); /* split must reduce rank */
     }
     return 0;
}
/* Structural applicability test: finite ranks, transform rank >= 2 and
   a valid split (stored in *rp). */
static int applicable0(const solver *ego_, const problem *p_, int *rp)
{
     const problem_rdft *p = (const problem_rdft *) p_;
     const S *ego = (const S *)ego_;

     if (!FINITE_RNK(p->sz->rnk) || !FINITE_RNK(p->vecsz->rnk))
          return 0;
     if (p->sz->rnk < 2)
          return 0;
     return picksplit(ego, p->sz, rp) ? 1 : 0;
}
/* TODO: revise this. */
/* Full applicability test: applicable0() plus the planner-flag
   restrictions (NO_RANK_SPLITS, NO_UGLY). */
static int applicable(const solver *ego_, const problem *p_,
                      const planner *plnr, int *rp)
{
     const S *ego = (const S *)ego_;
     const problem_rdft *p;

     if (!applicable0(ego_, p_, rp))
          return 0;

     /* with NO_RANK_SPLITS only the first buddy may be used */
     if (NO_RANK_SPLITSP(plnr) && (ego->spltrnk != ego->buddies[0]))
          return 0;

     if (!NO_UGLYP(plnr))
          return 1;

     /* Heuristic: if the vector stride is greater than the transform
        sz, don't use (prefer to do the vector loop first with a
        vrank-geq1 plan). */
     p = (const problem_rdft *) p_;
     if (p->vecsz->rnk <= 0)
          return 1;

     return (X(tensor_min_stride)(p->vecsz) <= X(tensor_max_index)(p->sz));
}
/* Build a plan for a rank >= 2 rdft problem from two lower-rank rdft
   children: cld1 transforms over sz2 from I to O, cld2 transforms over
   sz1 in place on O.  Returns 0 when the solver is not applicable or
   when either child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_rdft *p;
P *pln;
plan *cld1 = 0, *cld2 = 0;
tensor *sz1, *sz2, *vecszi, *sz2i;
int spltrnk;
static const plan_adt padt = {
X(rdft_solve), awake, print, destroy
};
if (!applicable(ego_, p_, plnr, &spltrnk))
return (plan *) 0;
p = (const problem_rdft *) p_;
/* split the transform dimensions at spltrnk into sz1 and sz2 */
X(tensor_split)(p->sz, &sz1, spltrnk, &sz2);
/* in-place versions (output strides) for the second pass, which
   operates on O only */
vecszi = X(tensor_copy_inplace)(p->vecsz, INPLACE_OS);
sz2i = X(tensor_copy_inplace)(sz2, INPLACE_OS);
/* first pass: transform over sz2, looping over vecsz and sz1; the
   kind array is advanced by spltrnk entries for this child */
cld1 = X(mkplan_d)(plnr,
X(mkproblem_rdft_d)(X(tensor_copy)(sz2),
X(tensor_append)(p->vecsz, sz1),
p->I, p->O, p->kind + spltrnk));
if (!cld1) goto nada;
/* second pass: transform over sz1 in place on O, looping over vecsz
   and sz2 */
cld2 = X(mkplan_d)(plnr,
X(mkproblem_rdft_d)(
X(tensor_copy_inplace)(sz1, INPLACE_OS),
X(tensor_append)(vecszi, sz2i),
p->O, p->O, p->kind));
if (!cld2) goto nada;
pln = MKPLAN_RDFT(P, &padt, apply);
pln->cld1 = cld1;
pln->cld2 = cld2;
pln->solver = ego;
/* the plan's cost is the sum of the two children's costs */
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
X(tensor_destroy4)(sz2, sz1, vecszi, sz2i);
return &(pln->super.super);
nada:
/* failure path: a child may still be 0 here, so
   X(plan_destroy_internal) is presumably null-tolerant -- TODO confirm */
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cld1);
X(tensor_destroy4)(sz2, sz1, vecszi, sz2i);
return (plan *) 0;
}
/* Allocate a solver for split choice spltrnk; buddies/nbuddies is the
   shared list of all split choices (stored by reference, not copied). */
static solver *mksolver(int spltrnk, const int *buddies, size_t nbuddies)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     self->nbuddies = nbuddies;
     self->buddies = buddies;
     self->spltrnk = spltrnk;
     return &(self->super);
}
/* Register one solver per entry of the buddies table with planner p. */
void X(rdft_rank_geq2_register)(planner *p)
{
     static const int buddies[] = { 1, 0, -2 };
     size_t k = 0;

     while (k < NELEM(buddies)) {
          REGISTER_SOLVER(p, mksolver(buddies[k], buddies, NELEM(buddies)));
          ++k;
     }

     /* FIXME: Should we try more buddies? See also dft/rank-geq2. */
}

View File

@@ -0,0 +1,199 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for rank-0 RDFT2 (copy operations, plus setting 0 imag. parts) */
#include "rdft/rdft.h"
#ifdef HAVE_STRING_H
#include <string.h> /* for memcpy() */
#endif
/* Solver for rank-0 rdft2 problems; it carries no state beyond the
   generic solver header. */
typedef struct {
solver super;
} S;
typedef struct {
plan_rdft super;
INT vl;
INT ivs, ovs;
plan *cldcpy;
} P;
/* A rank-0 problem is handled when it is HC2R, or when it is R2HC with
   vector rank <= 1 and is either out of place or passes
   X(rdft2_inplace_strides). */
static int applicable(const problem *p_)
{
     const problem_rdft2 *p = (const problem_rdft2 *) p_;

     if (p->sz->rnk != 0)
          return 0;
     if (p->kind == HC2R)
          return 1;
     if (p->kind != R2HC)
          return 0;
     if (p->vecsz->rnk > 1)
          return 0;
     if (p->r0 != p->cr)
          return 1;
     return X(rdft2_inplace_strides)(p, RNK_MINFTY) ? 1 : 0;
}
/* Out-of-place rank-0 R2HC: copy vl real inputs to cr and store zero in
   the matching ci slots.  Processed four elements at a time (all four
   loads before the stores), then one at a time for the remainder. */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     const INT in_step = self->ivs, out_step = self->ovs;
     INT left = self->vl;

     UNUSED(r1); /* rank-0 has no real odd-index elements */

     for (; left >= 4; left -= 4) {
          R a, b, c, d;

          a = *r0; r0 += in_step;
          b = *r0; r0 += in_step;
          c = *r0; r0 += in_step;
          d = *r0; r0 += in_step;

          *cr = a; cr += out_step;
          *ci = K(0.0); ci += out_step;
          *cr = b; cr += out_step;
          *ci = K(0.0); ci += out_step;
          *cr = c; cr += out_step;
          *ci = K(0.0); ci += out_step;
          *cr = d; cr += out_step;
          *ci = K(0.0); ci += out_step;
     }

     for (; left > 0; --left) {
          R a = *r0;

          r0 += in_step;
          *cr = a; cr += out_step;
          *ci = K(0.0); ci += out_step;
     }
}
/* In-place rank-0 R2HC: the real parts are already where they belong,
   so only the vl imaginary outputs are set to zero. */
static void apply_r2hc_inplace(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     const INT out_step = self->ovs;
     INT left = self->vl;

     UNUSED(r0); UNUSED(r1); UNUSED(cr);

     for (; left >= 4; left -= 4) {
          *ci = K(0.0); ci += out_step;
          *ci = K(0.0); ci += out_step;
          *ci = K(0.0); ci += out_step;
          *ci = K(0.0); ci += out_step;
     }

     for (; left > 0; --left) {
          *ci = K(0.0);
          ci += out_step;
     }
}
/* Rank-0 HC2R is a plain copy from cr to r0, delegated to the rank-0
   rdft child plan; ci and r1 are not touched. */
static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *self = (const P *) ego_;
     plan_rdft *copier = (plan_rdft *) self->cldcpy;

     UNUSED(r1);
     UNUSED(ci);
     copier->apply((plan *) copier, cr, r0);
}
/* Forward the wakefulness change to the copy child, if there is one. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *copier = self->cldcpy;

     if (!copier)
          return;
     X(plan_awake)(copier, wakefulness);
}
/* Release the copy child, if there is one. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *copier = self->cldcpy;

     if (!copier)
          return;
     X(plan_destroy_internal)(copier);
}
/* Print the plan: plans with a copy child are the HC2R flavour, the
   others are R2HC and show their vector length. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;

     if (!self->cldcpy) {
          p->print(p, "(rdft2-r2hc-rank0%v)", self->vl);
          return;
     }
     p->print(p, "(rdft2-hc2r-rank0%(%p%))", self->cldcpy);
}
/* Build a rank-0 rdft2 plan.  HC2R plans wrap a rank-0 rdft copy child
   (cr -> r0) and inherit its cost; R2HC plans loop over the single
   vector dimension themselves.  Returns 0 when not applicable or when
   the copy child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft2_solve), awake, print, destroy
     };
     const problem_rdft2 *p;
     plan *copier = (plan *) 0;
     P *pln;
     int forward;

     UNUSED(ego_);

     if (!applicable(p_))
          return (plan *) 0;

     p = (const problem_rdft2 *) p_;
     forward = (p->kind == R2HC);

     if (p->kind == HC2R) {
          problem *cpy = X(mkproblem_rdft_0_d)(X(tensor_copy)(p->vecsz),
                                               p->cr, p->r0);
          copier = X(mkplan_d)(plnr, cpy);
          if (!copier)
               return (plan *) 0;
     }

     if (forward)
          pln = MKPLAN_RDFT2(P, &padt,
                             p->r0 == p->cr ? apply_r2hc_inplace : apply_r2hc);
     else
          pln = MKPLAN_RDFT2(P, &padt, apply_hc2r);

     pln->cldcpy = copier;

     if (forward) {
          X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
          /* vl loads, 2*vl stores */
          X(ops_other)(3 * pln->vl, &pln->super.super.ops);
     } else {
          pln->super.super.ops = copier->ops;
     }

     return &(pln->super.super);
}
/* Allocate the (stateless) rank-0 rdft2 solver. */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
     S *self;

     self = MKSOLVER(S, &sadt);
     return &(self->super);
}
/* Register the rank-0 rdft2 solver with planner p. */
void X(rdft2_rank0_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}

381
fftw-3.3.10/rdft/rank0.c Normal file
View File

@@ -0,0 +1,381 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for rank-0 RDFTs (copy operations) */
#include "rdft/rdft.h"
#ifdef HAVE_STRING_H
#include <string.h> /* for memcpy() */
#endif
/* Upper bound on the number of non-contiguous vector dimensions a plan
   can hold; fill_iodim() fails when a problem needs more. */
#define MAXRNK 32 /* FIXME: should malloc() */
/* Plan for rank-0 rdft problems (copies and transposes). */
typedef struct {
plan_rdft super;
INT vl; /* length of the extracted contiguous dimension, or 1 */
int rnk; /* number of valid entries in d[] */
iodim d[MAXRNK]; /* remaining vector dimensions, in problem order */
const char *nam; /* solver name shown by print() */
} P;
/* Solver for rank-0 rdft problems: one instance per copy/transpose
   strategy, filled in from the table in X(rdft_rank0_register). */
typedef struct {
solver super;
rdftapply apply; /* strategy's apply routine, installed in the plan */
int (*applicable)(const P *pln, const problem_rdft *p); /* strategy's test */
const char *nam; /* strategy name, copied into the plan */
} S;
/* Copy the problem's vector dimensions into the plan.  The first
   dimension with unit input and output stride (seen while vl is still 1)
   is stored as pln->vl instead of being copied; everything else goes to
   pln->d[].  Returns 0 if more than MAXRNK dimensions would be needed,
   1 otherwise. */
static int fill_iodim(P *pln, const problem_rdft *p)
{
     const tensor *vecsz = p->vecsz;
     const iodim *dim = vecsz->dims;
     int k = 0;

     pln->vl = 1;
     pln->rnk = 0;

     while (k < vecsz->rnk) {
          /* extract contiguous dimensions */
          if (pln->vl == 1 && dim->is == 1 && dim->os == 1) {
               pln->vl = dim->n;
          } else {
               if (pln->rnk == MAXRNK)
                    return 0;
               pln->d[pln->rnk] = *dim;
               pln->rnk += 1;
          }
          ++k;
          ++dim;
     }

     return 1;
}
/* Higher-rank copy driver: loops over the leading dimensions and hands
   the innermost two to cpy2d(), which does the real work. */
static void copy(const iodim *d, int rnk, INT vl,
                 R *I, R *O,
                 cpy2d_func cpy2d)
{
     INT k;

     A(rnk >= 2);

     if (rnk == 2) {
          cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl);
          return;
     }

     for (k = 0; k < d[0].n; ++k) {
          copy(d + 1, rnk - 1, vl, I, O, cpy2d);
          I += d[0].is;
          O += d[0].os;
     }
}
/* FIXME: should be more general */
static int transposep(const P *pln)
{
int i;
for (i = 0; i < pln->rnk - 2; ++i)
if (pln->d[i].is != pln->d[i].os)
return 0;
return (pln->d[i].n == pln->d[i+1].n &&
pln->d[i].is == pln->d[i+1].os &&
pln->d[i].os == pln->d[i+1].is);
}
/* Higher-rank transpose driver: loops over the leading dimensions and
 * hands the innermost pair to transpose2d(), which does the real work */
static void transpose(const iodim *d, int rnk, INT vl,
                      R *I,
                      transpose_func transpose2d)
{
     INT k;

     A(rnk >= 2);

     if (rnk == 2) {
          transpose2d(I, d[0].n, d[0].is, d[0].os, vl);
          return;
     }

     for (k = 0; k < d[0].n; ++k) {
          transpose(d + 1, rnk - 1, vl, I, transpose2d);
          I += d[0].is;
     }
}
/**************************************************************/
/* out of place, iterative: rank 0 and 1 use X(cpy1d) directly, higher
   ranks go through copy() with X(cpy2d_ci) */
static void apply_iter(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const int rnk = self->rnk;

     if (rnk == 0) {
          X(cpy1d)(I, O, self->vl, 1, 1, 1);
     } else if (rnk == 1) {
          const iodim *d0 = &self->d[0];
          X(cpy1d)(I, O, d0->n, d0->is, d0->os, self->vl);
     } else {
          copy(self->d, rnk, self->vl, I, O, X(cpy2d_ci));
     }
}
/* The iterative copy handles every out-of-place problem. */
static int applicable_iter(const P *pln, const problem_rdft *p)
{
     const int out_of_place = (p->I != p->O);

     UNUSED(pln);
     return out_of_place;
}
/**************************************************************/
/* out of place, 2d kernel is X(cpy2d_co) (write contiguous output) */
static void apply_cpy2dco(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const iodim *dims = self->d;

     copy(dims, self->rnk, self->vl, I, O, X(cpy2d_co));
}
static int applicable_cpy2dco(const P *pln, const problem_rdft *p)
{
int rnk = pln->rnk;
return (1
&& p->I != p->O
&& rnk >= 2
/* must not duplicate apply_iter */
&& (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is)
||
X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os))
);
}
/**************************************************************/
/* out of place, 2d kernel is X(cpy2d_tiled) (tiled, no buffering) */
static void apply_tiled(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const iodim *dims = self->d;

     copy(dims, self->rnk, self->vl, I, O, X(cpy2d_tiled));
}
/* Out of place, rank >= 2, and a tile size (for this vl) above 4. */
static int applicable_tiled(const P *pln, const problem_rdft *p)
{
     if (p->I == p->O)
          return 0;
     if (pln->rnk < 2)
          return 0;

     /* somewhat arbitrary */
     return (X(compute_tilesz)(pln->vl, 1) > 4);
}
/**************************************************************/
/* out of place, 2d kernel is X(cpy2d_tiledbuf) (tiled, with buffer) */
static void apply_tiledbuf(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const iodim *dims = self->d;

     copy(dims, self->rnk, self->vl, I, O, X(cpy2d_tiledbuf));
}
/* same applicability rule as the unbuffered tiled copy */
#define applicable_tiledbuf applicable_tiled
/**************************************************************/
/* rank 0, out of place: a single memcpy of vl elements */
static void apply_memcpy(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     size_t nbytes;

     A(self->rnk == 0);
     nbytes = self->vl * sizeof(R);
     memcpy(O, I, nbytes);
}
/* Out of place, no non-contiguous dimensions, and more than two
   contiguous elements. */
static int applicable_memcpy(const P *pln, const problem_rdft *p)
{
     if (p->I == p->O)
          return 0;
     if (pln->rnk != 0)
          return 0;

     /* do not bother memcpy-ing complex numbers */
     return (pln->vl > 2);
}
/**************************************************************/
/* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place
   transposes of vl-tuples ... for large vl it should be more
   efficient to use memcpy than the tiled stuff).  Recurses over the
   dimensions d[0..rnk-1] and copies cpysz bytes at each innermost
   position. */
static void memcpy_loop(size_t cpysz, int rnk, const iodim *d, R *I, R *O)
{
     const INT count = d->n, in_step = d->is, out_step = d->os;
     INT k;

     if (rnk != 1) {
          for (k = 0; k < count; ++k) {
               memcpy_loop(cpysz, rnk - 1, d + 1, I, O);
               I += in_step;
               O += out_step;
          }
          return;
     }

     for (k = 0; k < count; ++k) {
          memcpy(O, I, cpysz);
          I += in_step;
          O += out_step;
     }
}
/* Copy vl-element chunks with memcpy while looping over all plan
   dimensions. */
static void apply_memcpy_loop(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const size_t chunk = self->vl * sizeof(R);

     memcpy_loop(chunk, self->rnk, self->d, I, O);
}
/* Out of place, at least one loop dimension, and more than two
   contiguous elements. */
static int applicable_memcpy_loop(const P *pln, const problem_rdft *p)
{
     if (p->I == p->O)
          return 0;
     if (pln->rnk <= 0)
          return 0;

     /* do not bother memcpy-ing complex numbers */
     return (pln->vl > 2);
}
/**************************************************************/
/* in place, square transpose, 2d kernel is X(transpose) (iterative) */
static void apply_ip_sq(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const iodim *dims = self->d;

     UNUSED(O);
     transpose(dims, self->rnk, self->vl, I, X(transpose));
}
/* In place, rank >= 2, and the dimensions form a square transpose. */
static int applicable_ip_sq(const P *pln, const problem_rdft *p)
{
     if (p->I != p->O)
          return 0;
     if (pln->rnk < 2)
          return 0;
     return transposep(pln) ? 1 : 0;
}
/**************************************************************/
/* in place, square transpose, 2d kernel is X(transpose_tiled) */
static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const iodim *dims = self->d;

     UNUSED(O);
     transpose(dims, self->rnk, self->vl, I, X(transpose_tiled));
}
/* Same conditions as applicable_ip_sq, plus a tile size (for this vl)
   above 4. */
static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p)
{
     if (!applicable_ip_sq(pln, p))
          return 0;

     /* somewhat arbitrary */
     return (X(compute_tilesz)(pln->vl, 2) > 4);
}
/**************************************************************/
/* in place, square transpose, 2d kernel is X(transpose_tiledbuf) */
static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const iodim *dims = self->d;

     UNUSED(O);
     transpose(dims, self->rnk, self->vl, I, X(transpose_tiledbuf));
}
/* same applicability rule as the unbuffered tiled transpose */
#define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled
/**************************************************************/
/* Common applicability test: a rank-0 problem with finite vector rank
   whose dimensions fit into a (scratch) plan and that passes the
   solver's own strategy-specific test. */
static int applicable(const S *ego, const problem *p_)
{
     const problem_rdft *p = (const problem_rdft *) p_;
     P scratch;

     if (p->sz->rnk != 0)
          return 0;
     if (!FINITE_RNK(p->vecsz->rnk))
          return 0;
     if (!fill_iodim(&scratch, p))
          return 0;
     return ego->applicable(&scratch, p) ? 1 : 0;
}
/* Print the strategy name, the contiguous length vl and the length of
   every remaining dimension. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     int k = 0;

     p->print(p, "(%s/%D", self->nam, self->vl);
     while (k < self->rnk) {
          p->print(p, "%v", self->d[k].n);
          ++k;
     }
     p->print(p, ")");
}
/* Build a rank-0 rdft plan for the solver's strategy, or return 0 when
   the strategy does not apply.  The plan has no children; its cost is
   one load and one store per element of vecsz. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), X(null_awake), print, X(plan_null_destroy)
     };
     const S *slv = (const S *) ego_;
     const problem_rdft *p = (const problem_rdft *) p_;
     P *pln;
     int filled;

     UNUSED(plnr);

     if (!applicable(slv, p_))
          return (plan *) 0;

     pln = MKPLAN_RDFT(P, &padt, slv->apply);

     filled = fill_iodim(pln, p);
     (void)filled; /* UNUSED unless DEBUG */
     A(filled);
     A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */

     pln->nam = slv->nam;

     /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */
     X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops);
     return &(pln->super.super);
}
/* Register one rank-0 rdft solver per copy/transpose strategy, in table
   order. */
void X(rdft_rank0_register)(planner *p)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     static struct {
          rdftapply apply;
          int (*applicable)(const P *, const problem_rdft *);
          const char *nam;
     } tab[] = {
          { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" },
          { apply_memcpy_loop, applicable_memcpy_loop,
            "rdft-rank0-memcpy-loop" },
          { apply_iter, applicable_iter, "rdft-rank0-iter-ci" },
          { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" },
          { apply_tiled, applicable_tiled, "rdft-rank0-tiled" },
          { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" },
          { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" },
          { apply_ip_sq_tiled, applicable_ip_sq_tiled,
            "rdft-rank0-ip-sq-tiled" },
          { apply_ip_sq_tiledbuf, applicable_ip_sq_tiledbuf,
            "rdft-rank0-ip-sq-tiledbuf" },
     };
     unsigned k = 0;

     while (k < sizeof(tab) / sizeof(tab[0])) {
          S *slv = MKSOLVER(S, &sadt);

          slv->apply = tab[k].apply;
          slv->applicable = tab[k].applicable;
          slv->nam = tab[k].nam;
          REGISTER_SOLVER(p, &(slv->super));
          ++k;
     }
}

220
fftw-3.3.10/rdft/rdft-dht.c Normal file
View File

@@ -0,0 +1,220 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Solve an R2HC/HC2R problem via post/pre processing of a DHT. This
is mainly useful because we can use Rader to compute DHTs of prime
sizes. It also allows us to express hc2r problems in terms of r2hc
(via dht-r2hc), and to do hc2r problems without destroying the input. */
#include "rdft/rdft.h"
/* Solver for R2HC/HC2R via DHT; it carries no state beyond the generic
   solver header. */
typedef struct {
solver super;
} S;
/* Plan for R2HC/HC2R via DHT: a DHT child plus the pre/post-processing
   loop parameters, all copied from the problem in mkplan(). */
typedef struct {
plan_rdft super;
plan *cld; /* DHT child plan */
INT is, os; /* input and output strides of the transform dimension */
INT n; /* transform length */
} P;
/* R2HC via DHT: run the DHT child from I to O, then combine each pair
   O[i], O[n-i] (1 <= i < n-i) into half their sum and half their
   difference, the difference's sign depending on FFT_SIGN. */
static void apply_r2hc(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     plan_rdft *dht = (plan_rdft *) self->cld;
     INT lo, hi, ostr;

     dht->apply((plan *) dht, I, O);

     ostr = self->os;
     for (lo = 1, hi = self->n - 1; lo < hi; ++lo, --hi) {
          E a, b;
          a = K(0.5) * O[ostr * lo];
          b = K(0.5) * O[ostr * hi];
          O[ostr * lo] = a + b;
#if FFT_SIGN == -1
          O[ostr * hi] = b - a;
#else
          O[ostr * hi] = a - b;
#endif
     }
}
/* hc2r that overwrites its input: each pair I[i], I[n-i] (1 <= i < n-i)
   is replaced in place by its sum and difference (order depending on
   FFT_SIGN), then the DHT child runs from I to O */
static void apply_hc2r(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const INT istr = self->is;
     plan_rdft *dht;
     INT lo, hi;

     for (lo = 1, hi = self->n - 1; lo < hi; ++lo, --hi) {
          E a, b;
          a = I[istr * lo];
          b = I[istr * hi];
#if FFT_SIGN == -1
          I[istr * lo] = a - b;
          I[istr * hi] = a + b;
#else
          I[istr * lo] = a + b;
          I[istr * hi] = a - b;
#endif
     }

     dht = (plan_rdft *) self->cld;
     dht->apply((plan *) dht, I, O);
}
/* hc2r that leaves its input intact: the sum/difference pre-processing
   is written to O (element 0 and, for even n, the middle element are
   copied unchanged), then the DHT child runs in place on O */
static void apply_hc2r_save(const plan *ego_, R *I, R *O)
{
     const P *self = (const P *) ego_;
     const INT istr = self->is, ostr = self->os;
     plan_rdft *dht;
     INT lo, hi;

     O[0] = I[0];

     for (lo = 1, hi = self->n - 1; lo < hi; ++lo, --hi) {
          E a, b;
          a = I[istr * lo];
          b = I[istr * hi];
#if FFT_SIGN == -1
          O[ostr * lo] = a - b;
          O[ostr * hi] = a + b;
#else
          O[ostr * lo] = a + b;
          O[ostr * hi] = a - b;
#endif
     }

     if (lo == hi)
          O[ostr * lo] = I[istr * lo];

     dht = (plan_rdft *) self->cld;
     dht->apply((plan *) dht, O, O);
}
/* Forward the wakefulness change to the DHT child. */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;
     plan *child = self->cld;

     X(plan_awake)(child, wakefulness);
}
/* Release the DHT child. */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;
     plan *child = self->cld;

     X(plan_destroy_internal)(child);
}
/* Print the direction (r2hc when the plan uses apply_r2hc, hc2r
   otherwise), the length n and the child plan. */
static void print(const plan *ego_, printer *p)
{
     const P *self = (const P *) ego_;
     const char *dir;

     if (self->super.apply == apply_r2hc)
          dir = "r2hc";
     else
          dir = "hc2r";

     p->print(p, "(%s-dht-%D%(%p%))", dir, self->n, self->cld);
}
/* Structural test: a single 1d R2HC or HC2R transform (no vector loop)
   of length greater than 2. */
static int applicable0(const solver *ego_, const problem *p_)
{
     const problem_rdft *p = (const problem_rdft *) p_;

     UNUSED(ego_);

     if (p->sz->rnk != 1)
          return 0;
     if (p->vecsz->rnk != 0)
          return 0;
     if (p->kind[0] != R2HC && p->kind[0] != HC2R)
          return 0;

     /* hack: size-2 DHT etc. are defined as being equivalent
        to size-2 R2HC in problem.c, so we need this to prevent
        infinite loops for size 2 in EXHAUSTIVE mode: */
     return (p->sz->dims[0].n > 2);
}
/* Applicable only when the planner does not have NO_SLOW set and the
   structural test passes. */
static int applicable(const solver *ego, const problem *p_,
                      const planner *plnr)
{
     if (NO_SLOWP(plnr))
          return 0;
     return applicable0(ego, p_) ? 1 : 0;
}
/* Build an R2HC/HC2R plan on top of a DHT child.  For HC2R under
   NO_DESTROY_INPUT the child works in place on O (apply_hc2r_save writes
   the pre-processed data there); otherwise the child goes from I to O.
   Returns 0 when not applicable or when the child cannot be planned. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     static const plan_adt padt = {
          X(rdft_solve), awake, print, destroy
     };
     const problem_rdft *p;
     problem *dht_prob;
     plan *dht;
     P *pln;
     INT pairs;
     int r2hc;

     if (!applicable(ego_, p_, plnr))
          return (plan *)0;

     p = (const problem_rdft *) p_;
     r2hc = (p->kind[0] == R2HC);

     if (!r2hc && NO_DESTROY_INPUTP(plnr)) {
          tensor *osz = X(tensor_copy_inplace)(p->sz, INPLACE_OS);
          dht_prob = X(mkproblem_rdft_1)(osz, p->vecsz, p->O, p->O, DHT);
          X(tensor_destroy)(osz);
     } else {
          dht_prob = X(mkproblem_rdft_1)(p->sz, p->vecsz, p->I, p->O, DHT);
     }

     dht = X(mkplan_d)(plnr, dht_prob);
     if (!dht)
          return (plan *)0;

     if (r2hc)
          pln = MKPLAN_RDFT(P, &padt, apply_r2hc);
     else if (NO_DESTROY_INPUTP(plnr))
          pln = MKPLAN_RDFT(P, &padt, apply_hc2r_save);
     else
          pln = MKPLAN_RDFT(P, &padt, apply_hc2r);

     pln->cld = dht;
     pln->n = p->sz->dims[0].n;
     pln->is = p->sz->dims[0].is;
     pln->os = p->sz->dims[0].os;

     /* cost: the child plus the pre/post-processing loop over the
        (n - 1)/2 element pairs */
     pairs = (pln->n - 1) / 2;
     pln->super.super.ops = dht->ops;
     pln->super.super.ops.other += 4 * pairs;
     pln->super.super.ops.add += 2 * pairs;
     if (r2hc)
          pln->super.super.ops.mul += 2 * pairs;
     if (pln->super.apply == apply_hc2r_save)
          pln->super.super.ops.other += 2 + (pln->n % 2 ? 0 : 2);

     return &(pln->super.super);
}
/* constructor: allocate the (stateless) rdft-via-DHT solver */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
     S *self;

     self = MKSOLVER(S, &sadt);
     return &(self->super);
}
/* Register the rdft-via-DHT solver with planner p. */
void X(rdft_dht_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}

176
fftw-3.3.10/rdft/rdft.h Normal file
View File

@@ -0,0 +1,176 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef __RDFT_H__
#define __RDFT_H__
#include "kernel/ifftw.h"
#include "rdft/codelet-rdft.h"
#ifdef __cplusplus
extern "C"
{
#endif /* __cplusplus */
/* problem.c: */
/* Description of a real-data (r2r-style) transform problem. */
typedef struct {
problem super; /* generic problem header; must be the first member */
tensor *sz, *vecsz; /* transform dimensions and vector (loop) dimensions */
R *I, *O; /* input and output arrays */
/* One of three layouts for the array of transform kinds, selected at
   configure time.  Solvers index it as kind[0] for rank-1 problems;
   presumably there is one entry per dimension of sz (confirm in
   problem.c). */
#if defined(STRUCT_HACK_KR)
rdft_kind kind[1]; /* pre-C99 "struct hack" trailing array */
#elif defined(STRUCT_HACK_C99)
rdft_kind kind[]; /* C99 flexible array member */
#else
rdft_kind *kind; /* separately referenced array */
#endif
} problem_rdft;
void X(rdft_zerotens)(tensor *sz, R *I);
problem *X(mkproblem_rdft)(const tensor *sz, const tensor *vecsz,
R *I, R *O, const rdft_kind *kind);
problem *X(mkproblem_rdft_d)(tensor *sz, tensor *vecsz,
R *I, R *O, const rdft_kind *kind);
problem *X(mkproblem_rdft_0_d)(tensor *vecsz, R *I, R *O);
problem *X(mkproblem_rdft_1)(const tensor *sz, const tensor *vecsz,
R *I, R *O, rdft_kind kind);
problem *X(mkproblem_rdft_1_d)(tensor *sz, tensor *vecsz,
R *I, R *O, rdft_kind kind);
const char *X(rdft_kind_str)(rdft_kind kind);
/* solve.c: */
void X(rdft_solve)(const plan *ego_, const problem *p_);
/* plan.c: */
/* Signature of the routine that executes an rdft plan on input array I
   and output array O. */
typedef void (*rdftapply) (const plan *ego, R *I, R *O);
/* An rdft plan: the generic plan header followed by its apply routine.
   Concrete plans embed this struct as their first member. */
typedef struct {
plan super; /* generic plan header (ops counts, adt, ...) */
rdftapply apply; /* executes the plan */
} plan_rdft;
plan *X(mkplan_rdft)(size_t size, const plan_adt *adt, rdftapply apply);
/* Allocate an rdft plan of concrete type `type' (whose first member must
   be a plan_rdft) and return it as a `type *'.  The expansion is fully
   parenthesized so that the macro can be used inside larger expressions,
   e.g. MKPLAN_RDFT(P, adt, apply)->field, without the cast binding to
   the wrong operand. */
#define MKPLAN_RDFT(type, adt, apply) \
  ((type *)X(mkplan_rdft)(sizeof(type), (adt), (apply)))
/* various solvers */
solver *X(mksolver_rdft_r2c_direct)(kr2c k, const kr2c_desc *desc);
solver *X(mksolver_rdft_r2c_directbuf)(kr2c k, const kr2c_desc *desc);
solver *X(mksolver_rdft_r2r_direct)(kr2r k, const kr2r_desc *desc);
void X(rdft_rank0_register)(planner *p);
void X(rdft_vrank3_transpose_register)(planner *p);
void X(rdft_rank_geq2_register)(planner *p);
void X(rdft_indirect_register)(planner *p);
void X(rdft_vrank_geq1_register)(planner *p);
void X(rdft_buffered_register)(planner *p);
void X(rdft_generic_register)(planner *p);
void X(rdft_rader_hc2hc_register)(planner *p);
void X(rdft_dht_register)(planner *p);
void X(dht_r2hc_register)(planner *p);
void X(dht_rader_register)(planner *p);
void X(dft_r2hc_register)(planner *p);
void X(rdft_nop_register)(planner *p);
void X(hc2hc_generic_register)(planner *p);
/****************************************************************************/
/* problem2.c: */
/*
An RDFT2 problem transforms a 1d real array r[n] with stride is/os
to/from an "unpacked" complex array {rio,iio}[n/2 + 1] with stride
os/is. R0 points to the first even element of the real array.
R1 points to the first odd element of the real array.
Strides on the real side of the transform express distances
between consecutive elements of the same array (even or odd).
E.g., for a contiguous input
R0 R1 R2 R3 ...
the input stride would be 2, not 1. This convention is necessary
for hc2c codelets to work, since they transpose even/odd with
real/imag.
Multidimensional transforms use complex DFTs for the
noncontiguous dimensions. vecsz has the usual interpretation.
*/
/* Description of an RDFT2 (real <-> unpacked complex) problem; see the
   stride conventions described in the comment above. */
typedef struct {
problem super; /* generic problem header; must be the first member */
tensor *sz; /* transform dimensions */
tensor *vecsz; /* vector (loop) dimensions */
R *r0, *r1; /* first even / first odd element of the real array */
R *cr, *ci; /* complex array; presumably real and imaginary parts */
rdft_kind kind; /* assert(kind < DHT) */
} problem_rdft2;
problem *X(mkproblem_rdft2)(const tensor *sz, const tensor *vecsz,
R *r0, R *r1, R *cr, R *ci, rdft_kind kind);
problem *X(mkproblem_rdft2_d)(tensor *sz, tensor *vecsz,
R *r0, R *r1, R *cr, R *ci, rdft_kind kind);
problem *X(mkproblem_rdft2_d_3pointers)(tensor *sz, tensor *vecsz,
R *r, R *cr, R *ci, rdft_kind kind);
int X(rdft2_inplace_strides)(const problem_rdft2 *p, int vdim);
INT X(rdft2_tensor_max_index)(const tensor *sz, rdft_kind k);
void X(rdft2_strides)(rdft_kind kind, const iodim *d, INT *rs, INT *cs);
INT X(rdft2_complex_n)(INT real_n, rdft_kind kind);
/* verify.c: */
void X(rdft2_verify)(plan *pln, const problem_rdft2 *p, int rounds);
/* solve.c: */
void X(rdft2_solve)(const plan *ego_, const problem *p_);
/* plan.c: */
/* Signature of the routine that executes an rdft2 plan on the real
   arrays r0/r1 and the complex arrays cr/ci. */
typedef void (*rdft2apply) (const plan *ego, R *r0, R *r1, R *cr, R *ci);
/* An rdft2 plan: the generic plan header followed by its apply routine.
   Concrete plans embed this struct as their first member. */
typedef struct {
plan super; /* generic plan header (ops counts, adt, ...) */
rdft2apply apply; /* executes the plan */
} plan_rdft2;
plan *X(mkplan_rdft2)(size_t size, const plan_adt *adt, rdft2apply apply);
/* Allocate an rdft2 plan of concrete type `type' (whose first member
   must be a plan_rdft2) and return it as a `type *'.  The expansion is
   fully parenthesized so that the macro can be used inside larger
   expressions, e.g. MKPLAN_RDFT2(P, adt, apply)->field, without the cast
   binding to the wrong operand. */
#define MKPLAN_RDFT2(type, adt, apply) \
  ((type *)X(mkplan_rdft2)(sizeof(type), (adt), (apply)))
/* various solvers */
solver *X(mksolver_rdft2_direct)(kr2c k, const kr2c_desc *desc);
void X(rdft2_vrank_geq1_register)(planner *p);
void X(rdft2_buffered_register)(planner *p);
void X(rdft2_rdft_register)(planner *p);
void X(rdft2_nop_register)(planner *p);
void X(rdft2_rank0_register)(planner *p);
void X(rdft2_rank_geq2_register)(planner *p);
/****************************************************************************/
/* configurations */
void X(rdft_conf_standard)(planner *p);
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* __RDFT_H__ */

View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Check if the vecsz/sz strides are consistent with the problem
being in-place for vecsz.dim[vdim], or for all dimensions
if vdim == RNK_MINFTY. We can't just use tensor_inplace_strides
because rdft transforms have the unfortunate property of
differing input and output sizes. This routine is not
exhaustive; we only return 1 for the most common case. */
/* Return 1 if the strides of p are consistent with an in-place transform
   along vector dimension vdim (or along every vector dimension when
   vdim == RNK_MINFTY), 0 otherwise.  Only the most common layout is
   recognized, so a 0 result does not prove the problem is not in-place
   compatible. */
int X(rdft2_inplace_strides)(const problem_rdft2 *p, int vdim)
{
     INT N, Nc;
     INT rs, cs;
     int d, last;

     /* every transform dimension except the last needs is == os */
     for (d = 0; d + 1 < p->sz->rnk; ++d) {
          if (p->sz->dims[d].is != p->sz->dims[d].os)
               return 0;
     }

     /* no vector loop (or an empty problem): nothing more to check */
     if (!FINITE_RNK(p->vecsz->rnk) || p->vecsz->rnk == 0)
          return 1;

     if (!FINITE_RNK(vdim)) {
          /* caller asked for all vector dimensions: test each in turn */
          int v;
          for (v = 0; v < p->vecsz->rnk; ++v) {
               if (!X(rdft2_inplace_strides)(p, v))
                    return 0;
          }
          return 1;
     }

     A(vdim < p->vecsz->rnk);

     if (p->sz->rnk == 0)
          return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os);

     /* N: number of real elements; Nc: number of complex elements, with
        the last dimension shortened to n/2 + 1 */
     last = p->sz->rnk - 1;
     N = X(tensor_sz)(p->sz);
     Nc = (N / p->sz->dims[last].n) * (p->sz->dims[last].n/2 + 1);
     X(rdft2_strides)(p->kind, &p->sz->dims[last], &rs, &cs);

     /* the factor of 2 comes from the fact that RS is the stride
        of p->r0 and p->r1, which is twice as large as the strides
        in the r2r case */
     return(p->vecsz->dims[vdim].is == p->vecsz->dims[vdim].os
            && (X(iabs)(2 * p->vecsz->dims[vdim].os)
                >= X(imax)(2 * Nc * X(iabs)(cs), N * X(iabs)(rs))));
}

View File

@@ -0,0 +1,328 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Solver object: carries no state beyond the generic solver header. */
typedef struct {
solver super;
} S;
/* Plan object for solving an rdft2 problem through buffered rdft
   transforms. */
typedef struct {
plan_rdft2 super;
plan *cld, *cldrest; /* cld: rdft plan applied to nbuf transforms at a time;
                        cldrest: rdft2 plan for the leftover transforms */
INT n, vl, nbuf, bufdist; /* transform size, vector length, transforms per
                             batch, distance between consecutive buffers */
INT cs, ivs, ovs; /* complex-array stride, input and output vector strides */
} P;
/***************************************************************************/
/* FIXME: have alternate copy functions that push a vector loop inside
the n loops? */
/* copy halfcomplex array r (contiguous) to complex (strided) array rio/iio. */
/* Unpack the contiguous halfcomplex array r[0..n-1] into the strided
   complex arrays rio/iio (stride os): element k receives real part r[k]
   and imaginary part r[n-k]; the k = 0 and (for even n) k = n/2 entries
   get a zero imaginary part. */
static void hc2c(INT n, R *r, R *rio, R *iio, INT os)
{
     INT k = 1;

     /* k = 0 term: imaginary part is zero */
     rio[0] = r[0];
     iio[0] = K(0.0);

     while (k + k < n) {
          rio[k * os] = r[k];
          iio[k * os] = r[n - k];
          ++k;
     }

     if (k + k == n) { /* store the Nyquist frequency */
          rio[k * os] = r[k];
          iio[k * os] = K(0.0);
     }
}
/* reverse of hc2c */
/* Inverse of hc2c: pack the strided complex arrays rio/iio (stride is)
   into the contiguous halfcomplex array r[0..n-1].  The imaginary parts
   of element 0 and (for even n) element n/2 are not read. */
static void c2hc(INT n, R *rio, R *iio, INT is, R *r)
{
     INT k = 1;

     r[0] = rio[0];

     while (k + k < n) {
          r[k] = rio[k * is];
          r[n - k] = iio[k * is];
          ++k;
     }

     if (k + k == n) { /* store the Nyquist frequency */
          r[k] = rio[k * is];
     }
}
/***************************************************************************/
/* Forward (R2HC) execution: for each full batch of nbuf transforms, run
   the child rdft plan from the real input into scratch buffers and
   unpack every halfcomplex buffer into cr/ci.  Transforms that do not
   fill a whole batch are handed to the cldrest plan. */
static void apply_r2hc(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft *batch = (plan_rdft *) ego->cld;
     plan_rdft2 *rest = (plan_rdft2 *) ego->cldrest;
     INT n = ego->n, vl = ego->vl;
     INT nbuf = ego->nbuf, bufdist = ego->bufdist;
     INT ivs = ego->ivs, ovs = ego->ovs, cs = ego->cs;
     INT done, b;
     R *scratch = (R *)MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);

     for (done = nbuf; done <= vl; done += nbuf) {
          /* transform to the scratch buffers: */
          batch->apply((plan *) batch, r0, scratch);
          r0 += ivs * nbuf;
          r1 += ivs * nbuf;

          /* copy back, one transform at a time */
          for (b = 0; b < nbuf; ++b) {
               hc2c(n, scratch + b * bufdist, cr, ci, cs);
               cr += ovs;
               ci += ovs;
          }
     }

     X(ifree)(scratch);

     /* Do the remaining transforms, if any: */
     rest->apply((plan *) rest, r0, r1, cr, ci);
}
/* Backward (HC2R) execution: for each full batch of nbuf transforms,
   pack cr/ci into halfcomplex scratch buffers and run the child rdft
   plan from the buffers into the real output.  Transforms that do not
   fill a whole batch are handed to the cldrest plan. */
static void apply_hc2r(const plan *ego_, R *r0, R *r1, R *cr, R *ci)
{
     const P *ego = (const P *) ego_;
     plan_rdft *batch = (plan_rdft *) ego->cld;
     plan_rdft2 *rest = (plan_rdft2 *) ego->cldrest;
     INT n = ego->n, vl = ego->vl;
     INT nbuf = ego->nbuf, bufdist = ego->bufdist;
     INT ivs = ego->ivs, ovs = ego->ovs, cs = ego->cs;
     INT done, b;
     R *scratch = (R *)MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);

     for (done = nbuf; done <= vl; done += nbuf) {
          /* copy to the scratch buffers, one transform at a time */
          for (b = 0; b < nbuf; ++b) {
               c2hc(n, cr, ci, cs, scratch + b * bufdist);
               cr += ivs;
               ci += ivs;
          }

          /* transform back: */
          batch->apply((plan *) batch, scratch, r0);
          r0 += ovs * nbuf;
          r1 += ovs * nbuf;
     }

     X(ifree)(scratch);

     /* Do the remaining transforms, if any: */
     rest->apply((plan *) rest, r0, r1, cr, ci);
}
/* Forward the wakefulness change to both child plans (batch plan first,
   then the remainder plan). */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld, wakefulness);
     X(plan_awake)(self->cldrest, wakefulness);
}
/* Release both child plans (remainder plan first, then the batch plan). */
static void destroy(plan *ego_)
{
     P *self = (P *) ego_;

     X(plan_destroy_internal)(self->cldrest);
     X(plan_destroy_internal)(self->cld);
}
/* Print a description of this plan: direction, size, buffer parameters
   and the two child plans. */
static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     const char *direction =
          (ego->super.apply == apply_r2hc) ? "r2hc" : "hc2r";

     p->print(p, "(rdft2-rdft-%s-%D%v/%D-%D%(%p%)%(%p%))",
              direction,
              ego->n, ego->nbuf,
              ego->vl, ego->bufdist % ego->n,
              ego->cld, ego->cldrest);
}
/* Return the minimum number of transforms that must be buffered at once
   so that writing outputs cannot clobber inputs that have not been read
   yet.

   - Out-of-place problems (r0 != cr) and problems whose strides are
     recognized as in-place compatible need no extra buffering: 1.
   - For "contiguous" real and complex arrays that overlap only because
     of their differing sizes, the count is derived from the difference
     of the two vector strides, and never exceeds vl.
   - Otherwise the whole vector (vl transforms) is buffered. */
static INT min_nbuf(const problem_rdft2 *p, INT n, INT vl)
{
     INT is, os, ivs, ovs;

     if (p->r0 != p->cr)
          return 1;
     if (X(rdft2_inplace_strides)(p, RNK_MINFTY))
          return 1;

     A(p->vecsz->rnk == 1); /* rank 0 and MINFTY are inplace */

     X(rdft2_strides)(p->kind, p->sz->dims, &is, &os);
     X(rdft2_strides)(p->kind, p->vecsz->dims, &ivs, &ovs);

     /* handle one potentially common case: "contiguous" real and
        complex arrays, which overlap because of the differing sizes.
        The real and imaginary arrays must lie within one complex stride
        of each other, i.e. |cr - ci| <= |os|; both directions have to
        hold (testing either one alone is always true). */
     if (n * X(iabs)(is) <= X(iabs)(ivs)
         && (n/2 + 1) * X(iabs)(os) <= X(iabs)(ovs)
         && (p->cr - p->ci) <= X(iabs)(os)
         && (p->ci - p->cr) <= X(iabs)(os)
         && ivs > 0 && ovs > 0) {
          INT vsmin = X(imin)(ivs, ovs);
          INT vsmax = X(imax)(ivs, ovs);
          INT needed = ((vsmax - vsmin) * vl + vsmin - 1) / vsmin;

          /* buffering the whole vector is always sufficient (see the
             punt below), so never ask for more than vl buffers */
          return X(imin)(needed, vl);
     }

     return vl; /* punt: just buffer the whole vector */
}
/* Structural applicability test: a one-dimensional R2HC/HC2R rdft2
   problem with at most one vector dimension, whose real array can be
   viewed as a single rdft array, and which is not too big when the
   planner is asked to conserve memory.  Returns 1 or 0. */
static int applicable0(const problem *p_, const S *ego, const planner *plnr)
{
     const problem_rdft2 *p = (const problem_rdft2 *) p_;
     UNUSED(ego);

     if (p->vecsz->rnk > 1)
          return 0;
     if (p->sz->rnk != 1)
          return 0;

     /* FIXME: does it make sense to do R2HCII ? */
     if (p->kind != R2HC && p->kind != HC2R)
          return 0;

     /* real strides must allow for reduction to rdft */
     if (2 * (p->r1 - p->r0) !=
         ((p->kind == R2HC) ? p->sz->dims[0].is : p->sz->dims[0].os))
          return 0;

     if (X(toobig)(p->sz->dims[0].n) && CONSERVE_MEMORYP(plnr))
          return 0;

     return 1;
}
/* Full applicability test: combines the planner's NO_BUFFERING and
   NO_UGLY policies with the structural test in applicable0().  Under
   NO_UGLY only in-place problems of moderate size are accepted. */
static int applicable(const problem *p_, const S *ego, const planner *plnr)
{
     const problem_rdft2 *p = (const problem_rdft2 *) p_;

     if (NO_BUFFERINGP(plnr) || !applicable0(p_, ego, plnr))
          return 0;

     if (NO_UGLYP(plnr)
         && (p->r0 != p->cr || X(toobig)(p->sz->dims[0].n)))
          return 0;

     return 1;
}
/* Create a plan that solves a 1d rdft2 problem by reduction to rdft:
   full batches of nbuf transforms go through a child rdft plan (cld)
   using contiguous halfcomplex buffers, and the vl % nbuf leftover
   transforms are solved by a child rdft2 plan (cldrest).  Returns a
   null plan if the solver is not applicable or a child plan cannot be
   created; on that path all partially built resources are released. */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
P *pln;
plan *cld = (plan *) 0;
plan *cldrest = (plan *) 0;
const problem_rdft2 *p = (const problem_rdft2 *) p_;
R *bufs = (R *) 0;
INT nbuf = 0, bufdist, n, vl;
INT ivs, ovs, rs, id, od;
static const plan_adt padt = {
X(rdft2_solve), awake, print, destroy
};
if (!applicable(p_, ego, plnr))
goto nada;
n = p->sz->dims[0].n;
X(tensor_tornk1)(p->vecsz, &vl, &ivs, &ovs);
/* buffer count: the generic choice, but never fewer than the number
   min_nbuf() requires for overlapping input/output */
nbuf = X(imax)(X(nbuf)(n, vl, 0), min_nbuf(p, n, vl));
bufdist = X(bufdist)(n, vl);
A(nbuf > 0);
/* initial allocation for the purpose of planning */
bufs = (R *) MALLOC(sizeof(R) * nbuf * bufdist, BUFFERS);
/* input/output offsets of the first transform that is not covered by a
   full batch of nbuf transforms */
id = ivs * (nbuf * (vl / nbuf));
od = ovs * (nbuf * (vl / nbuf));
if (p->kind == R2HC) {
/* child rdft problem: real input (stride is/2, since rdft2 real strides
   are twice the rdft ones) -> halfcomplex in bufs (stride 1), as a
   vector of nbuf transforms spaced ivs apart in the input and bufdist
   apart in the buffers.  NOTE(review): the last three arguments of
   X(mkplan_f_d) are presumably planner-flag sets/resets; here
   NO_DESTROY_INPUT is passed only for in-place problems -- confirm
   against the planner. */
cld = X(mkplan_f_d)(
plnr,
X(mkproblem_rdft_d)(
X(mktensor_1d)(n, p->sz->dims[0].is/2, 1),
X(mktensor_1d)(nbuf, ivs, bufdist),
TAINT(p->r0, ivs * nbuf), bufs, &p->kind),
0, 0, (p->r0 == p->cr) ? NO_DESTROY_INPUT : 0);
if (!cld) goto nada;
/* the planning buffer is no longer needed; the apply routines allocate
   their own */
X(ifree)(bufs); bufs = 0;
/* leftover transforms: same rdft2 problem, vector length vl % nbuf,
   starting past the full batches */
cldrest = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(
X(tensor_copy)(p->sz),
X(mktensor_1d)(vl % nbuf, ivs, ovs),
p->r0 + id, p->r1 + id,
p->cr + od, p->ci + od,
p->kind));
if (!cldrest) goto nada;
pln = MKPLAN_RDFT2(P, &padt, apply_r2hc);
} else {
A(p->kind == HC2R);
/* child rdft problem: halfcomplex in bufs (stride 1) -> real output
   (stride os/2), as a vector of nbuf transforms spaced bufdist apart in
   the buffers and ovs apart in the output */
cld = X(mkplan_f_d)(
plnr,
X(mkproblem_rdft_d)(
X(mktensor_1d)(n, 1, p->sz->dims[0].os/2),
X(mktensor_1d)(nbuf, bufdist, ovs),
bufs, TAINT(p->r0, ovs * nbuf), &p->kind),
0, 0, NO_DESTROY_INPUT); /* always ok to destroy bufs */
if (!cld) goto nada;
X(ifree)(bufs); bufs = 0;
/* leftover transforms: for HC2R the real arrays are the output (offset
   od) and the complex arrays are the input (offset id) */
cldrest = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(
X(tensor_copy)(p->sz),
X(mktensor_1d)(vl % nbuf, ivs, ovs),
p->r0 + od, p->r1 + od,
p->cr + id, p->ci + id,
p->kind));
if (!cldrest) goto nada;
pln = MKPLAN_RDFT2(P, &padt, apply_hc2r);
}
pln->cld = cld;
pln->cldrest = cldrest;
pln->n = n;
pln->vl = vl;
pln->ivs = ivs;
pln->ovs = ovs;
/* only the complex stride is kept in the plan; rs is a throwaway */
X(rdft2_strides)(p->kind, &p->sz->dims[0], &rs, &pln->cs);
pln->nbuf = nbuf;
pln->bufdist = bufdist;
/* NOTE(review): presumably ops = (vl / nbuf) * cld->ops + cldrest->ops;
   confirm against X(ops_madd) */
X(ops_madd)(vl / nbuf, &cld->ops, &cldrest->ops,
&pln->super.super.ops);
/* account for the hc2c/c2hc copy of each transform */
pln->super.super.ops.other += (p->kind == R2HC ? (n + 2) : n) * vl;
return &(pln->super.super);
/* failure path: release whatever has been created so far */
nada:
X(ifree0)(bufs);
X(plan_destroy_internal)(cldrest);
X(plan_destroy_internal)(cld);
return (plan *) 0;
}
/* constructor: allocate a solver of type S bound to mkplan() for
   PROBLEM_RDFT2 problems */
static solver *mksolver(void)
{
     static const solver_adt sadt = { PROBLEM_RDFT2, mkplan, 0 };
     S *self = MKSOLVER(S, &sadt);

     return &(self->super);
}

/* Register a freshly constructed rdft2-rdft solver with planner p. */
void X(rdft2_rdft_register)(planner *p)
{
     solver *slv = mksolver();

     REGISTER_SOLVER(p, slv);
}

View File

@@ -0,0 +1,38 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* Deal with annoyance because the tensor (is,os) applies to
(r,rio/iio) for R2HC and vice-versa for HC2R. We originally had
(is,os) always apply to (r,rio/iio), but this causes other
headaches with the tensor functions. */
/* Extract the real-side stride (*rs) and complex-side stride (*cs) from
   dimension d: for R2HC the input stride is the real one, for HC2R the
   output stride is.  Any other kind is a programming error. */
void X(rdft2_strides)(rdft_kind kind, const iodim *d, INT *rs, INT *cs)
{
     if (kind != R2HC) {
          A(kind == HC2R);
          *rs = d->os;
          *cs = d->is;
          return;
     }

     *rs = d->is;
     *cs = d->os;
}

View File

@@ -0,0 +1,43 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/rdft.h"
/* like X(tensor_max_index), but takes into account the special n/2+1
final dimension for the complex output/input of an R2HC/HC2R transform. */
/* Sum, over the dimensions of sz, the largest index offset each
   dimension can contribute.  All dimensions but the last use
   (n - 1) * max(|is|, |os|); the last one compares (n - 1) real-side
   steps against n/2 complex-side steps, with strides picked by kind k. */
INT X(rdft2_tensor_max_index)(const tensor *sz, rdft_kind k)
{
     INT extent = 0;
     int last, d;

     A(FINITE_RNK(sz->rnk));

     last = sz->rnk - 1;
     if (last < 0)
          return extent; /* rank 0: nothing to add */

     for (d = 0; d < last; ++d) {
          const iodim *dim = sz->dims + d;
          extent += (dim->n - 1) * X(imax)(X(iabs)(dim->is), X(iabs)(dim->os));
     }

     {
          /* final dimension: real side has n elements, complex n/2 + 1 */
          const iodim *dim = sz->dims + last;
          INT is, os;
          X(rdft2_strides)(k, dim, &is, &os);
          extent += X(imax)((dim->n - 1) * X(iabs)(is),
                            (dim->n/2) * X(iabs)(os));
     }

     return extent;
}

View File

@@ -0,0 +1,7 @@
# Automake input for rdft/scalar.
# Let sources include headers relative to the top of the source tree.
AM_CPPFLAGS = -I $(top_srcdir)
# Subdirectories that are built recursively from this directory.
SUBDIRS = r2cf r2cb r2r
# Libtool library that is built but not installed.
noinst_LTLIBRARIES = librdft_scalar.la
# Headers and sources that make up librdft_scalar.la.
librdft_scalar_la_SOURCES = hb.h r2cb.h r2cbIII.h hf.h hfb.c r2c.c \
r2cf.h r2cfII.h r2r.c r2r.h hc2c.c hc2cf.h hc2cb.h

View File

@@ -0,0 +1,766 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
VPATH = @srcdir@
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
subdir = rdft/scalar
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
librdft_scalar_la_LIBADD =
am_librdft_scalar_la_OBJECTS = hfb.lo r2c.lo r2r.lo hc2c.lo
librdft_scalar_la_OBJECTS = $(am_librdft_scalar_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__maybe_remake_depfiles = depfiles
am__depfiles_remade = ./$(DEPDIR)/hc2c.Plo ./$(DEPDIR)/hfb.Plo \
./$(DEPDIR)/r2c.Plo ./$(DEPDIR)/r2r.Plo
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
SOURCES = $(librdft_scalar_la_SOURCES)
DIST_SOURCES = $(librdft_scalar_la_SOURCES)
RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
ctags-recursive dvi-recursive html-recursive info-recursive \
install-data-recursive install-dvi-recursive \
install-exec-recursive install-html-recursive \
install-info-recursive install-pdf-recursive \
install-ps-recursive install-recursive installcheck-recursive \
installdirs-recursive pdf-recursive ps-recursive \
tags-recursive uninstall-recursive
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
distclean-recursive maintainer-clean-recursive
am__recursive_targets = \
$(RECURSIVE_TARGETS) \
$(RECURSIVE_CLEAN_TARGETS) \
$(am__extra_recursive_targets)
AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
distdir distdir-am
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
DIST_SUBDIRS = $(SUBDIRS)
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
am__relativize = \
dir0=`pwd`; \
sed_first='s,^\([^/]*\)/.*$$,\1,'; \
sed_rest='s,^[^/]*/*,,'; \
sed_last='s,^.*/\([^/]*\)$$,\1,'; \
sed_butlast='s,/*[^/]*$$,,'; \
while test -n "$$dir1"; do \
first=`echo "$$dir1" | sed -e "$$sed_first"`; \
if test "$$first" != "."; then \
if test "$$first" = ".."; then \
dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
else \
first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
if test "$$first2" = "$$first"; then \
dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
else \
dir2="../$$dir2"; \
fi; \
dir0="$$dir0"/"$$first"; \
fi; \
fi; \
dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
done; \
reldir="$$dir2"
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
AM_CPPFLAGS = -I $(top_srcdir)
SUBDIRS = r2cf r2cb r2r
noinst_LTLIBRARIES = librdft_scalar.la
librdft_scalar_la_SOURCES = hb.h r2cb.h r2cbIII.h hf.h hfb.c r2c.c \
r2cf.h r2cfII.h r2r.c r2r.h hc2c.c hc2cf.h hc2cb.h
all: all-recursive
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/scalar/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu rdft/scalar/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
esac;
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
clean-noinstLTLIBRARIES:
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
@list='$(noinst_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
librdft_scalar.la: $(librdft_scalar_la_OBJECTS) $(librdft_scalar_la_DEPENDENCIES) $(EXTRA_librdft_scalar_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) $(librdft_scalar_la_OBJECTS) $(librdft_scalar_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
distclean-compile:
-rm -f *.tab.c
# Include the per-source dependency files for hc2c, hfb, r2c and r2r.  The
# lines are only active when configure enables the AMDEP conditional.
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hc2c.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hfb.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2c.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r2r.Plo@am__quote@ # am--include-marker
# Create a placeholder for each file in $(am__depfiles_remade): make its
# directory, write '# dummy' to a temporary "$@-t" and move it into place.
$(am__depfiles_remade):
@$(MKDIR_P) $(@D)
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
# Target that depends on all of those placeholder files.
am--depfiles: $(am__depfiles_remade)
# Suffix rules for compiling C sources.  Each rule carries alternatives that
# configure switches on or off through the @..._TRUE@/@..._FALSE@ prefixes:
#   - am__fastdepCC enabled: compile with -MT/-MD/-MP/-MF so the compiler
#     writes dependencies to $(DEPDIR)/$*.Tpo, then rename that file to
#     .Po (or .Plo for libtool objects);
#   - am__fastdepCC disabled: the final plain compile line is used; with
#     AMDEP enabled the two lines before it (ending in @AMDEPBACKSLASH@,
#     presumably a line continuation -- confirm) prefix it with the
#     $(depcomp) wrapper.
# .c.o: object for the native toolchain.
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
# .c.obj: same as .c.o, but the source path is passed through $(CYGPATH_W).
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
# .c.lo: libtool object; uses $(LTCOMPILE), libtool=yes and a .Plo
# dependency file.
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
# Libtool cleanup: mostlyclean-libtool removes *.lo files, clean-libtool
# removes the .libs and _libs directories.
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
# This directory's subdirectories are mostly independent; you can cd
# into them and run 'make' without going through this Makefile.
# To change the values of 'make' variables: instead of editing Makefiles,
# (1) if the variable is set in 'config.status', edit 'config.status'
# (which will cause the Makefiles to be regenerated when you run 'make');
# (2) otherwise, pass the desired values on the 'make' command line.
#
# Recipe outline: the real target name is $@ with '-recursive' removed.
# distclean-* and maintainer-clean-* walk $(DIST_SUBDIRS), everything else
# walks $(SUBDIRS).  The target is made in each listed directory; for the
# entry '.' the "<target>-am" rule of this Makefile is used instead.  If '.'
# was not in the list, "<target>-am" is made here after the loop.
# When $(am__make_keepgoing) succeeds, a failing subdirectory only sets
# 'fail' and the loop continues (the recipe still fails at the end via
# 'test -z "$$fail"'); otherwise the first failure exits 1 immediately.
$(am__recursive_targets):
@fail=; \
if $(am__make_keepgoing); then \
failcom='fail=yes'; \
else \
failcom='exit 1'; \
fi; \
dot_seen=no; \
target=`echo $@ | sed s/-recursive//`; \
case "$@" in \
distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
*) list='$(SUBDIRS)' ;; \
esac; \
for subdir in $$list; do \
echo "Making $$target in $$subdir"; \
if test "$$subdir" = "."; then \
dot_seen=yes; \
local_target="$$target-am"; \
else \
local_target="$$target"; \
fi; \
($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
|| eval $$failcom; \
done; \
if test "$$dot_seen" = "no"; then \
$(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
fi; test -z "$$fail"
# ID: run mkid over the unique tagged files, writing the file 'ID'.
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
# tags / TAGS: aliases that end up in tags-recursive.
tags: tags-recursive
TAGS: tags
# tags-am: build the tags file for this directory with $(ETAGS).
# The recipe first probes whether $(ETAGS) accepts --etags-include (falling
# back to --include), then collects one include option for every
# subdirectory that already has a TAGS file.  The positional parameters hold
# that list: 'set x' seeds them and 'shift' drops the seed again.
# $(ETAGS) is skipped entirely when $(ETAGS_ARGS), the include list and the
# file list are all empty.
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
include_option=--etags-include; \
empty_fix=.; \
else \
include_option=--include; \
empty_fix=; \
fi; \
list='$(SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
test ! -f $$subdir/TAGS || \
set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
fi; \
done; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
# ctags / CTAGS: aliases that end up in ctags-recursive.
ctags: ctags-recursive
CTAGS: ctags
# ctags-am: run $(CTAGS) over the unique tagged files unless both
# $(CTAGS_ARGS) and the file list are empty.
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
# GTAGS: resolve the absolute path of $(top_builddir), change to
# $(top_srcdir) and run 'gtags -i' with that path as its argument.
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-recursive
# cscopelist-am: append one line per tagged file to
# $(top_builddir)/cscope.files.  A file that exists relative to the current
# directory is written as $(subdir)/<file>; otherwise it is written relative
# to the source directory ($(srcdir) as-is when absolute, else
# $(subdir)/$(srcdir)).
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
# distclean-tags: remove the tag databases produced by the rules above.
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
# distdir: make sure $(BUILT_SOURCES) are up to date, then re-invoke make
# for distdir-am.
distdir: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) distdir-am
# distdir-am, first recipe: copy every entry of $(DISTFILES) into
# $(distdir).  Entry names first lose a leading "$(srcdir)/" (or get a
# leading "$(top_srcdir)/" replaced by "$(top_builddir)/"), and any
# subdirectories they need are created under $(distdir).  Each entry is
# taken from the current directory when it exists there, else from
# $(srcdir).  Directories are copied with 'cp -fpR' (the $(srcdir) copy
# first when both exist), forcing u+rwx on destination directories; regular
# files are copied only if not already present in $(distdir).
# Second recipe: for every entry of $(DIST_SUBDIRS) other than '.', create
# the matching directory under $(distdir) (skipped in a dry run) and run
# 'make distdir' inside the subdirectory, with distdir and top_distdir
# rewritten relative to it by $(am__relativize).
distdir-am: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
if test "$$subdir" = .; then :; else \
$(am__make_dryrun) \
|| test -d "$(distdir)/$$subdir" \
|| $(MKDIR_P) "$(distdir)/$$subdir" \
|| exit 1; \
dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
$(am__relativize); \
new_distdir=$$reldir; \
dir1=$$subdir; dir2="$(top_distdir)"; \
$(am__relativize); \
new_top_distdir=$$reldir; \
echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
($(am__cd) $$subdir && \
$(MAKE) $(AM_MAKEFLAGS) \
top_distdir="$$new_top_distdir" \
distdir="$$new_distdir" \
am__remove_distdir=: \
am__skip_length_check=: \
am__skip_mode_fix=: \
distdir) \
|| exit 1; \
fi; \
done
# Entry points for check, all and install.  The plain names forward to their
# "-recursive" counterparts (handled by the $(am__recursive_targets) rule);
# the "-am" names are the parts done in this directory itself.
check-am: all-am
check: check-recursive
# all-am: this directory is up to date once the Makefile and the libtool
# libraries exist.
all-am: Makefile $(LTLIBRARIES)
installdirs: installdirs-recursive
installdirs-am:
install: install-recursive
install-exec: install-exec-recursive
install-data: install-data-recursive
uninstall: uninstall-recursive
# install-am: build first, then run the exec and data halves in a sub-make.
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-recursive
# install-strip: run 'install' with $(INSTALL_STRIP_PROGRAM) substituted for
# the install programs and INSTALL_STRIP_FLAG=-s.  When $(STRIP) is
# non-empty it is additionally exported to the install program as STRIPPROG
# through INSTALL_PROGRAM_ENV.
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
# Generic cleanup hooks.  mostlyclean-generic and clean-generic have no
# recipe in this directory.
mostlyclean-generic:
clean-generic:
# distclean-generic: remove $(CONFIG_CLEAN_FILES); remove
# $(CONFIG_CLEAN_VPATH_FILES) only when $(srcdir) is not '.' and the list is
# non-empty.
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
# maintainer-clean-generic: only prints a warning; it deletes nothing itself.
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
# Clean hierarchy for this directory.  Each level pulls in the one below it:
# maintainer-clean-am -> distclean-am -> clean-am -> mostlyclean-am.
clean: clean-recursive
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
mostlyclean-am
# distclean: after recursing, also delete the four dependency files and the
# generated Makefile itself.
distclean: distclean-recursive
-rm -f ./$(DEPDIR)/hc2c.Plo
-rm -f ./$(DEPDIR)/hfb.Plo
-rm -f ./$(DEPDIR)/r2c.Plo
-rm -f ./$(DEPDIR)/r2r.Plo
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
# Documentation and install targets.  The "-am" variants below have no
# recipe in this directory; the plain names only recurse.
dvi: dvi-recursive
dvi-am:
html: html-recursive
html-am:
info: info-recursive
info-am:
install-data-am:
install-dvi: install-dvi-recursive
install-dvi-am:
install-exec-am:
install-html: install-html-recursive
install-html-am:
install-info: install-info-recursive
install-info-am:
install-man:
install-pdf: install-pdf-recursive
install-pdf-am:
install-ps: install-ps-recursive
install-ps-am:
installcheck-am:
# maintainer-clean: same extra deletions as distclean, after recursing.
maintainer-clean: maintainer-clean-recursive
-rm -f ./$(DEPDIR)/hc2c.Plo
-rm -f ./$(DEPDIR)/hfb.Plo
-rm -f ./$(DEPDIR)/r2c.Plo
-rm -f ./$(DEPDIR)/r2r.Plo
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-recursive
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-recursive
pdf-am:
ps: ps-recursive
ps-am:
uninstall-am:
# .MAKE lists the targets whose recipes re-invoke make.
# NOTE(review): presumably a special target honoured by some non-GNU make
# implementations so these recipes still run under 'make -n' -- confirm.
.MAKE: $(am__recursive_targets) install-am install-strip
# Targets that are commands rather than files to be built.
.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
am--depfiles check check-am clean clean-generic clean-libtool \
clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
distclean-compile distclean-generic distclean-libtool \
distclean-tags distdir dvi dvi-am html html-am info info-am \
install install-am install-data install-data-am install-dvi \
install-dvi-am install-exec install-exec-am install-html \
install-html-am install-info install-info-am install-man \
install-pdf install-pdf-am install-ps install-ps-am \
install-strip installcheck installcheck-am installdirs \
installdirs-am maintainer-clean maintainer-clean-generic \
mostlyclean mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
uninstall-am
# Keep the Makefile even if make is interrupted while rebuilding it.
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

View File

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* GENUS is the symbol of the hc2hc genus descriptor for the "hb" codelets.
   X() is a project macro defined outside this header (presumably the
   precision-specific symbol prefix -- confirm). */
#define GENUS X(rdft_hb_genus)
/* Declaration only; the object is defined in a separate .c file. */
extern const hc2hc_genus GENUS;

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/codelet-rdft.h"
#include "rdft/scalar/hc2cf.h"
/*
 * Predicate installed as the first field of the hc2c genus descriptors
 * defined in this file.  It inspects none of its arguments and always
 * answers 1; every parameter is passed to UNUSED() and otherwise ignored.
 */
static int okp(const R *Rp, const R *Ip, const R *Rm, const R *Im,
               INT rs, INT mb, INT me, INT ms,
               const planner *plnr)
{
     /* array pointers */
     UNUSED(Rp);
     UNUSED(Ip);
     UNUSED(Rm);
     UNUSED(Im);

     /* stride and loop bounds */
     UNUSED(rs);
     UNUSED(mb);
     UNUSED(me);
     UNUSED(ms);

     /* planner */
     UNUSED(plnr);

     return 1;
}
/* Descriptor whose symbol is given by the GENUS macro from hc2cf.h
   (included above): predicate okp, kind R2HC, and a trailing 1.
   NOTE(review): the meaning of the last field is not visible here
   (presumably a vector length) -- confirm against hc2c_genus. */
const hc2c_genus GENUS = { okp, R2HC, 1 };
/* Drop the previous name so that hc2cb.h can define GENUS again for the
   second descriptor. */
#undef GENUS
#include "rdft/scalar/hc2cb.h"
/* Same predicate, kind HC2R. */
const hc2c_genus GENUS = { okp, HC2R, 1 };

View File

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* GENUS is the symbol of the hc2c genus descriptor for the "hc2cb" codelets.
   X() is a project macro defined outside this header (presumably the
   precision-specific symbol prefix -- confirm). */
#define GENUS X(rdft_hc2cb_genus)
/* Declaration only; the object is defined in a separate .c file. */
extern const hc2c_genus GENUS;

View File

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* GENUS is the symbol of the hc2c genus descriptor for the "hc2cf" codelets.
   X() is a project macro defined outside this header (presumably the
   precision-specific symbol prefix -- confirm). */
#define GENUS X(rdft_hc2cf_genus)
/* Declaration only; the object is defined in a separate .c file. */
extern const hc2c_genus GENUS;

View File

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* GENUS is the symbol of the hc2hc genus descriptor for the "hf" codelets.
   X() is a project macro defined outside this header (presumably the
   precision-specific symbol prefix -- confirm). */
#define GENUS X(rdft_hf_genus)
/* Declaration only; the object is defined in a separate .c file. */
extern const hc2hc_genus GENUS;

View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/codelet-rdft.h"
#include "rdft/scalar/hf.h"
/* Descriptor whose symbol is given by the GENUS macro from hf.h (included
   above): kind R2HC and a trailing 1.
   NOTE(review): the meaning of the last field is not visible here
   (presumably a vector length) -- confirm against hc2hc_genus. */
const hc2hc_genus GENUS = { R2HC, 1 };
/* Drop the previous name so that hb.h can define GENUS again for the
   second descriptor. */
#undef GENUS
#include "rdft/scalar/hb.h"
/* Kind HC2R. */
const hc2hc_genus GENUS = { HC2R, 1 };

View File

@@ -0,0 +1,37 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "rdft/codelet-rdft.h"
#include "rdft/scalar/r2cf.h"
/* Four kr2c genus descriptors are defined in sequence.  Each header defines
   GENUS as the symbol for one descriptor, so GENUS is #undef'ed before the
   next header is included.  The first field is the transform kind.
   NOTE(review): the meaning of the trailing 1 is not visible here
   (presumably a vector length) -- confirm against kr2c_genus. */
/* Symbol from r2cf.h (included above), kind R2HC. */
const kr2c_genus GENUS = { R2HC, 1 };
#undef GENUS
#include "rdft/scalar/r2cfII.h"
/* Symbol from r2cfII.h, kind R2HCII. */
const kr2c_genus GENUS = { R2HCII, 1 };
#undef GENUS
#include "rdft/scalar/r2cb.h"
/* Symbol from r2cb.h, kind HC2R. */
const kr2c_genus GENUS = { HC2R, 1 };
#undef GENUS
#include "rdft/scalar/r2cbIII.h"
/* Symbol from r2cbIII.h, kind HC2RIII. */
const kr2c_genus GENUS = { HC2RIII, 1 };
/* Leave no GENUS definition behind at the end of the file. */
#undef GENUS

View File

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* GENUS is the symbol of the kr2c genus descriptor for the "r2cb" codelets.
   X() is a project macro defined outside this header (presumably the
   precision-specific symbol prefix -- confirm). */
#define GENUS X(rdft_r2cb_genus)
/* Declaration only; the object is defined in a separate .c file. */
extern const kr2c_genus GENUS;

View File

@@ -0,0 +1,109 @@
# This Makefile.am specifies a set of codelets, efficient transforms
# of small sizes, that are used as building blocks (kernels) by FFTW
# to build up large transforms, as well as the options for generating
# and compiling them.
# You can customize FFTW for special needs, e.g. to handle certain
# sizes more efficiently, by adding new codelets to the lists of those
# included by default. If you change the list of codelets, any new
# ones you added will be automatically generated when you run the
# bootstrap script (see "Generating your own code" in the FFTW
# manual).
###########################################################################
# Sources include project headers by their path from the top of the source
# tree (e.g. "rdft/codelet-rdft.h"), hence the -I.
AM_CPPFLAGS = -I $(top_srcdir)
# Non-installed libtool library that collects every source listed in
# librdft_scalar_r2cb_la_SOURCES below.
noinst_LTLIBRARIES = librdft_scalar_r2cb.la
###########################################################################
# r2cb_<n> is a hard-coded complex-to-real FFT of size <n> (base cases
# of real-output FFT recursion)
# Names behind a '#' in these lists (sizes 30, 40, 50) are commented out and
# therefore not built.
R2CB = r2cb_2.c r2cb_3.c r2cb_4.c r2cb_5.c r2cb_6.c r2cb_7.c r2cb_8.c \
r2cb_9.c r2cb_10.c r2cb_11.c r2cb_12.c r2cb_13.c r2cb_14.c r2cb_15.c \
r2cb_16.c r2cb_32.c r2cb_64.c r2cb_128.c r2cb_20.c r2cb_25.c
# r2cb_30.c r2cb_40.c r2cb_50.c
###########################################################################
# hb_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF
# step for a real-output FFT. Every hb codelet must have a
# corresponding r2cbIII codelet (see below)!
HB = hb_2.c hb_3.c hb_4.c hb_5.c hb_6.c hb_7.c hb_8.c hb_9.c \
hb_10.c hb_12.c hb_15.c hb_16.c hb_32.c hb_64.c \
hb_20.c hb_25.c # hb_30.c hb_40.c hb_50.c
# like hb, but generates part of its trig table on the fly (good for large n)
HB2 = hb2_4.c hb2_8.c hb2_16.c hb2_32.c \
hb2_5.c hb2_20.c hb2_25.c
# an r2cb transform where the output is shifted by half a sample (input
# is multiplied by a phase). This is needed as part of the DIF recursion;
# every hb_<r> or hb2_<r> codelet should have a corresponding r2cbIII_<r>
R2CBIII = r2cbIII_2.c r2cbIII_3.c r2cbIII_4.c r2cbIII_5.c r2cbIII_6.c \
r2cbIII_7.c r2cbIII_8.c r2cbIII_9.c r2cbIII_10.c r2cbIII_12.c \
r2cbIII_15.c r2cbIII_16.c r2cbIII_32.c r2cbIII_64.c \
r2cbIII_20.c r2cbIII_25.c # r2cbIII_30.c r2cbIII_40.c r2cbIII_50.c
###########################################################################
# hc2cb_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF
# step for a real-input FFT with rdft2-style output. <r> must be even.
# NOTE(review): every codelet in this directory is generated with -sign 1
# and the hb comment above says "real-output", so "real-input ... output"
# here looks copied from the forward directory; presumably it should read
# "real-output FFT with rdft2-style input" -- confirm.
HC2CB = hc2cb_2.c hc2cb_4.c hc2cb_6.c hc2cb_8.c hc2cb_10.c hc2cb_12.c \
hc2cb_16.c hc2cb_32.c \
hc2cb_20.c # hc2cb_30.c
# hc2cbdft_<r>: same sizes as hc2cb, produced by $(GEN_HC2CDFT) instead of
# $(GEN_HC2C) (see the pattern rules at the end of this file).
HC2CBDFT = hc2cbdft_2.c hc2cbdft_4.c hc2cbdft_6.c hc2cbdft_8.c \
hc2cbdft_10.c hc2cbdft_12.c hc2cbdft_16.c hc2cbdft_32.c \
hc2cbdft_20.c # hc2cbdft_30.c
# like hc2cb, but generates part of its trig table on the fly (good
# for large n)
HC2CB2 = hc2cb2_4.c hc2cb2_8.c hc2cb2_16.c hc2cb2_32.c \
hc2cb2_20.c # hc2cb2_30.c
# hc2cbdft2_<r>: same sizes as hc2cb2, produced by $(GEN_HC2CDFT).
HC2CBDFT2 = hc2cbdft2_4.c hc2cbdft2_8.c hc2cbdft2_16.c hc2cbdft2_32.c \
hc2cbdft2_20.c # hc2cbdft2_30.c
###########################################################################
# Every codelet source of this directory, family by family.
ALL_CODELETS = $(R2CB) $(HB) $(HB2) $(R2CBIII) $(HC2CB) $(HC2CB2) \
$(HC2CBDFT) $(HC2CBDFT2)
# The codelets plus $(CODLIST) are built sources, and exactly these files go
# into the library.  $(CODLIST) is not defined in this file (presumably it
# comes from the Makefile.codelets include below -- confirm).
BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
librdft_scalar_r2cb_la_SOURCES = $(BUILT_SOURCES)
# Symbol of this directory's solver table and the renaming macro.
# NOTE(review): presumably consumed by Makefile.codelets when it generates
# $(CODLIST) -- confirm.
SOLVTAB_NAME = X(solvtab_rdft_r2cb)
XRENAME=X
# special rules for regenerating codelets.
include $(top_srcdir)/support/Makefile.codelets
# The generator rules below exist only in maintainer mode.
if MAINTAINER_MODE
# Generator flags per codelet family.  Every family adds -sign 1 to
# $(RDFT_FLAGS_COMMON); the "2" families also add
# -twiddle-log3 -precompute-twiddles.
FLAGS_R2CB=$(RDFT_FLAGS_COMMON) -sign 1
FLAGS_HB=$(RDFT_FLAGS_COMMON) -sign 1
FLAGS_HB2=$(RDFT_FLAGS_COMMON) -sign 1 -twiddle-log3 -precompute-twiddles
FLAGS_HC2CB=$(RDFT_FLAGS_COMMON) -sign 1
FLAGS_HC2CB2=$(RDFT_FLAGS_COMMON) -sign 1 -twiddle-log3 -precompute-twiddles
# NOTE(review): no rule in this file references FLAGS_R2CBIII; the
# r2cbIII_%.c rule uses $(FLAGS_R2CB), which has the same value -- confirm
# whether the variable is still needed.
FLAGS_R2CBIII=$(RDFT_FLAGS_COMMON) -sign 1
# One pattern rule per family.  The stem ($*) is the transform size: it is
# passed as "-n $*" and appended to the codelet name.  Generator output is
# piped through $(ADD_DATE) and $(INDENT) into the target file.
r2cb_%.c: $(CODELET_DEPS) $(GEN_R2CB)
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CB) $(FLAGS_R2CB) -n $* -name r2cb_$* -include "rdft/scalar/r2cb.h") | $(ADD_DATE) | $(INDENT) >$@
hb_%.c: $(CODELET_DEPS) $(GEN_HC2HC)
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HB) -n $* -dif -name hb_$* -include "rdft/scalar/hb.h") | $(ADD_DATE) | $(INDENT) >$@
hb2_%.c: $(CODELET_DEPS) $(GEN_HC2HC)
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HB2) -n $* -dif -name hb2_$* -include "rdft/scalar/hb.h") | $(ADD_DATE) | $(INDENT) >$@
r2cbIII_%.c: $(CODELET_DEPS) $(GEN_R2CB)
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CB) $(FLAGS_R2CB) -n $* -name r2cbIII_$* -dft-III -include "rdft/scalar/r2cbIII.h") | $(ADD_DATE) | $(INDENT) >$@
hc2cb_%.c: $(CODELET_DEPS) $(GEN_HC2C)
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CB) -n $* -dif -name hc2cb_$* -include "rdft/scalar/hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
hc2cb2_%.c: $(CODELET_DEPS) $(GEN_HC2C)
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CB2) -n $* -dif -name hc2cb2_$* -include "rdft/scalar/hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
hc2cbdft_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT)
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CB) -n $* -dif -name hc2cbdft_$* -include "rdft/scalar/hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
# NOTE(review): this rule passes $(FLAGS_HC2CB), the same flags as
# hc2cbdft_%.c above, not $(FLAGS_HC2CB2) as the hc2cb2 rule does.  Confirm
# against upstream whether that is intended before changing it.
hc2cbdft2_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT)
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CB) -n $* -dif -name hc2cbdft2_$* -include "rdft/scalar/hc2cb.h") | $(ADD_DATE) | $(INDENT) >$@
endif # MAINTAINER_MODE

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,183 @@
#include "kernel/ifftw.h"
extern void X(codelet_r2cb_2)(planner *);
extern void X(codelet_r2cb_3)(planner *);
extern void X(codelet_r2cb_4)(planner *);
extern void X(codelet_r2cb_5)(planner *);
extern void X(codelet_r2cb_6)(planner *);
extern void X(codelet_r2cb_7)(planner *);
extern void X(codelet_r2cb_8)(planner *);
extern void X(codelet_r2cb_9)(planner *);
extern void X(codelet_r2cb_10)(planner *);
extern void X(codelet_r2cb_11)(planner *);
extern void X(codelet_r2cb_12)(planner *);
extern void X(codelet_r2cb_13)(planner *);
extern void X(codelet_r2cb_14)(planner *);
extern void X(codelet_r2cb_15)(planner *);
extern void X(codelet_r2cb_16)(planner *);
extern void X(codelet_r2cb_32)(planner *);
extern void X(codelet_r2cb_64)(planner *);
extern void X(codelet_r2cb_128)(planner *);
extern void X(codelet_r2cb_20)(planner *);
extern void X(codelet_r2cb_25)(planner *);
extern void X(codelet_hb_2)(planner *);
extern void X(codelet_hb_3)(planner *);
extern void X(codelet_hb_4)(planner *);
extern void X(codelet_hb_5)(planner *);
extern void X(codelet_hb_6)(planner *);
extern void X(codelet_hb_7)(planner *);
extern void X(codelet_hb_8)(planner *);
extern void X(codelet_hb_9)(planner *);
extern void X(codelet_hb_10)(planner *);
extern void X(codelet_hb_12)(planner *);
extern void X(codelet_hb_15)(planner *);
extern void X(codelet_hb_16)(planner *);
extern void X(codelet_hb_32)(planner *);
extern void X(codelet_hb_64)(planner *);
extern void X(codelet_hb_20)(planner *);
extern void X(codelet_hb_25)(planner *);
extern void X(codelet_hb2_4)(planner *);
extern void X(codelet_hb2_8)(planner *);
extern void X(codelet_hb2_16)(planner *);
extern void X(codelet_hb2_32)(planner *);
extern void X(codelet_hb2_5)(planner *);
extern void X(codelet_hb2_20)(planner *);
extern void X(codelet_hb2_25)(planner *);
extern void X(codelet_r2cbIII_2)(planner *);
extern void X(codelet_r2cbIII_3)(planner *);
extern void X(codelet_r2cbIII_4)(planner *);
extern void X(codelet_r2cbIII_5)(planner *);
extern void X(codelet_r2cbIII_6)(planner *);
extern void X(codelet_r2cbIII_7)(planner *);
extern void X(codelet_r2cbIII_8)(planner *);
extern void X(codelet_r2cbIII_9)(planner *);
extern void X(codelet_r2cbIII_10)(planner *);
extern void X(codelet_r2cbIII_12)(planner *);
extern void X(codelet_r2cbIII_15)(planner *);
extern void X(codelet_r2cbIII_16)(planner *);
extern void X(codelet_r2cbIII_32)(planner *);
extern void X(codelet_r2cbIII_64)(planner *);
extern void X(codelet_r2cbIII_20)(planner *);
extern void X(codelet_r2cbIII_25)(planner *);
extern void X(codelet_hc2cb_2)(planner *);
extern void X(codelet_hc2cb_4)(planner *);
extern void X(codelet_hc2cb_6)(planner *);
extern void X(codelet_hc2cb_8)(planner *);
extern void X(codelet_hc2cb_10)(planner *);
extern void X(codelet_hc2cb_12)(planner *);
extern void X(codelet_hc2cb_16)(planner *);
extern void X(codelet_hc2cb_32)(planner *);
extern void X(codelet_hc2cb_20)(planner *);
extern void X(codelet_hc2cb2_4)(planner *);
extern void X(codelet_hc2cb2_8)(planner *);
extern void X(codelet_hc2cb2_16)(planner *);
extern void X(codelet_hc2cb2_32)(planner *);
extern void X(codelet_hc2cb2_20)(planner *);
extern void X(codelet_hc2cbdft_2)(planner *);
extern void X(codelet_hc2cbdft_4)(planner *);
extern void X(codelet_hc2cbdft_6)(planner *);
extern void X(codelet_hc2cbdft_8)(planner *);
extern void X(codelet_hc2cbdft_10)(planner *);
extern void X(codelet_hc2cbdft_12)(planner *);
extern void X(codelet_hc2cbdft_16)(planner *);
extern void X(codelet_hc2cbdft_32)(planner *);
extern void X(codelet_hc2cbdft_20)(planner *);
extern void X(codelet_hc2cbdft2_4)(planner *);
extern void X(codelet_hc2cbdft2_8)(planner *);
extern void X(codelet_hc2cbdft2_16)(planner *);
extern void X(codelet_hc2cbdft2_32)(planner *);
extern void X(codelet_hc2cbdft2_20)(planner *);
extern const solvtab X(solvtab_rdft_r2cb);
/* Solver table for this directory: one SOLVTAB() entry per codelet
   registration function declared above (each takes a planner *), ending
   with SOLVTAB_END.  Entries appear family by family: r2cb, hb, hb2,
   r2cbIII, hc2cb, hc2cb2, hc2cbdft, hc2cbdft2.
   NOTE(review): this file is listed as a built source in the directory's
   Makefile.am, so it is presumably regenerated from the codelet lists there
   rather than edited by hand -- confirm. */
const solvtab X(solvtab_rdft_r2cb) = {
SOLVTAB(X(codelet_r2cb_2)),
SOLVTAB(X(codelet_r2cb_3)),
SOLVTAB(X(codelet_r2cb_4)),
SOLVTAB(X(codelet_r2cb_5)),
SOLVTAB(X(codelet_r2cb_6)),
SOLVTAB(X(codelet_r2cb_7)),
SOLVTAB(X(codelet_r2cb_8)),
SOLVTAB(X(codelet_r2cb_9)),
SOLVTAB(X(codelet_r2cb_10)),
SOLVTAB(X(codelet_r2cb_11)),
SOLVTAB(X(codelet_r2cb_12)),
SOLVTAB(X(codelet_r2cb_13)),
SOLVTAB(X(codelet_r2cb_14)),
SOLVTAB(X(codelet_r2cb_15)),
SOLVTAB(X(codelet_r2cb_16)),
SOLVTAB(X(codelet_r2cb_32)),
SOLVTAB(X(codelet_r2cb_64)),
SOLVTAB(X(codelet_r2cb_128)),
SOLVTAB(X(codelet_r2cb_20)),
SOLVTAB(X(codelet_r2cb_25)),
SOLVTAB(X(codelet_hb_2)),
SOLVTAB(X(codelet_hb_3)),
SOLVTAB(X(codelet_hb_4)),
SOLVTAB(X(codelet_hb_5)),
SOLVTAB(X(codelet_hb_6)),
SOLVTAB(X(codelet_hb_7)),
SOLVTAB(X(codelet_hb_8)),
SOLVTAB(X(codelet_hb_9)),
SOLVTAB(X(codelet_hb_10)),
SOLVTAB(X(codelet_hb_12)),
SOLVTAB(X(codelet_hb_15)),
SOLVTAB(X(codelet_hb_16)),
SOLVTAB(X(codelet_hb_32)),
SOLVTAB(X(codelet_hb_64)),
SOLVTAB(X(codelet_hb_20)),
SOLVTAB(X(codelet_hb_25)),
SOLVTAB(X(codelet_hb2_4)),
SOLVTAB(X(codelet_hb2_8)),
SOLVTAB(X(codelet_hb2_16)),
SOLVTAB(X(codelet_hb2_32)),
SOLVTAB(X(codelet_hb2_5)),
SOLVTAB(X(codelet_hb2_20)),
SOLVTAB(X(codelet_hb2_25)),
SOLVTAB(X(codelet_r2cbIII_2)),
SOLVTAB(X(codelet_r2cbIII_3)),
SOLVTAB(X(codelet_r2cbIII_4)),
SOLVTAB(X(codelet_r2cbIII_5)),
SOLVTAB(X(codelet_r2cbIII_6)),
SOLVTAB(X(codelet_r2cbIII_7)),
SOLVTAB(X(codelet_r2cbIII_8)),
SOLVTAB(X(codelet_r2cbIII_9)),
SOLVTAB(X(codelet_r2cbIII_10)),
SOLVTAB(X(codelet_r2cbIII_12)),
SOLVTAB(X(codelet_r2cbIII_15)),
SOLVTAB(X(codelet_r2cbIII_16)),
SOLVTAB(X(codelet_r2cbIII_32)),
SOLVTAB(X(codelet_r2cbIII_64)),
SOLVTAB(X(codelet_r2cbIII_20)),
SOLVTAB(X(codelet_r2cbIII_25)),
SOLVTAB(X(codelet_hc2cb_2)),
SOLVTAB(X(codelet_hc2cb_4)),
SOLVTAB(X(codelet_hc2cb_6)),
SOLVTAB(X(codelet_hc2cb_8)),
SOLVTAB(X(codelet_hc2cb_10)),
SOLVTAB(X(codelet_hc2cb_12)),
SOLVTAB(X(codelet_hc2cb_16)),
SOLVTAB(X(codelet_hc2cb_32)),
SOLVTAB(X(codelet_hc2cb_20)),
SOLVTAB(X(codelet_hc2cb2_4)),
SOLVTAB(X(codelet_hc2cb2_8)),
SOLVTAB(X(codelet_hc2cb2_16)),
SOLVTAB(X(codelet_hc2cb2_32)),
SOLVTAB(X(codelet_hc2cb2_20)),
SOLVTAB(X(codelet_hc2cbdft_2)),
SOLVTAB(X(codelet_hc2cbdft_4)),
SOLVTAB(X(codelet_hc2cbdft_6)),
SOLVTAB(X(codelet_hc2cbdft_8)),
SOLVTAB(X(codelet_hc2cbdft_10)),
SOLVTAB(X(codelet_hc2cbdft_12)),
SOLVTAB(X(codelet_hc2cbdft_16)),
SOLVTAB(X(codelet_hc2cbdft_32)),
SOLVTAB(X(codelet_hc2cbdft_20)),
SOLVTAB(X(codelet_hc2cbdft2_4)),
SOLVTAB(X(codelet_hc2cbdft2_8)),
SOLVTAB(X(codelet_hc2cbdft2_16)),
SOLVTAB(X(codelet_hc2cbdft2_32)),
SOLVTAB(X(codelet_hc2cbdft2_20)),
SOLVTAB_END
};

View File

@@ -0,0 +1,858 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:55 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include rdft/scalar/hb.h */
/*
* This function contains 196 FP additions, 134 FP multiplications,
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
* 93 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb2_16 (branch taken when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined): in-place radix-16 kernel emitted by gen_hc2hc with -sign 1, -dif
 * and -twiddle-log3 (see the "Generated by" comment above).
 *
 * For every m in [mb, me) the loop body reads 16 values from cr[WS(rs, k)]
 * and 16 values from ci[WS(rs, k)], k = 0..15, combines them, and writes 32
 * results back to the same slots.  Every output except slot 0 is multiplied
 * by factors derived from the eight twiddle values W[0..7].
 *
 *   cr, ci  data pointers; cr moves by +ms and ci by -ms per iteration
 *   W       twiddle array; starts at W + (mb - 1) * 8, advances 8 per iteration
 *   rs      stride argument handed to WS() to address slot k
 *   mb, me  half-open iteration range
 *   ms      per-iteration step applied to cr and ci
 */
static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
E Tv, Tw, T2z, T2C, TB, TF, Ty, Tz, T1V, TA, T2G, T3Q, T3C, T3g, T3L;
E T30, T3m, T3z, T3w, T3s, T1X, T1Y, T2u, T2c, T2p, TE, TG, T1G, T1o, T1D;
/* Load W[0..7] and precompute the twiddle products used as output
 * multipliers further down. */
{
E T3f, T3l, T2F, T3r, T2Z, T3v, TD, Tx;
Tv = W[0];
Tw = W[2];
Tx = Tv * Tw;
T2z = W[6];
T3f = Tv * T2z;
T2C = W[7];
T3l = Tv * T2C;
TB = W[4];
T2F = Tv * TB;
T3r = Tw * TB;
TF = W[5];
T2Z = Tv * TF;
T3v = Tw * TF;
Ty = W[1];
Tz = W[3];
TD = Tv * Tz;
T1V = FMA(Ty, Tz, Tx);
TA = FNMS(Ty, Tz, Tx);
T2G = FNMS(Ty, TF, T2F);
T3Q = FMA(Tz, TB, T3v);
T3C = FNMS(Ty, TB, T2Z);
T3g = FMA(Ty, T2C, T3f);
T3L = FNMS(Tz, TF, T3r);
T30 = FMA(Ty, TB, T2Z);
T3m = FNMS(Ty, T2z, T3l);
T3z = FMA(Ty, TF, T2F);
T3w = FNMS(Tz, TB, T3v);
T3s = FMA(Tz, TF, T3r);
{
E T1W, T2b, TC, T1n;
T1W = T1V * TB;
T2b = T1V * TF;
T1X = FNMS(Ty, Tw, TD);
T1Y = FNMS(T1X, TF, T1W);
T2u = FNMS(T1X, TB, T2b);
T2c = FMA(T1X, TB, T2b);
T2p = FMA(T1X, TF, T1W);
TC = TA * TB;
T1n = TA * TF;
TE = FMA(Ty, Tw, TD);
TG = FNMS(TE, TF, TC);
T1G = FNMS(TE, TB, T1n);
T1o = FMA(TE, TB, T1n);
T1D = FMA(TE, TF, TC);
}
}
{
E TL, T1Z, T2d, T1t, T31, T34, T3n, T3D, T3E, T3R, T1w, T20, Tf, T3M, T2L;
E T3h, TW, T2e, T3G, T3H, T3N, T2Q, T36, T2V, T37, Tu, T3S, T18, T1z, T24;
E T2g, T27, T2h, T1j, T1y;
/* First 16 of the 32 loads, combined into pairwise sums and differences. */
{
E T3, TH, TU, T2I, T1s, T32, T6, T1p, Ta, TM, TK, T33, TP, T2J, Td;
E TR;
{
E T1, T2, TS, TT;
T1 = cr[0];
T2 = ci[WS(rs, 7)];
T3 = T1 + T2;
TH = T1 - T2;
TS = ci[WS(rs, 9)];
TT = cr[WS(rs, 14)];
TU = TS + TT;
T2I = TS - TT;
}
{
E T1q, T1r, T4, T5;
T1q = ci[WS(rs, 15)];
T1r = cr[WS(rs, 8)];
T1s = T1q + T1r;
T32 = T1q - T1r;
T4 = cr[WS(rs, 4)];
T5 = ci[WS(rs, 3)];
T6 = T4 + T5;
T1p = T4 - T5;
}
{
E T8, T9, TI, TJ;
T8 = cr[WS(rs, 2)];
T9 = ci[WS(rs, 5)];
Ta = T8 + T9;
TM = T8 - T9;
TI = ci[WS(rs, 11)];
TJ = cr[WS(rs, 12)];
TK = TI + TJ;
T33 = TI - TJ;
}
{
E TN, TO, Tb, Tc;
TN = ci[WS(rs, 13)];
TO = cr[WS(rs, 10)];
TP = TN + TO;
T2J = TN - TO;
Tb = ci[WS(rs, 1)];
Tc = cr[WS(rs, 6)];
Td = Tb + Tc;
TR = Tb - Tc;
}
TL = TH - TK;
T1Z = TH + TK;
T2d = T1s - T1p;
T1t = T1p + T1s;
T31 = Ta - Td;
T34 = T32 - T33;
T3n = T34 - T31;
{
E T1u, T1v, T7, Te;
T3D = T32 + T33;
T3E = T2J + T2I;
T3R = T3D - T3E;
T1u = TM + TP;
T1v = TR + TU;
T1w = T1u - T1v;
T20 = T1u + T1v;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
T3M = T7 - Te;
{
E T2H, T2K, TQ, TV;
T2H = T3 - T6;
T2K = T2I - T2J;
T2L = T2H + T2K;
T3h = T2H - T2K;
TQ = TM - TP;
TV = TR - TU;
TW = TQ + TV;
T2e = TQ - TV;
}
}
}
/* Remaining 16 loads; the KP414213562 combinations are formed here too. */
{
E Ti, T1e, T1c, T2N, T1h, T2O, Tl, T19, Tp, T13, T11, T2S, T16, T2T, Ts;
E TY, T2M, T2P;
{
E Tg, Th, T1a, T1b;
Tg = cr[WS(rs, 1)];
Th = ci[WS(rs, 6)];
Ti = Tg + Th;
T1e = Tg - Th;
T1a = ci[WS(rs, 14)];
T1b = cr[WS(rs, 9)];
T1c = T1a + T1b;
T2N = T1a - T1b;
}
{
E T1f, T1g, Tj, Tk;
T1f = ci[WS(rs, 10)];
T1g = cr[WS(rs, 13)];
T1h = T1f + T1g;
T2O = T1f - T1g;
Tj = cr[WS(rs, 5)];
Tk = ci[WS(rs, 2)];
Tl = Tj + Tk;
T19 = Tj - Tk;
}
{
E Tn, To, TZ, T10;
Tn = ci[0];
To = cr[WS(rs, 7)];
Tp = Tn + To;
T13 = Tn - To;
TZ = ci[WS(rs, 8)];
T10 = cr[WS(rs, 15)];
T11 = TZ + T10;
T2S = TZ - T10;
}
{
E T14, T15, Tq, Tr;
T14 = ci[WS(rs, 12)];
T15 = cr[WS(rs, 11)];
T16 = T14 + T15;
T2T = T14 - T15;
Tq = cr[WS(rs, 3)];
Tr = ci[WS(rs, 4)];
Ts = Tq + Tr;
TY = Tq - Tr;
}
T3G = T2N + T2O;
T3H = T2S + T2T;
T3N = T3H - T3G;
T2M = Ti - Tl;
T2P = T2N - T2O;
T2Q = T2M - T2P;
T36 = T2M + T2P;
{
E T2R, T2U, Tm, Tt;
T2R = Tp - Ts;
T2U = T2S - T2T;
T2V = T2R + T2U;
T37 = T2U - T2R;
Tm = Ti + Tl;
Tt = Tp + Ts;
Tu = Tm + Tt;
T3S = Tm - Tt;
}
{
E T12, T17, T22, T23;
T12 = TY - T11;
T17 = T13 - T16;
T18 = FNMS(KP414213562, T17, T12);
T1z = FMA(KP414213562, T12, T17);
T22 = T1c - T19;
T23 = T1e + T1h;
T24 = FNMS(KP414213562, T23, T22);
T2g = FMA(KP414213562, T22, T23);
}
{
E T25, T26, T1d, T1i;
T25 = TY + T11;
T26 = T13 + T16;
T27 = FNMS(KP414213562, T26, T25);
T2h = FMA(KP414213562, T25, T26);
T1d = T19 + T1c;
T1i = T1e - T1h;
T1j = FMA(KP414213562, T1i, T1d);
T1y = FNMS(KP414213562, T1d, T1i);
}
}
/* Every input has been read by now; from here on results overwrite the
 * cr/ci slots.  Slot 0 (cr[0] here, ci[0] just below) takes no twiddle
 * multiply. */
cr[0] = Tf + Tu;
{
E T3B, T3K, T3F, T3I, T3J, T3A;
T3A = Tf - Tu;
T3B = T3z * T3A;
T3K = T3C * T3A;
T3F = T3D + T3E;
T3I = T3G + T3H;
T3J = T3F - T3I;
ci[0] = T3F + T3I;
ci[WS(rs, 8)] = FMA(T3z, T3J, T3K);
cr[WS(rs, 8)] = FNMS(T3C, T3J, T3B);
}
{
E T3O, T3P, T3T, T3U;
T3O = T3M - T3N;
T3P = T3L * T3O;
T3T = T3R - T3S;
T3U = T3L * T3T;
cr[WS(rs, 12)] = FNMS(T3Q, T3T, T3P);
ci[WS(rs, 12)] = FMA(T3Q, T3O, T3U);
}
{
E T3V, T3W, T3X, T3Y;
T3V = T3M + T3N;
T3W = TA * T3V;
T3X = T3S + T3R;
T3Y = TA * T3X;
cr[WS(rs, 4)] = FNMS(TE, T3X, T3W);
ci[WS(rs, 4)] = FMA(TE, T3V, T3Y);
}
{
E T3j, T3t, T3p, T3x, T3i, T3o;
T3i = T37 - T36;
T3j = FNMS(KP707106781, T3i, T3h);
T3t = FMA(KP707106781, T3i, T3h);
T3o = T2Q - T2V;
T3p = FNMS(KP707106781, T3o, T3n);
T3x = FMA(KP707106781, T3o, T3n);
{
E T3k, T3q, T3u, T3y;
T3k = T3g * T3j;
cr[WS(rs, 14)] = FNMS(T3m, T3p, T3k);
T3q = T3g * T3p;
ci[WS(rs, 14)] = FMA(T3m, T3j, T3q);
T3u = T3s * T3t;
cr[WS(rs, 6)] = FNMS(T3w, T3x, T3u);
T3y = T3s * T3x;
ci[WS(rs, 6)] = FMA(T3w, T3t, T3y);
}
}
{
E T2X, T3b, T39, T3d, T2W, T35, T38;
T2W = T2Q + T2V;
T2X = FNMS(KP707106781, T2W, T2L);
T3b = FMA(KP707106781, T2W, T2L);
T35 = T31 + T34;
T38 = T36 + T37;
T39 = FNMS(KP707106781, T38, T35);
T3d = FMA(KP707106781, T38, T35);
{
E T2Y, T3a, T3c, T3e;
T2Y = T2G * T2X;
cr[WS(rs, 10)] = FNMS(T30, T39, T2Y);
T3a = T30 * T2X;
ci[WS(rs, 10)] = FMA(T2G, T39, T3a);
T3c = T1V * T3b;
cr[WS(rs, 2)] = FNMS(T1X, T3d, T3c);
T3e = T1X * T3b;
ci[WS(rs, 2)] = FMA(T1V, T3d, T3e);
}
}
{
E T29, T2l, T2j, T2n;
{
E T21, T28, T2f, T2i;
T21 = FNMS(KP707106781, T20, T1Z);
T28 = T24 + T27;
T29 = FMA(KP923879532, T28, T21);
T2l = FNMS(KP923879532, T28, T21);
T2f = FMA(KP707106781, T2e, T2d);
T2i = T2g - T2h;
T2j = FNMS(KP923879532, T2i, T2f);
T2n = FMA(KP923879532, T2i, T2f);
}
{
E T2a, T2k, T2m, T2o;
T2a = T1Y * T29;
cr[WS(rs, 11)] = FNMS(T2c, T2j, T2a);
T2k = T2c * T29;
ci[WS(rs, 11)] = FMA(T1Y, T2j, T2k);
T2m = Tw * T2l;
cr[WS(rs, 3)] = FNMS(Tz, T2n, T2m);
T2o = Tz * T2l;
ci[WS(rs, 3)] = FMA(Tw, T2n, T2o);
}
}
{
E T1l, T1E, T1B, T1H;
{
E TX, T1k, T1x, T1A;
TX = FNMS(KP707106781, TW, TL);
T1k = T18 - T1j;
T1l = FNMS(KP923879532, T1k, TX);
T1E = FMA(KP923879532, T1k, TX);
T1x = FNMS(KP707106781, T1w, T1t);
T1A = T1y - T1z;
T1B = FNMS(KP923879532, T1A, T1x);
T1H = FMA(KP923879532, T1A, T1x);
}
{
E T1m, T1C, T1F, T1I;
T1m = TG * T1l;
cr[WS(rs, 13)] = FNMS(T1o, T1B, T1m);
T1C = T1o * T1l;
ci[WS(rs, 13)] = FMA(TG, T1B, T1C);
T1F = T1D * T1E;
cr[WS(rs, 5)] = FNMS(T1G, T1H, T1F);
T1I = T1G * T1E;
ci[WS(rs, 5)] = FMA(T1D, T1H, T1I);
}
}
{
E T2s, T2A, T2x, T2D;
{
E T2q, T2r, T2v, T2w;
T2q = FMA(KP707106781, T20, T1Z);
T2r = T2g + T2h;
T2s = FNMS(KP923879532, T2r, T2q);
T2A = FMA(KP923879532, T2r, T2q);
T2v = FNMS(KP707106781, T2e, T2d);
T2w = T27 - T24;
T2x = FMA(KP923879532, T2w, T2v);
T2D = FNMS(KP923879532, T2w, T2v);
}
{
E T2t, T2y, T2B, T2E;
T2t = T2p * T2s;
cr[WS(rs, 7)] = FNMS(T2u, T2x, T2t);
T2y = T2p * T2x;
ci[WS(rs, 7)] = FMA(T2u, T2s, T2y);
T2B = T2z * T2A;
cr[WS(rs, 15)] = FNMS(T2C, T2D, T2B);
T2E = T2z * T2D;
ci[WS(rs, 15)] = FMA(T2C, T2A, T2E);
}
}
{
E T1L, T1R, T1P, T1T;
{
E T1J, T1K, T1N, T1O;
T1J = FMA(KP707106781, TW, TL);
T1K = T1y + T1z;
T1L = FNMS(KP923879532, T1K, T1J);
T1R = FMA(KP923879532, T1K, T1J);
T1N = FMA(KP707106781, T1w, T1t);
T1O = T1j + T18;
T1P = FNMS(KP923879532, T1O, T1N);
T1T = FMA(KP923879532, T1O, T1N);
}
{
E T1M, T1Q, T1S, T1U;
T1M = TB * T1L;
cr[WS(rs, 9)] = FNMS(TF, T1P, T1M);
T1Q = TB * T1P;
ci[WS(rs, 9)] = FMA(TF, T1L, T1Q);
T1S = Tv * T1R;
cr[WS(rs, 1)] = FNMS(Ty, T1T, T1S);
T1U = Tv * T1T;
ci[WS(rs, 1)] = FMA(Ty, T1R, T1U);
}
}
}
}
}
}
/*
 * Twiddle-instruction table referenced by the hb2_16 descriptor below.
 * Four TW_CEXP entries (last field 1, 3, 9, 15) followed by one TW_NEXT entry.
 * NOTE(review): the kernel reads eight values W[0..7] per iteration, so each
 * TW_CEXP entry presumably supplies two consecutive W values -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner together with hb2_16: size 16 (presumably
 * the radix -- confirm against hc2hc_desc), the kernel's name string, its
 * twiddle table, the genus, and the counts { 104, 42, 92, 0 }, which repeat
 * the "additions, multiplications, fused multiply/add" figures from the
 * generator's summary comment for this variant.
 */
static const hc2hc_desc desc = { 16, "hb2_16", twinstr, &GENUS, { 104, 42, 92, 0 } };
/* Passes the hb2_16 kernel and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb2_16)(planner *p)
{
     X(khc2hc_register)(p, hb2_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include rdft/scalar/hb.h */
/*
* This function contains 196 FP additions, 108 FP multiplications,
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
* 80 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb2_16 (branch taken when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place radix-16 kernel emitted by
 * gen_hc2hc with -sign 1, -dif and -twiddle-log3 (see the "Generated by"
 * comment above).
 *
 * For every m in [mb, me) the loop body reads 16 values from cr[WS(rs, k)]
 * and 16 values from ci[WS(rs, k)], k = 0..15, combines them, and writes 32
 * results back to the same slots.  Every output except slot 0 is multiplied
 * by factors derived from the eight twiddle values W[0..7].
 *
 *   cr, ci  data pointers; cr moves by +ms and ci by -ms per iteration
 *   W       twiddle array; starts at W + (mb - 1) * 8, advances 8 per iteration
 *   rs      stride argument handed to WS() to address slot k
 *   mb, me  half-open iteration range
 *   ms      per-iteration step applied to cr and ci
 */
static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
/* Load W[0..7] and precompute the twiddle products used as output
 * multipliers further down. */
{
E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
{
E T1m, T1s, T1o, T1r;
Tv = W[0];
Ty = W[1];
T1l = W[2];
T1n = W[3];
T1m = Tv * T1l;
T1s = Ty * T1l;
T1o = Ty * T1n;
T1r = Tv * T1n;
T1p = T1m + T1o;
T1t = T1r - T1s;
T27 = T1r + T1s;
T25 = T1m - T1o;
Tz = W[5];
TA = Ty * Tz;
T1J = T1l * Tz;
T15 = Tv * Tz;
T1G = T1n * Tz;
Tw = W[4];
Tx = Tv * Tw;
T1K = T1n * Tw;
T16 = Ty * Tw;
T1F = T1l * Tw;
}
TB = Tx - TA;
T21 = T1J + T1K;
T1P = T15 - T16;
T1H = T1F + T1G;
T1X = T1F - T1G;
T17 = T15 + T16;
T1L = T1J - T1K;
T1N = Tx + TA;
T1v = W[6];
T1w = W[7];
T1x = FMA(Tv, T1v, Ty * T1w);
T1B = FNMS(Ty, T1v, Tv * T1w);
{
E T2D, T2E, T29, T2a;
T2D = T25 * Tz;
T2E = T27 * Tw;
T2F = T2D + T2E;
T2T = T2D - T2E;
T29 = T25 * Tw;
T2a = T27 * Tz;
T2b = T29 - T2a;
T2R = T29 + T2a;
}
{
E T3h, T3i, T33, T34;
T3h = T1p * Tz;
T3i = T1t * Tw;
T3j = T3h + T3i;
T3x = T3h - T3i;
T33 = T1p * Tw;
T34 = T1t * Tz;
T35 = T33 - T34;
T3t = T33 + T34;
}
}
{
E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
E T3e, T3o;
/* The 32 loads happen in the next four blocks, eight per block, each
 * followed by the sums/differences built from that group. */
{
E T3, T2c, T1e, T2d, T6, T2G, T1b, T2H;
{
E T1, T2, T1c, T1d;
T1 = cr[0];
T2 = ci[WS(rs, 7)];
T3 = T1 + T2;
T2c = T1 - T2;
T1c = ci[WS(rs, 11)];
T1d = cr[WS(rs, 12)];
T1e = T1c - T1d;
T2d = T1c + T1d;
}
{
E T4, T5, T19, T1a;
T4 = cr[WS(rs, 4)];
T5 = ci[WS(rs, 3)];
T6 = T4 + T5;
T2G = T4 - T5;
T19 = ci[WS(rs, 15)];
T1a = cr[WS(rs, 8)];
T1b = T19 - T1a;
T2H = T19 + T1a;
}
T7 = T3 + T6;
T36 = T2c + T2d;
T3k = T2H - T2G;
TC = T3 - T6;
T1f = T1b - T1e;
T2e = T2c - T2d;
T2I = T2G + T2H;
T1Q = T1b + T1e;
}
{
E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
{
E T8, T9, TG, TH;
T8 = cr[WS(rs, 2)];
T9 = ci[WS(rs, 5)];
Ta = T8 + T9;
T2f = T8 - T9;
TG = ci[WS(rs, 13)];
TH = cr[WS(rs, 10)];
TI = TG - TH;
T2g = TG + TH;
}
{
E Tb, Tc, TD, TE;
Tb = ci[WS(rs, 1)];
Tc = cr[WS(rs, 6)];
Td = Tb + Tc;
T2i = Tb - Tc;
TD = ci[WS(rs, 9)];
TE = cr[WS(rs, 14)];
TF = TD - TE;
T2j = TD + TE;
}
Te = Ta + Td;
TJ = TF - TI;
T1R = TI + TF;
T18 = Ta - Td;
{
E T2J, T2K, T2h, T2k;
T2J = T2f + T2g;
T2K = T2i + T2j;
T2L = KP707106781 * (T2J - T2K);
T37 = KP707106781 * (T2J + T2K);
T2h = T2f - T2g;
T2k = T2i - T2j;
T2l = KP707106781 * (T2h + T2k);
T3l = KP707106781 * (T2h - T2k);
}
}
{
E Ti, T2x, TR, T2y, Tl, T2u, TO, T2v, TL, TS;
{
E Tg, Th, TP, TQ;
Tg = cr[WS(rs, 1)];
Th = ci[WS(rs, 6)];
Ti = Tg + Th;
T2x = Tg - Th;
TP = ci[WS(rs, 10)];
TQ = cr[WS(rs, 13)];
TR = TP - TQ;
T2y = TP + TQ;
}
{
E Tj, Tk, TM, TN;
Tj = cr[WS(rs, 5)];
Tk = ci[WS(rs, 2)];
Tl = Tj + Tk;
T2u = Tj - Tk;
TM = ci[WS(rs, 14)];
TN = cr[WS(rs, 9)];
TO = TM - TN;
T2v = TM + TN;
}
Tm = Ti + Tl;
T1T = TO + TR;
TL = Ti - Tl;
TS = TO - TR;
TT = TL - TS;
T1h = TL + TS;
{
E T2w, T2z, T39, T3a;
T2w = T2u + T2v;
T2z = T2x - T2y;
T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
T39 = T2x + T2y;
T3a = T2v - T2u;
T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
T3n = FMA(KP382683432, T3a, KP923879532 * T39);
}
}
{
E Tp, T2q, T10, T2r, Ts, T2n, TX, T2o, TU, T11;
{
E Tn, To, TY, TZ;
Tn = ci[0];
To = cr[WS(rs, 7)];
Tp = Tn + To;
T2q = Tn - To;
TY = ci[WS(rs, 12)];
TZ = cr[WS(rs, 11)];
T10 = TY - TZ;
T2r = TY + TZ;
}
{
E Tq, Tr, TV, TW;
Tq = cr[WS(rs, 3)];
Tr = ci[WS(rs, 4)];
Ts = Tq + Tr;
T2n = Tq - Tr;
TV = ci[WS(rs, 8)];
TW = cr[WS(rs, 15)];
TX = TV - TW;
T2o = TV + TW;
}
Tt = Tp + Ts;
T1U = TX + T10;
TU = Tp - Ts;
T11 = TX - T10;
T12 = TU + T11;
T1i = T11 - TU;
{
E T2p, T2s, T3c, T3d;
T2p = T2n - T2o;
T2s = T2q - T2r;
T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
T3c = T2q + T2r;
T3d = T2n + T2o;
T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
}
}
/* Every input has been read by now; from here on results overwrite the
 * cr/ci slots.  cr[0] and ci[0] take no twiddle multiply. */
{
E Tf, Tu, T1O, T1S, T1V, T1W;
Tf = T7 + Te;
Tu = Tm + Tt;
T1O = Tf - Tu;
T1S = T1Q + T1R;
T1V = T1T + T1U;
T1W = T1S - T1V;
cr[0] = Tf + Tu;
ci[0] = T1S + T1V;
cr[WS(rs, 8)] = FNMS(T1P, T1W, T1N * T1O);
ci[WS(rs, 8)] = FMA(T1P, T1O, T1N * T1W);
}
{
E T3g, T3r, T3q, T3s;
{
E T38, T3f, T3m, T3p;
T38 = T36 - T37;
T3f = T3b + T3e;
T3g = T38 - T3f;
T3r = T38 + T3f;
T3m = T3k + T3l;
T3p = T3n - T3o;
T3q = T3m - T3p;
T3s = T3m + T3p;
}
cr[WS(rs, 11)] = FNMS(T3j, T3q, T35 * T3g);
ci[WS(rs, 11)] = FMA(T3j, T3g, T35 * T3q);
cr[WS(rs, 3)] = FNMS(T1n, T3s, T1l * T3r);
ci[WS(rs, 3)] = FMA(T1n, T3r, T1l * T3s);
}
{
E T3w, T3B, T3A, T3C;
{
E T3u, T3v, T3y, T3z;
T3u = T36 + T37;
T3v = T3n + T3o;
T3w = T3u - T3v;
T3B = T3u + T3v;
T3y = T3k - T3l;
T3z = T3b - T3e;
T3A = T3y + T3z;
T3C = T3y - T3z;
}
cr[WS(rs, 7)] = FNMS(T3x, T3A, T3t * T3w);
ci[WS(rs, 7)] = FMA(T3t, T3A, T3x * T3w);
cr[WS(rs, 15)] = FNMS(T1w, T3C, T1v * T3B);
ci[WS(rs, 15)] = FMA(T1v, T3C, T1w * T3B);
}
{
E T14, T1q, T1k, T1u;
{
E TK, T13, T1g, T1j;
TK = TC + TJ;
T13 = KP707106781 * (TT + T12);
T14 = TK - T13;
T1q = TK + T13;
T1g = T18 + T1f;
T1j = KP707106781 * (T1h + T1i);
T1k = T1g - T1j;
T1u = T1g + T1j;
}
cr[WS(rs, 10)] = FNMS(T17, T1k, TB * T14);
ci[WS(rs, 10)] = FMA(T17, T14, TB * T1k);
cr[WS(rs, 2)] = FNMS(T1t, T1u, T1p * T1q);
ci[WS(rs, 2)] = FMA(T1t, T1q, T1p * T1u);
}
{
E T1A, T1I, T1E, T1M;
{
E T1y, T1z, T1C, T1D;
T1y = TC - TJ;
T1z = KP707106781 * (T1i - T1h);
T1A = T1y - T1z;
T1I = T1y + T1z;
T1C = T1f - T18;
T1D = KP707106781 * (TT - T12);
T1E = T1C - T1D;
T1M = T1C + T1D;
}
cr[WS(rs, 14)] = FNMS(T1B, T1E, T1x * T1A);
ci[WS(rs, 14)] = FMA(T1x, T1E, T1B * T1A);
cr[WS(rs, 6)] = FNMS(T1L, T1M, T1H * T1I);
ci[WS(rs, 6)] = FMA(T1H, T1M, T1L * T1I);
}
{
E T2C, T2S, T2Q, T2U;
{
E T2m, T2B, T2M, T2P;
T2m = T2e - T2l;
T2B = T2t - T2A;
T2C = T2m - T2B;
T2S = T2m + T2B;
T2M = T2I - T2L;
T2P = T2N - T2O;
T2Q = T2M - T2P;
T2U = T2M + T2P;
}
cr[WS(rs, 13)] = FNMS(T2F, T2Q, T2b * T2C);
ci[WS(rs, 13)] = FMA(T2F, T2C, T2b * T2Q);
cr[WS(rs, 5)] = FNMS(T2T, T2U, T2R * T2S);
ci[WS(rs, 5)] = FMA(T2T, T2S, T2R * T2U);
}
{
E T2X, T31, T30, T32;
{
E T2V, T2W, T2Y, T2Z;
T2V = T2e + T2l;
T2W = T2N + T2O;
T2X = T2V - T2W;
T31 = T2V + T2W;
T2Y = T2I + T2L;
T2Z = T2A + T2t;
T30 = T2Y - T2Z;
T32 = T2Y + T2Z;
}
cr[WS(rs, 9)] = FNMS(Tz, T30, Tw * T2X);
ci[WS(rs, 9)] = FMA(Tw, T30, Tz * T2X);
cr[WS(rs, 1)] = FNMS(Ty, T32, Tv * T31);
ci[WS(rs, 1)] = FMA(Tv, T32, Ty * T31);
}
{
E T20, T26, T24, T28;
{
E T1Y, T1Z, T22, T23;
T1Y = T7 - Te;
T1Z = T1U - T1T;
T20 = T1Y - T1Z;
T26 = T1Y + T1Z;
T22 = T1Q - T1R;
T23 = Tm - Tt;
T24 = T22 - T23;
T28 = T23 + T22;
}
cr[WS(rs, 12)] = FNMS(T21, T24, T1X * T20);
ci[WS(rs, 12)] = FMA(T1X, T24, T21 * T20);
cr[WS(rs, 4)] = FNMS(T27, T28, T25 * T26);
ci[WS(rs, 4)] = FMA(T25, T28, T27 * T26);
}
}
}
}
}
/*
 * Twiddle-instruction table referenced by the hb2_16 descriptor below.
 * Four TW_CEXP entries (last field 1, 3, 9, 15) followed by one TW_NEXT entry.
 * NOTE(review): the kernel reads eight values W[0..7] per iteration, so each
 * TW_CEXP entry presumably supplies two consecutive W values -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner together with hb2_16: size 16 (presumably
 * the radix -- confirm against hc2hc_desc), the kernel's name string, its
 * twiddle table, the genus, and the counts { 156, 68, 40, 0 }, which repeat
 * the "additions, multiplications, fused multiply/add" figures from the
 * generator's summary comment for this variant.
 */
static const hc2hc_desc desc = { 16, "hb2_16", twinstr, &GENUS, { 156, 68, 40, 0 } };
/* Passes the hb2_16 kernel and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb2_16)(planner *p)
{
     X(khc2hc_register)(p, hb2_16, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,194 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:55 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hb2_4 -include rdft/scalar/hb.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 33 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb2_4 (branch taken when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined): in-place radix-4 kernel from gen_hc2hc (-sign 1, -dif,
 * -twiddle-log3; see the "Generated by" comment above).
 *
 * For every m in [mb, me): read cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..3,
 * combine them, and store eight results back into the same slots.  Slot 0 is
 * stored as a plain sum; slots 1..3 are multiplied by factors built from
 * W[0..3].
 *
 *   cr, ci  data pointers; cr moves by +ms and ci by -ms per iteration
 *   W       twiddle array; starts at W + (mb - 1) * 4, advances 4 per iteration
 *   rs      stride argument handed to WS() to address slot k
 *   mb, me  half-open iteration range
 *   ms      per-iteration step applied to cr and ci
 */
static void hb2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Twiddle values and the two derived factors used for slot 2. */
          const E w0 = W[0];
          const E w3 = W[3];
          const E w2 = W[2];
          const E w0w2 = w0 * w2;
          const E w0w3 = w0 * w3;
          const E w1 = W[1];
          const E w02p13 = FMA(w1, w3, w0w2);
          const E w03m12 = FNMS(w1, w2, w0w3);

          /* All eight inputs are read before anything is written back. */
          const E i3 = ci[WS(rs, 3)];
          const E r2 = cr[WS(rs, 2)];
          const E i2 = ci[WS(rs, 2)];
          const E r3 = cr[WS(rs, 3)];
          const E r0 = cr[0];
          const E i1 = ci[WS(rs, 1)];
          const E r1 = cr[WS(rs, 1)];
          const E i0 = ci[0];

          /* Pairwise sums (s..) and differences (d..) of the inputs. */
          const E s32 = i3 + r2;
          const E d32 = i3 - r2;
          const E s23 = i2 + r3;
          const E d23 = i2 - r3;
          const E s01 = r0 + i1;
          const E d01 = r0 - i1;
          const E s10 = r1 + i0;
          const E d10 = r1 - i0;

          /* Operand pairs (p, q) for output slots 1..3. */
          const E p2 = s01 - s10;
          const E q2 = d32 - d23;
          const E q3 = s32 - d10;
          const E p3 = d01 + s23;
          const E p1 = d01 - s23;
          const E q1 = d10 + s32;

          /* Each product is kept in its own temporary, as in the generated
           * code, so the FMA/FNMS calls receive the same operands. */
          const E w0p1 = w0 * p1;
          const E w0q1 = w0 * q1;
          const E w2q3 = w2 * q3;
          const E w2p3 = w2 * p3;
          const E ap2 = w02p13 * p2;
          const E bp2 = w03m12 * p2;

          cr[0] = s01 + s10;
          ci[0] = d32 + d23;
          cr[WS(rs, 1)] = FNMS(w1, q1, w0p1);
          ci[WS(rs, 1)] = FMA(w1, p1, w0q1);
          ci[WS(rs, 3)] = FMA(w3, p3, w2q3);
          cr[WS(rs, 3)] = FNMS(w3, q3, w2p3);
          cr[WS(rs, 2)] = FNMS(w03m12, q2, ap2);
          ci[WS(rs, 2)] = FMA(w02p13, q2, bp2);
     }
}
/*
 * Twiddle-instruction table referenced by the hb2_4 descriptor below.
 * Two TW_CEXP entries (last field 1, 3) followed by one TW_NEXT entry.
 * NOTE(review): the kernel reads four values W[0..3] per iteration, so each
 * TW_CEXP entry presumably supplies two consecutive W values -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner together with hb2_4: size 4 (presumably
 * the radix -- confirm against hc2hc_desc), the kernel's name string, its
 * twiddle table, the genus, and the counts { 16, 8, 8, 0 }, which repeat the
 * "additions, multiplications, fused multiply/add" figures from the
 * generator's summary comment for this variant.
 */
static const hc2hc_desc desc = { 4, "hb2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
/* Passes the hb2_4 kernel and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb2_4)(planner *p)
{
     X(khc2hc_register)(p, hb2_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hb2_4 -include rdft/scalar/hb.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb2_4 (branch taken when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place radix-4 kernel from
 * gen_hc2hc (-sign 1, -dif, -twiddle-log3; see the "Generated by" comment
 * above).
 *
 * For every m in [mb, me): read cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..3,
 * combine them, and store eight results back into the same slots.  Slot 0 is
 * stored as a plain sum; slots 1..3 are multiplied by factors built from
 * W[0..3].
 *
 *   cr, ci  data pointers; cr moves by +ms and ci by -ms per iteration
 *   W       twiddle array; starts at W + (mb - 1) * 4, advances 4 per iteration
 *   rs      stride argument handed to WS() to address slot k
 *   mb, me  half-open iteration range
 *   ms      per-iteration step applied to cr and ci
 */
static void hb2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Twiddle values and the two derived factors used for slot 2. */
          const E w0 = W[0];
          const E w1 = W[1];
          const E w2 = W[2];
          const E w3 = W[3];
          const E w02p13 = FMA(w0, w2, w1 * w3);
          const E w03m12 = FNMS(w1, w2, w0 * w3);

          /* All eight inputs are read before anything is written back. */
          const E r0 = cr[0];
          const E i1 = ci[WS(rs, 1)];
          const E r1 = cr[WS(rs, 1)];
          const E i0 = ci[0];
          const E i3 = ci[WS(rs, 3)];
          const E r2 = cr[WS(rs, 2)];
          const E i2 = ci[WS(rs, 2)];
          const E r3 = cr[WS(rs, 3)];

          /* Pairwise sums (s..) and differences (d..) of the inputs. */
          const E s01 = r0 + i1;
          const E d01 = r0 - i1;
          const E s10 = r1 + i0;
          const E d10 = r1 - i0;
          const E d32 = i3 - r2;
          const E s32 = i3 + r2;
          const E d23 = i2 - r3;
          const E s23 = i2 + r3;

          /* Operand pairs (p, q) for output slots 1..3. */
          const E p2 = s01 - s10;
          const E q2 = d32 - d23;
          const E p1 = d01 - s23;
          const E q1 = d10 + s32;
          const E p3 = d01 + s23;
          const E q3 = s32 - d10;

          cr[0] = s01 + s10;
          ci[0] = d32 + d23;
          cr[WS(rs, 2)] = FNMS(w03m12, q2, w02p13 * p2);
          ci[WS(rs, 2)] = FMA(w03m12, p2, w02p13 * q2);
          cr[WS(rs, 1)] = FNMS(w1, q1, w0 * p1);
          ci[WS(rs, 1)] = FMA(w0, q1, w1 * p1);
          cr[WS(rs, 3)] = FNMS(w3, q3, w2 * p3);
          ci[WS(rs, 3)] = FMA(w2, q3, w3 * p3);
     }
}
/*
 * Twiddle-instruction table referenced by the hb2_4 descriptor below.
 * Two TW_CEXP entries (last field 1, 3) followed by one TW_NEXT entry.
 * NOTE(review): the kernel reads four values W[0..3] per iteration, so each
 * TW_CEXP entry presumably supplies two consecutive W values -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner together with hb2_4: size 4 (presumably
 * the radix -- confirm against hc2hc_desc), the kernel's name string, its
 * twiddle table, the genus, and the counts { 16, 8, 8, 0 }, which repeat the
 * "additions, multiplications, fused multiply/add" figures from the
 * generator's summary comment for this variant.
 */
static const hc2hc_desc desc = { 4, "hb2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
/* Passes the hb2_4 kernel and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb2_4)(planner *p)
{
     X(khc2hc_register)(p, hb2_4, &desc);
}
#endif

View File

@@ -0,0 +1,279 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:57 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 5 -dif -name hb2_5 -include rdft/scalar/hb.h */
/*
* This function contains 44 FP additions, 40 FP multiplications,
* (or, 14 additions, 10 multiplications, 30 fused multiply/add),
* 37 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb2_5 (branch taken when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined): in-place radix-5 kernel emitted by gen_hc2hc with -sign 1, -dif
 * and -twiddle-log3 (see the "Generated by" comment above).
 *
 * For every m in [mb, me) the loop body reads ten values (cr[0],
 * cr[WS(rs, 1..4)], ci[0], ci[WS(rs, 1..4)]), combines them, and writes ten
 * results back to the same slots.  Every output except slot 0 is multiplied
 * by factors derived from the four twiddle values W[0..3].
 *
 *   cr, ci  data pointers; cr moves by +ms and ci by -ms per iteration
 *   W       twiddle array; starts at W + (mb - 1) * 4, advances 4 per iteration
 *   rs      stride argument handed to WS() to address slot k
 *   mb, me  half-open iteration range
 *   ms      per-iteration step applied to cr and ci
 */
static void hb2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
E T9, TB, Tz, Tm, TC, TO, TG, TJ, TA, TF;
/* Load W[0..3] and precompute the twiddle products used as output
 * multipliers further down. */
T9 = W[0];
TB = W[3];
Tz = W[2];
TA = T9 * Tz;
TF = T9 * TB;
Tm = W[1];
TC = FNMS(Tm, TB, TA);
TO = FNMS(Tm, Tz, TF);
TG = FMA(Tm, Tz, TF);
TJ = FMA(Tm, TB, TA);
{
E T1, Tb, TQ, Tw, T8, Ta, Tn, Tj, TL, Ts, Tq, Tr;
/* Loads of cr[0], cr[1], ci[0], cr[2], ci[1] and their combinations. */
{
E T4, Tu, T7, Tv;
T1 = cr[0];
{
E T2, T3, T5, T6;
T2 = cr[WS(rs, 1)];
T3 = ci[0];
T4 = T2 + T3;
Tu = T2 - T3;
T5 = cr[WS(rs, 2)];
T6 = ci[WS(rs, 1)];
T7 = T5 + T6;
Tv = T5 - T6;
}
Tb = T4 - T7;
TQ = FNMS(KP618033988, Tu, Tv);
Tw = FMA(KP618033988, Tv, Tu);
T8 = T4 + T7;
Ta = FNMS(KP250000000, T8, T1);
}
/* Loads of ci[4], ci[3], cr[4], ci[2], cr[3] and their combinations. */
{
E Tf, To, Ti, Tp;
Tn = ci[WS(rs, 4)];
{
E Td, Te, Tg, Th;
Td = ci[WS(rs, 3)];
Te = cr[WS(rs, 4)];
Tf = Td + Te;
To = Td - Te;
Tg = ci[WS(rs, 2)];
Th = cr[WS(rs, 3)];
Ti = Tg + Th;
Tp = Tg - Th;
}
Tj = FMA(KP618033988, Ti, Tf);
TL = FNMS(KP618033988, Tf, Ti);
Ts = To - Tp;
Tq = To + Tp;
Tr = FNMS(KP250000000, Tq, Tn);
}
/* Every input has been read by now; from here on results overwrite the
 * cr/ci slots.  cr[0] and ci[0] take no twiddle multiply. */
cr[0] = T1 + T8;
ci[0] = Tn + Tq;
{
E Tk, TD, Tx, TH, Tc, Tt;
Tc = FMA(KP559016994, Tb, Ta);
Tk = FNMS(KP951056516, Tj, Tc);
TD = FMA(KP951056516, Tj, Tc);
Tt = FMA(KP559016994, Ts, Tr);
Tx = FMA(KP951056516, Tw, Tt);
TH = FNMS(KP951056516, Tw, Tt);
{
E Tl, Ty, TE, TI;
Tl = T9 * Tk;
cr[WS(rs, 1)] = FNMS(Tm, Tx, Tl);
Ty = Tm * Tk;
ci[WS(rs, 1)] = FMA(T9, Tx, Ty);
TE = TC * TD;
cr[WS(rs, 4)] = FNMS(TG, TH, TE);
TI = TG * TD;
ci[WS(rs, 4)] = FMA(TC, TH, TI);
}
}
{
E TM, TT, TR, TV, TK, TP;
TK = FNMS(KP559016994, Tb, Ta);
TM = FMA(KP951056516, TL, TK);
TT = FNMS(KP951056516, TL, TK);
TP = FNMS(KP559016994, Ts, Tr);
TR = FNMS(KP951056516, TQ, TP);
TV = FMA(KP951056516, TQ, TP);
{
E TN, TS, TU, TW;
TN = TJ * TM;
cr[WS(rs, 2)] = FNMS(TO, TR, TN);
TS = TO * TM;
ci[WS(rs, 2)] = FMA(TJ, TR, TS);
TU = Tz * TT;
cr[WS(rs, 3)] = FNMS(TB, TV, TU);
TW = TB * TT;
ci[WS(rs, 3)] = FMA(Tz, TV, TW);
}
}
}
}
}
}
/*
 * Twiddle-instruction table referenced by the hb2_5 descriptor below.
 * Two TW_CEXP entries (last field 1, 3) followed by one TW_NEXT entry.
 * NOTE(review): the kernel reads four values W[0..3] per iteration, so each
 * TW_CEXP entry presumably supplies two consecutive W values -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner together with hb2_5: size 5 (presumably
 * the radix -- confirm against hc2hc_desc), the kernel's name string, its
 * twiddle table, the genus, and the counts { 14, 10, 30, 0 }, which repeat
 * the "additions, multiplications, fused multiply/add" figures from the
 * generator's summary comment for this variant.
 */
static const hc2hc_desc desc = { 5, "hb2_5", twinstr, &GENUS, { 14, 10, 30, 0 } };
/* Passes the hb2_5 kernel and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb2_5)(planner *p)
{
     X(khc2hc_register)(p, hb2_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 5 -dif -name hb2_5 -include rdft/scalar/hb.h */
/*
* This function contains 44 FP additions, 32 FP multiplications,
* (or, 30 additions, 18 multiplications, 14 fused multiply/add),
* 33 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb2_5 (branch taken when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place radix-5 kernel emitted by
 * gen_hc2hc with -sign 1, -dif and -twiddle-log3 (see the "Generated by"
 * comment above).
 *
 * For every m in [mb, me) the loop body reads ten values (cr[0],
 * cr[WS(rs, 1..4)], ci[0], ci[WS(rs, 1..4)]), combines them, and writes ten
 * results back to the same slots.  Every output except slot 0 is multiplied
 * by factors derived from the four twiddle values W[0..3].
 *
 *   cr, ci  data pointers; cr moves by +ms and ci by -ms per iteration
 *   W       twiddle array; starts at W + (mb - 1) * 4, advances 4 per iteration
 *   rs      stride argument handed to WS() to address slot k
 *   mb, me  half-open iteration range
 *   ms      per-iteration step applied to cr and ci
 */
static void hb2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
E Th, Tk, Ti, Tl, Tn, TP, Tx, TN;
/* Load W[0..3] and precompute the twiddle products used as output
 * multipliers further down. */
{
E Tj, Tw, Tm, Tv;
Th = W[0];
Tk = W[1];
Ti = W[2];
Tl = W[3];
Tj = Th * Ti;
Tw = Tk * Ti;
Tm = Tk * Tl;
Tv = Th * Tl;
Tn = Tj + Tm;
TP = Tv + Tw;
Tx = Tv - Tw;
TN = Tj - Tm;
}
{
E T1, Tp, TK, TA, T8, To, T9, Tt, TI, TC, Tg, TB;
/* Loads of cr[0], cr[1], ci[0], cr[2], ci[1] and their combinations. */
{
E T4, Ty, T7, Tz;
T1 = cr[0];
{
E T2, T3, T5, T6;
T2 = cr[WS(rs, 1)];
T3 = ci[0];
T4 = T2 + T3;
Ty = T2 - T3;
T5 = cr[WS(rs, 2)];
T6 = ci[WS(rs, 1)];
T7 = T5 + T6;
Tz = T5 - T6;
}
Tp = KP559016994 * (T4 - T7);
TK = FMA(KP951056516, Ty, KP587785252 * Tz);
TA = FNMS(KP951056516, Tz, KP587785252 * Ty);
T8 = T4 + T7;
To = FNMS(KP250000000, T8, T1);
}
/* Loads of ci[4], ci[3], cr[4], ci[2], cr[3] and their combinations. */
{
E Tc, Tr, Tf, Ts;
T9 = ci[WS(rs, 4)];
{
E Ta, Tb, Td, Te;
Ta = ci[WS(rs, 3)];
Tb = cr[WS(rs, 4)];
Tc = Ta - Tb;
Tr = Ta + Tb;
Td = ci[WS(rs, 2)];
Te = cr[WS(rs, 3)];
Tf = Td - Te;
Ts = Td + Te;
}
Tt = FNMS(KP951056516, Ts, KP587785252 * Tr);
TI = FMA(KP951056516, Tr, KP587785252 * Ts);
TC = KP559016994 * (Tc - Tf);
Tg = Tc + Tf;
TB = FNMS(KP250000000, Tg, T9);
}
/* Every input has been read by now; from here on results overwrite the
 * cr/ci slots.  cr[0] and ci[0] take no twiddle multiply. */
cr[0] = T1 + T8;
ci[0] = T9 + Tg;
{
E Tu, TF, TE, TG, Tq, TD;
Tq = To - Tp;
Tu = Tq - Tt;
TF = Tq + Tt;
TD = TB - TC;
TE = TA + TD;
TG = TD - TA;
cr[WS(rs, 2)] = FNMS(Tx, TE, Tn * Tu);
ci[WS(rs, 2)] = FMA(Tn, TE, Tx * Tu);
cr[WS(rs, 3)] = FNMS(Tl, TG, Ti * TF);
ci[WS(rs, 3)] = FMA(Ti, TG, Tl * TF);
}
{
E TJ, TO, TM, TQ, TH, TL;
TH = Tp + To;
TJ = TH - TI;
TO = TH + TI;
TL = TC + TB;
TM = TK + TL;
TQ = TL - TK;
cr[WS(rs, 1)] = FNMS(Tk, TM, Th * TJ);
ci[WS(rs, 1)] = FMA(Th, TM, Tk * TJ);
cr[WS(rs, 4)] = FNMS(TP, TQ, TN * TO);
ci[WS(rs, 4)] = FMA(TN, TQ, TP * TO);
}
}
}
}
}
/*
 * Twiddle-instruction table referenced by the hb2_5 descriptor below.
 * Two TW_CEXP entries (last field 1, 3) followed by one TW_NEXT entry.
 * NOTE(review): the kernel reads four values W[0..3] per iteration, so each
 * TW_CEXP entry presumably supplies two consecutive W values -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner together with hb2_5: size 5 (presumably
 * the radix -- confirm against hc2hc_desc), the kernel's name string, its
 * twiddle table, the genus, and the counts { 30, 18, 14, 0 }, which repeat
 * the "additions, multiplications, fused multiply/add" figures from the
 * generator's summary comment for this variant.
 */
static const hc2hc_desc desc = { 5, "hb2_5", twinstr, &GENUS, { 30, 18, 14, 0 } };
/* Passes the hb2_5 kernel and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb2_5)(planner *p)
{
     X(khc2hc_register)(p, hb2_5, &desc);
}
#endif

View File

@@ -0,0 +1,387 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:55 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include rdft/scalar/hb.h */
/*
* This function contains 74 FP additions, 50 FP multiplications,
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
* 47 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb2_8 (branch taken when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined): in-place radix-8 kernel emitted by gen_hc2hc with -sign 1, -dif
 * and -twiddle-log3 (see the "Generated by" comment above).
 *
 * For every m in [mb, me) the loop body reads 8 values from cr[WS(rs, k)]
 * and 8 values from ci[WS(rs, k)], k = 0..7, combines them, and writes 16
 * results back to the same slots.  Every output except slot 0 is multiplied
 * by factors derived from the six twiddle values W[0..5].
 *
 *   cr, ci  data pointers; cr moves by +ms and ci by -ms per iteration
 *   W       twiddle array; starts at W + (mb - 1) * 6, advances 6 per iteration
 *   rs      stride argument handed to WS() to address slot k
 *   mb, me  half-open iteration range
 *   ms      per-iteration step applied to cr and ci
 */
static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E Tf, Tg, Tl, Tp, Ti, Tj, Tk, T1b, T1u, T1e, T1o, To, Tq, TK;
/* Load W[0..5] and precompute the twiddle products used as output
 * multipliers further down. */
{
E Th, T1n, T1t, Tn, Tm, TJ;
Tf = W[0];
Tg = W[2];
Th = Tf * Tg;
Tl = W[4];
T1n = Tf * Tl;
Tp = W[5];
T1t = Tf * Tp;
Ti = W[1];
Tj = W[3];
Tn = Tf * Tj;
Tk = FMA(Ti, Tj, Th);
T1b = FNMS(Ti, Tj, Th);
T1u = FNMS(Ti, Tl, T1t);
T1e = FMA(Ti, Tg, Tn);
T1o = FMA(Ti, Tp, T1n);
Tm = Tk * Tl;
TJ = Tk * Tp;
To = FNMS(Ti, Tg, Tn);
Tq = FMA(To, Tp, Tm);
TK = FNMS(To, Tl, TJ);
}
{
E T7, T1p, T1v, Tv, TP, T13, T1h, TZ, Te, T1k, T1w, T1q, TQ, TR, T10;
E TG, T14;
/* First 8 of the 16 loads, combined into pairwise sums and differences. */
{
E T3, Tr, TO, T1f, T6, TL, Tu, T1g;
{
E T1, T2, TM, TN;
T1 = cr[0];
T2 = ci[WS(rs, 3)];
T3 = T1 + T2;
Tr = T1 - T2;
TM = ci[WS(rs, 7)];
TN = cr[WS(rs, 4)];
TO = TM + TN;
T1f = TM - TN;
}
{
E T4, T5, Ts, Tt;
T4 = cr[WS(rs, 2)];
T5 = ci[WS(rs, 1)];
T6 = T4 + T5;
TL = T4 - T5;
Ts = ci[WS(rs, 5)];
Tt = cr[WS(rs, 6)];
Tu = Ts + Tt;
T1g = Ts - Tt;
}
T7 = T3 + T6;
T1p = T3 - T6;
T1v = T1f - T1g;
Tv = Tr - Tu;
TP = TL + TO;
T13 = TO - TL;
T1h = T1f + T1g;
TZ = Tr + Tu;
}
/* Remaining 8 loads, same pattern. */
{
E Ta, Tw, TE, T1j, Td, TB, Tz, T1i, TA, TF;
{
E T8, T9, TC, TD;
T8 = cr[WS(rs, 1)];
T9 = ci[WS(rs, 2)];
Ta = T8 + T9;
Tw = T8 - T9;
TC = ci[WS(rs, 4)];
TD = cr[WS(rs, 7)];
TE = TC + TD;
T1j = TC - TD;
}
{
E Tb, Tc, Tx, Ty;
Tb = ci[0];
Tc = cr[WS(rs, 3)];
Td = Tb + Tc;
TB = Tb - Tc;
Tx = ci[WS(rs, 6)];
Ty = cr[WS(rs, 5)];
Tz = Tx + Ty;
T1i = Tx - Ty;
}
Te = Ta + Td;
T1k = T1i + T1j;
T1w = Ta - Td;
T1q = T1j - T1i;
TQ = Tw + Tz;
TR = TB + TE;
T10 = TQ + TR;
TA = Tw - Tz;
TF = TB - TE;
TG = TA + TF;
T14 = TA - TF;
}
/* Every input has been read by now; from here on results overwrite the
 * cr/ci slots.  cr[0] and ci[0] take no twiddle multiply. */
cr[0] = T7 + Te;
ci[0] = T1h + T1k;
{
E T11, T12, T15, T16;
T11 = FNMS(KP707106781, T10, TZ);
T12 = Tg * T11;
T15 = FMA(KP707106781, T14, T13);
T16 = Tg * T15;
cr[WS(rs, 3)] = FNMS(Tj, T15, T12);
ci[WS(rs, 3)] = FMA(Tj, T11, T16);
}
{
E T1z, T1A, T1B, T1C;
T1z = T1p + T1q;
T1A = Tk * T1z;
T1B = T1w + T1v;
T1C = Tk * T1B;
cr[WS(rs, 2)] = FNMS(To, T1B, T1A);
ci[WS(rs, 2)] = FMA(To, T1z, T1C);
}
{
E T17, T18, T19, T1a;
T17 = FMA(KP707106781, T10, TZ);
T18 = Tl * T17;
T19 = FNMS(KP707106781, T14, T13);
T1a = Tl * T19;
cr[WS(rs, 7)] = FNMS(Tp, T19, T18);
ci[WS(rs, 7)] = FMA(Tp, T17, T1a);
}
{
E T1l, T1d, T1m, T1c;
T1l = T1h - T1k;
T1c = T7 - Te;
T1d = T1b * T1c;
T1m = T1e * T1c;
cr[WS(rs, 4)] = FNMS(T1e, T1l, T1d);
ci[WS(rs, 4)] = FMA(T1b, T1l, T1m);
}
{
E T1r, T1s, T1x, T1y;
T1r = T1p - T1q;
T1s = T1o * T1r;
T1x = T1v - T1w;
T1y = T1o * T1x;
cr[WS(rs, 6)] = FNMS(T1u, T1x, T1s);
ci[WS(rs, 6)] = FMA(T1u, T1r, T1y);
}
{
E TT, TX, TW, TY, TI, TU, TS, TV, TH;
TS = TQ - TR;
TT = FNMS(KP707106781, TS, TP);
TX = FMA(KP707106781, TS, TP);
TV = FMA(KP707106781, TG, Tv);
TW = Tf * TV;
TY = Ti * TV;
TH = FNMS(KP707106781, TG, Tv);
TI = Tq * TH;
TU = TK * TH;
cr[WS(rs, 5)] = FNMS(TK, TT, TI);
ci[WS(rs, 5)] = FMA(Tq, TT, TU);
cr[WS(rs, 1)] = FNMS(Ti, TX, TW);
ci[WS(rs, 1)] = FMA(Tf, TX, TY);
}
}
}
}
}
/*
 * Twiddle request list referenced by desc below: three TW_CEXP entries
 * followed by a TW_NEXT entry.
 * NOTE(review): presumably the last field of each TW_CEXP entry is the
 * exponent of the requested factor (1, 3, 7) -- confirm against the
 * tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the hb2_8 kernel: size 8, its name string, the twiddle
 * list above, &GENUS, and a four-number trailing initializer.
 * NOTE(review): the trailing numbers look like operation counts
 * (adds, muls, fused multiply/adds, other) -- confirm against hc2hc_desc.
 */
static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, { 44, 20, 30, 0 } };
/* Registers the hb2_8 kernel and its descriptor with planner p. */
void X(codelet_hb2_8) (planner *p) {
X(khc2hc_register) (p, hb2_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include rdft/scalar/hb.h */
/*
* This function contains 74 FP additions, 44 FP multiplications,
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
* 46 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb2_8: in-place size-8 kernel over the arrays cr and ci. This is the
 * variant in the #else branch, generated without -fma.
 *
 * For each m in [mb, me) the body reads elements 0..7 of cr and of ci
 * (element k lives at offset WS(rs, k)), combines them with sums,
 * differences and the constant KP707106781, multiplies outputs 1..7 by a
 * two-component factor built from W[0..5], and stores the sixteen results
 * back to the same locations. After each iteration cr advances by ms, ci
 * moves back by ms and W advances by 6; W is first offset by (mb - 1) * 6.
 *
 * cr, ci : data arrays, read and overwritten
 * W      : factor table, six values consumed per iteration
 * rs     : stride handed to WS() to locate element k
 * mb, me : half-open iteration range
 * ms     : per-iteration pointer step (+ms for cr, -ms for ci)
 *
 * NOTE(review): comments below that read a pair of expressions as a complex
 * product assume FMA(a, b, c) == a * b + c and FNMS(a, b, c) == c - a * b
 * -- confirm against the project headers.
 */
static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E Tf, Ti, Tg, Tj, Tl, Tp, TP, TR, TF, TG, TH, T15, TL, TT;
{
E Th, To, Tk, Tn;
/* Only three factor pairs are loaded from W: (Tf, Ti), (Tg, Tj), (TF, TG).
 * The other four pairs are derived from their products. */
Tf = W[0];
Ti = W[1];
Tg = W[2];
Tj = W[3];
Th = Tf * Tg;
To = Ti * Tg;
Tk = Ti * Tj;
Tn = Tf * Tj;
/* Tl + i*Tp = (Tf + i*Ti) * (Tg + i*Tj) */
Tl = Th - Tk;
Tp = Tn + To;
/* TP + i*TR = (Tg + i*Tj) * conj(Tf + i*Ti) */
TP = Th + Tk;
TR = Tn - To;
TF = W[4];
TG = W[5];
/* TH + i*TL = (TF + i*TG) * conj(Tf + i*Ti);
 * TT + i*T15 = (TF + i*TG) * conj(TP + i*TR) */
TH = FMA(Tf, TF, Ti * TG);
T15 = FNMS(TR, TF, TP * TG);
TL = FNMS(Ti, TF, Tf * TG);
TT = FMA(TP, TF, TR * TG);
}
{
E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
E TN, Tm, TE;
/* First group of loads: cr[0], cr[2], cr[4], cr[6] and ci[1], ci[3],
 * ci[5], ci[7], reduced to sums and differences. */
{
E T3, TU, Tv, TV, T6, T16, Ts, T17;
{
E T1, T2, Tt, Tu;
T1 = cr[0];
T2 = ci[WS(rs, 3)];
T3 = T1 + T2;
TU = T1 - T2;
Tt = ci[WS(rs, 5)];
Tu = cr[WS(rs, 6)];
Tv = Tt - Tu;
TV = Tt + Tu;
}
{
E T4, T5, Tq, Tr;
T4 = cr[WS(rs, 2)];
T5 = ci[WS(rs, 1)];
T6 = T4 + T5;
T16 = T4 - T5;
Tq = ci[WS(rs, 7)];
Tr = cr[WS(rs, 4)];
Ts = Tq - Tr;
T17 = Tq + Tr;
}
T7 = T3 + T6;
T1f = TU + TV;
T1i = T17 - T16;
Tw = Ts + Tv;
TI = T3 - T6;
TW = TU - TV;
T18 = T16 + T17;
TM = Ts - Tv;
}
/* Second group of loads: cr[1], cr[3], cr[5], cr[7] and ci[0], ci[2],
 * ci[4], ci[6]. */
{
E Ta, TX, TC, T11, Td, T10, Tz, TY;
{
E T8, T9, TA, TB;
T8 = cr[WS(rs, 1)];
T9 = ci[WS(rs, 2)];
Ta = T8 + T9;
TX = T8 - T9;
TA = ci[WS(rs, 4)];
TB = cr[WS(rs, 7)];
TC = TA - TB;
T11 = TA + TB;
}
{
E Tb, Tc, Tx, Ty;
Tb = ci[0];
Tc = cr[WS(rs, 3)];
Td = Tb + Tc;
T10 = Tb - Tc;
Tx = ci[WS(rs, 6)];
Ty = cr[WS(rs, 5)];
Tz = Tx - Ty;
TY = Tx + Ty;
}
Te = Ta + Td;
T19 = TX + TY;
T1a = T10 + T11;
TD = Tz + TC;
TJ = TC - Tz;
TZ = TX - TY;
T12 = T10 - T11;
TN = Ta - Td;
}
/* Element 0 is stored without any W factor. */
cr[0] = T7 + Te;
ci[0] = Tw + TD;
Tm = T7 - Te;
TE = Tw - TD;
/* Element 4: (Tm + i*TE) times (Tl + i*Tp). */
cr[WS(rs, 4)] = FNMS(Tp, TE, Tl * Tm);
ci[WS(rs, 4)] = FMA(Tp, Tm, Tl * TE);
{
E TQ, TS, TK, TO;
/* Elements 2 and 6, scaled by (TP, TR) and (TH, TL) respectively. */
TQ = TI + TJ;
TS = TN + TM;
cr[WS(rs, 2)] = FNMS(TR, TS, TP * TQ);
ci[WS(rs, 2)] = FMA(TP, TS, TR * TQ);
TK = TI - TJ;
TO = TM - TN;
cr[WS(rs, 6)] = FNMS(TL, TO, TH * TK);
ci[WS(rs, 6)] = FMA(TH, TO, TL * TK);
}
{
E T1h, T1l, T1k, T1m, T1g, T1j;
/* Elements 3 and 7, scaled by the loaded pairs (Tg, Tj) and (TF, TG). */
T1g = KP707106781 * (T19 + T1a);
T1h = T1f - T1g;
T1l = T1f + T1g;
T1j = KP707106781 * (TZ - T12);
T1k = T1i + T1j;
T1m = T1i - T1j;
cr[WS(rs, 3)] = FNMS(Tj, T1k, Tg * T1h);
ci[WS(rs, 3)] = FMA(Tg, T1k, Tj * T1h);
cr[WS(rs, 7)] = FNMS(TG, T1m, TF * T1l);
ci[WS(rs, 7)] = FMA(TF, T1m, TG * T1l);
}
{
E T14, T1d, T1c, T1e, T13, T1b;
/* Elements 5 and 1, scaled by (TT, T15) and the loaded pair (Tf, Ti). */
T13 = KP707106781 * (TZ + T12);
T14 = TW - T13;
T1d = TW + T13;
T1b = KP707106781 * (T19 - T1a);
T1c = T18 - T1b;
T1e = T18 + T1b;
cr[WS(rs, 5)] = FNMS(T15, T1c, TT * T14);
ci[WS(rs, 5)] = FMA(T15, T14, TT * T1c);
cr[WS(rs, 1)] = FNMS(Ti, T1e, Tf * T1d);
ci[WS(rs, 1)] = FMA(Ti, T1d, Tf * T1e);
}
}
}
}
}
/*
 * Twiddle request list referenced by desc below: three TW_CEXP entries
 * followed by a TW_NEXT entry.
 * NOTE(review): presumably the last field of each TW_CEXP entry is the
 * exponent of the requested factor (1, 3, 7), matching the three pairs
 * hb2_8 loads from W[0..5] -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the hb2_8 kernel: size 8, its name string, the twiddle
 * list above and &GENUS. The first three numbers of the trailing
 * initializer equal the add / multiply / fused multiply-add figures quoted
 * in the generator's header comment for this variant (56, 26, 18).
 */
static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, { 56, 26, 18, 0 } };
/* Registers the hb2_8 kernel and its descriptor with planner p. */
void X(codelet_hb2_8) (planner *p) {
X(khc2hc_register) (p, hb2_8, &desc);
}
#endif

View File

@@ -0,0 +1,513 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hb_10 -include rdft/scalar/hb.h */
/*
* This function contains 102 FP additions, 72 FP multiplications,
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
* 47 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_10: in-place size-10 kernel over the arrays cr and ci. This is the
 * variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * For each m in [mb, me) the body reads elements 0..9 of cr and of ci
 * (element k lives at offset WS(rs, k)), combines them using the four KP*
 * constants, multiplies output k (k = 1..9) by the factor pair
 * (W[2k - 2], W[2k - 1]), and stores the twenty results back to the same
 * locations. After each iteration cr advances by ms, ci moves back by ms
 * and W advances by 18; W is first offset by (mb - 1) * 18.
 *
 * cr, ci : data arrays, read and overwritten
 * W      : factor table, eighteen values consumed per iteration
 * rs     : stride handed to WS() to locate element k
 * mb, me : half-open iteration range
 * ms     : per-iteration pointer step (+ms for cr, -ms for ci)
 *
 * NOTE(review): reading each store pair as "data times (Wre + i*Wim)"
 * assumes FMA(a, b, c) == a * b + c and FNMS(a, b, c) == c - a * b --
 * confirm against the project headers.
 */
static void hb_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
E TH, T1B, TB, T11, T1E, T1G, TK, TM, T1x, T1V, T3, T1g, Tl, T1I, T1J;
E TO, TP, T1p, Ti, Tk, T1n, T1o, TF, TG;
TF = ci[WS(rs, 9)];
TG = cr[WS(rs, 5)];
TH = TF - TG;
T1B = TF + TG;
/* Loads of ci[5..8] and cr[6..9], reduced to sums and differences. */
{
E Tp, T1u, Tz, T1s, Ts, T1v, Tw, T1r;
{
E Tn, To, Tx, Ty;
Tn = ci[WS(rs, 5)];
To = cr[WS(rs, 9)];
Tp = Tn - To;
T1u = Tn + To;
Tx = ci[WS(rs, 6)];
Ty = cr[WS(rs, 8)];
Tz = Tx - Ty;
T1s = Tx + Ty;
}
{
E Tq, Tr, Tu, Tv;
Tq = ci[WS(rs, 8)];
Tr = cr[WS(rs, 6)];
Ts = Tq - Tr;
T1v = Tq + Tr;
Tu = ci[WS(rs, 7)];
Tv = cr[WS(rs, 7)];
Tw = Tu - Tv;
T1r = Tu + Tv;
}
{
E Tt, TA, T1C, T1D;
Tt = Tp - Ts;
TA = Tw - Tz;
TB = FNMS(KP618033988, TA, Tt);
T11 = FMA(KP618033988, Tt, TA);
T1C = T1r - T1s;
T1D = T1u - T1v;
T1E = T1C + T1D;
T1G = T1C - T1D;
}
{
E TI, TJ, T1t, T1w;
TI = Tw + Tz;
TJ = Tp + Ts;
TK = TI + TJ;
TM = TI - TJ;
T1t = T1r + T1s;
T1w = T1u + T1v;
T1x = FMA(KP618033988, T1w, T1t);
T1V = FNMS(KP618033988, T1t, T1w);
}
}
/* Loads of cr[0..4] and ci[0..4], reduced to sums and differences. */
{
E Td, T1k, Tg, T1l, Th, T1m, T6, T1h, T9, T1i, Ta, T1j, T1, T2;
T1 = cr[0];
T2 = ci[WS(rs, 4)];
T3 = T1 + T2;
T1g = T1 - T2;
{
E Tb, Tc, Te, Tf;
Tb = cr[WS(rs, 4)];
Tc = ci[0];
Td = Tb + Tc;
T1k = Tb - Tc;
Te = ci[WS(rs, 3)];
Tf = cr[WS(rs, 1)];
Tg = Te + Tf;
T1l = Te - Tf;
}
Th = Td + Tg;
T1m = T1k + T1l;
{
E T4, T5, T7, T8;
T4 = cr[WS(rs, 2)];
T5 = ci[WS(rs, 2)];
T6 = T4 + T5;
T1h = T4 - T5;
T7 = ci[WS(rs, 1)];
T8 = cr[WS(rs, 3)];
T9 = T7 + T8;
T1i = T7 - T8;
}
Ta = T6 + T9;
T1j = T1h + T1i;
Tl = Ta - Th;
T1I = T1h - T1i;
T1J = T1k - T1l;
TO = Td - Tg;
TP = T6 - T9;
T1p = T1j - T1m;
Ti = Ta + Th;
Tk = FNMS(KP250000000, Ti, T3);
T1n = T1j + T1m;
T1o = FNMS(KP250000000, T1n, T1g);
}
/* Element 0 is stored without any W factor. */
cr[0] = T3 + Ti;
ci[0] = TH + TK;
/* Element 5, scaled by (W[8], W[9]). */
{
E T2d, T29, T2b, T2c, T2e, T2a;
T2d = T1B + T1E;
T2a = T1g + T1n;
T29 = W[8];
T2b = T29 * T2a;
T2c = W[9];
T2e = T2c * T2a;
cr[WS(rs, 5)] = FNMS(T2c, T2d, T2b);
ci[WS(rs, 5)] = FMA(T29, T2d, T2e);
}
/* Even elements 2, 6, 8, 4. */
{
E TQ, T16, TC, TU, TN, T15, T12, T1a, Tm, TL, T10;
TQ = FNMS(KP618033988, TP, TO);
T16 = FMA(KP618033988, TO, TP);
Tm = FNMS(KP559016994, Tl, Tk);
TC = FMA(KP951056516, TB, Tm);
TU = FNMS(KP951056516, TB, Tm);
TL = FNMS(KP250000000, TK, TH);
TN = FNMS(KP559016994, TM, TL);
T15 = FMA(KP559016994, TM, TL);
T10 = FMA(KP559016994, Tl, Tk);
T12 = FMA(KP951056516, T11, T10);
T1a = FNMS(KP951056516, T11, T10);
{
E TR, TE, TS, Tj, TD;
TR = FNMS(KP951056516, TQ, TN);
TE = W[3];
TS = TE * TC;
Tj = W[2];
TD = Tj * TC;
cr[WS(rs, 2)] = FNMS(TE, TR, TD);
ci[WS(rs, 2)] = FMA(Tj, TR, TS);
}
{
E T1d, T1c, T1e, T19, T1b;
T1d = FMA(KP951056516, T16, T15);
T1c = W[11];
T1e = T1c * T1a;
T19 = W[10];
T1b = T19 * T1a;
cr[WS(rs, 6)] = FNMS(T1c, T1d, T1b);
ci[WS(rs, 6)] = FMA(T19, T1d, T1e);
}
{
E TX, TW, TY, TT, TV;
TX = FMA(KP951056516, TQ, TN);
TW = W[15];
TY = TW * TU;
TT = W[14];
TV = TT * TU;
cr[WS(rs, 8)] = FNMS(TW, TX, TV);
ci[WS(rs, 8)] = FMA(TT, TX, TY);
}
{
E T17, T14, T18, TZ, T13;
T17 = FNMS(KP951056516, T16, T15);
T14 = W[7];
T18 = T14 * T12;
TZ = W[6];
T13 = TZ * T12;
cr[WS(rs, 4)] = FNMS(T14, T17, T13);
ci[WS(rs, 4)] = FMA(TZ, T17, T18);
}
}
/* Odd elements 1, 7, 9, 3. */
{
E T1K, T20, T1y, T1O, T1H, T1Z, T1W, T24, T1q, T1F, T1U;
T1K = FMA(KP618033988, T1J, T1I);
T20 = FNMS(KP618033988, T1I, T1J);
T1q = FMA(KP559016994, T1p, T1o);
T1y = FNMS(KP951056516, T1x, T1q);
T1O = FMA(KP951056516, T1x, T1q);
T1F = FNMS(KP250000000, T1E, T1B);
T1H = FMA(KP559016994, T1G, T1F);
T1Z = FNMS(KP559016994, T1G, T1F);
T1U = FNMS(KP559016994, T1p, T1o);
T1W = FNMS(KP951056516, T1V, T1U);
T24 = FMA(KP951056516, T1V, T1U);
{
E T1L, T1A, T1M, T1f, T1z;
T1L = FMA(KP951056516, T1K, T1H);
T1A = W[1];
T1M = T1A * T1y;
T1f = W[0];
T1z = T1f * T1y;
cr[WS(rs, 1)] = FNMS(T1A, T1L, T1z);
ci[WS(rs, 1)] = FMA(T1f, T1L, T1M);
}
{
E T27, T26, T28, T23, T25;
T27 = FNMS(KP951056516, T20, T1Z);
T26 = W[13];
T28 = T26 * T24;
T23 = W[12];
T25 = T23 * T24;
cr[WS(rs, 7)] = FNMS(T26, T27, T25);
ci[WS(rs, 7)] = FMA(T23, T27, T28);
}
{
E T1R, T1Q, T1S, T1N, T1P;
T1R = FNMS(KP951056516, T1K, T1H);
T1Q = W[17];
T1S = T1Q * T1O;
T1N = W[16];
T1P = T1N * T1O;
cr[WS(rs, 9)] = FNMS(T1Q, T1R, T1P);
ci[WS(rs, 9)] = FMA(T1N, T1R, T1S);
}
{
E T21, T1Y, T22, T1T, T1X;
T21 = FMA(KP951056516, T20, T1Z);
T1Y = W[5];
T22 = T1Y * T1W;
T1T = W[4];
T1X = T1T * T1W;
cr[WS(rs, 3)] = FNMS(T1Y, T21, T1X);
ci[WS(rs, 3)] = FMA(T1T, T21, T22);
}
}
}
}
}
/*
 * Twiddle request list referenced by desc below: one TW_FULL entry followed
 * by a TW_NEXT entry.
 * NOTE(review): presumably TW_FULL with last field 10 requests the complete
 * factor set for size 10, i.e. the 18 W values hb_10 consumes per iteration
 * -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the hb_10 kernel: size 10, its name string, the twiddle
 * list above and &GENUS. The first three numbers of the trailing
 * initializer equal the add / multiply / fused multiply-add figures quoted
 * in the generator's header comment for this variant (48, 18, 54).
 */
static const hc2hc_desc desc = { 10, "hb_10", twinstr, &GENUS, { 48, 18, 54, 0 } };
/* Registers the hb_10 kernel and its descriptor with planner p. */
void X(codelet_hb_10) (planner *p) {
X(khc2hc_register) (p, hb_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hb_10 -include rdft/scalar/hb.h */
/*
* This function contains 102 FP additions, 60 FP multiplications,
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
* 41 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_10: in-place size-10 kernel over the arrays cr and ci. This is the
 * variant in the #else branch, generated without -fma.
 *
 * For each m in [mb, me) the body reads elements 0..9 of cr and of ci
 * (element k lives at offset WS(rs, k)), combines them using the four KP*
 * constants, multiplies output k (k = 1..9) by the factor pair
 * (W[2k - 2], W[2k - 1]), and stores the twenty results back to the same
 * locations. After each iteration cr advances by ms, ci moves back by ms
 * and W advances by 18; W is first offset by (mb - 1) * 18.
 *
 * cr, ci : data arrays, read and overwritten
 * W      : factor table, eighteen values consumed per iteration
 * rs     : stride handed to WS() to locate element k
 * mb, me : half-open iteration range
 * ms     : per-iteration pointer step (+ms for cr, -ms for ci)
 *
 * NOTE(review): reading each store pair as "data times (Wre + i*Wim)"
 * assumes FMA(a, b, c) == a * b + c and FNMS(a, b, c) == c - a * b --
 * confirm against the project headers.
 */
static void hb_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
E T3, T18, TE, TF, T1B, T1A, T1f, T1t, Ti, Tl, TJ, T1i, Tt, TA, T1w;
E T1v, T1p, T1E, TM, TO;
{
E T1, T2, TH, TI;
/* Loads of cr[0..4] and ci[0..4], reduced to sums and differences. */
T1 = cr[0];
T2 = ci[WS(rs, 4)];
T3 = T1 + T2;
T18 = T1 - T2;
{
E T6, T19, Tg, T1d, T9, T1a, Td, T1c;
{
E T4, T5, Te, Tf;
T4 = cr[WS(rs, 2)];
T5 = ci[WS(rs, 2)];
T6 = T4 + T5;
T19 = T4 - T5;
Te = ci[WS(rs, 3)];
Tf = cr[WS(rs, 1)];
Tg = Te + Tf;
T1d = Te - Tf;
}
{
E T7, T8, Tb, Tc;
T7 = ci[WS(rs, 1)];
T8 = cr[WS(rs, 3)];
T9 = T7 + T8;
T1a = T7 - T8;
Tb = cr[WS(rs, 4)];
Tc = ci[0];
Td = Tb + Tc;
T1c = Tb - Tc;
}
TE = T6 - T9;
TF = Td - Tg;
T1B = T1c - T1d;
T1A = T19 - T1a;
{
E T1b, T1e, Ta, Th;
T1b = T19 + T1a;
T1e = T1c + T1d;
T1f = T1b + T1e;
T1t = KP559016994 * (T1b - T1e);
Ta = T6 + T9;
Th = Td + Tg;
Ti = Ta + Th;
Tl = KP559016994 * (Ta - Th);
}
}
/* Loads of cr[5..9] and ci[5..9], reduced to sums and differences. */
TH = ci[WS(rs, 9)];
TI = cr[WS(rs, 5)];
TJ = TH - TI;
T1i = TH + TI;
{
E Tp, T1j, Tz, T1n, Ts, T1k, Tw, T1m;
{
E Tn, To, Tx, Ty;
Tn = ci[WS(rs, 7)];
To = cr[WS(rs, 7)];
Tp = Tn - To;
T1j = Tn + To;
Tx = ci[WS(rs, 8)];
Ty = cr[WS(rs, 6)];
Tz = Tx - Ty;
T1n = Tx + Ty;
}
{
E Tq, Tr, Tu, Tv;
Tq = ci[WS(rs, 6)];
Tr = cr[WS(rs, 8)];
Ts = Tq - Tr;
T1k = Tq + Tr;
Tu = ci[WS(rs, 5)];
Tv = cr[WS(rs, 9)];
Tw = Tu - Tv;
T1m = Tu + Tv;
}
Tt = Tp - Ts;
TA = Tw - Tz;
T1w = T1m + T1n;
T1v = T1j + T1k;
{
E T1l, T1o, TK, TL;
T1l = T1j - T1k;
T1o = T1m - T1n;
T1p = T1l + T1o;
T1E = KP559016994 * (T1l - T1o);
TK = Tp + Ts;
TL = Tw + Tz;
TM = TK + TL;
TO = KP559016994 * (TK - TL);
}
}
}
/* Element 0 is stored without any W factor. */
cr[0] = T3 + Ti;
ci[0] = TJ + TM;
/* Element 5, scaled by (W[8], W[9]). */
{
E T1g, T1q, T17, T1h;
T1g = T18 + T1f;
T1q = T1i + T1p;
T17 = W[8];
T1h = W[9];
cr[WS(rs, 5)] = FNMS(T1h, T1q, T17 * T1g);
ci[WS(rs, 5)] = FMA(T1h, T1g, T17 * T1q);
}
/* Even elements 2, 6, 8, 4. */
{
E TB, TG, T11, TX, TP, T10, Tm, TW, TN, Tk;
TB = FNMS(KP951056516, TA, KP587785252 * Tt);
TG = FNMS(KP951056516, TF, KP587785252 * TE);
T11 = FMA(KP951056516, TE, KP587785252 * TF);
TX = FMA(KP951056516, Tt, KP587785252 * TA);
TN = FNMS(KP250000000, TM, TJ);
TP = TN - TO;
T10 = TO + TN;
Tk = FNMS(KP250000000, Ti, T3);
Tm = Tk - Tl;
TW = Tl + Tk;
{
E TC, TQ, Tj, TD;
TC = Tm - TB;
TQ = TG + TP;
Tj = W[2];
TD = W[3];
cr[WS(rs, 2)] = FNMS(TD, TQ, Tj * TC);
ci[WS(rs, 2)] = FMA(TD, TC, Tj * TQ);
}
{
E T14, T16, T13, T15;
T14 = TW - TX;
T16 = T11 + T10;
T13 = W[10];
T15 = W[11];
cr[WS(rs, 6)] = FNMS(T15, T16, T13 * T14);
ci[WS(rs, 6)] = FMA(T15, T14, T13 * T16);
}
{
E TS, TU, TR, TT;
TS = Tm + TB;
TU = TP - TG;
TR = W[14];
TT = W[15];
cr[WS(rs, 8)] = FNMS(TT, TU, TR * TS);
ci[WS(rs, 8)] = FMA(TT, TS, TR * TU);
}
{
E TY, T12, TV, TZ;
TY = TW + TX;
T12 = T10 - T11;
TV = W[6];
TZ = W[7];
cr[WS(rs, 4)] = FNMS(TZ, T12, TV * TY);
ci[WS(rs, 4)] = FMA(TZ, TY, TV * T12);
}
}
/* Odd elements 7, 9, 3, 1. */
{
E T1x, T1C, T1Q, T1N, T1F, T1R, T1u, T1M, T1D, T1s;
T1x = FNMS(KP951056516, T1w, KP587785252 * T1v);
T1C = FNMS(KP951056516, T1B, KP587785252 * T1A);
T1Q = FMA(KP951056516, T1A, KP587785252 * T1B);
T1N = FMA(KP951056516, T1v, KP587785252 * T1w);
T1D = FNMS(KP250000000, T1p, T1i);
T1F = T1D - T1E;
T1R = T1E + T1D;
T1s = FNMS(KP250000000, T1f, T18);
T1u = T1s - T1t;
T1M = T1t + T1s;
{
E T1y, T1G, T1r, T1z;
T1y = T1u - T1x;
T1G = T1C + T1F;
T1r = W[12];
T1z = W[13];
cr[WS(rs, 7)] = FNMS(T1z, T1G, T1r * T1y);
ci[WS(rs, 7)] = FMA(T1r, T1G, T1z * T1y);
}
{
E T1U, T1W, T1T, T1V;
T1U = T1M + T1N;
T1W = T1R - T1Q;
T1T = W[16];
T1V = W[17];
cr[WS(rs, 9)] = FNMS(T1V, T1W, T1T * T1U);
ci[WS(rs, 9)] = FMA(T1T, T1W, T1V * T1U);
}
{
E T1I, T1K, T1H, T1J;
T1I = T1u + T1x;
T1K = T1F - T1C;
T1H = W[4];
T1J = W[5];
cr[WS(rs, 3)] = FNMS(T1J, T1K, T1H * T1I);
ci[WS(rs, 3)] = FMA(T1H, T1K, T1J * T1I);
}
{
E T1O, T1S, T1L, T1P;
T1O = T1M - T1N;
T1S = T1Q + T1R;
T1L = W[0];
T1P = W[1];
cr[WS(rs, 1)] = FNMS(T1P, T1S, T1L * T1O);
ci[WS(rs, 1)] = FMA(T1L, T1S, T1P * T1O);
}
}
}
}
}
/*
 * Twiddle request list referenced by desc below: one TW_FULL entry followed
 * by a TW_NEXT entry.
 * NOTE(review): presumably TW_FULL with last field 10 requests the complete
 * factor set for size 10, i.e. the 18 W values hb_10 consumes per iteration
 * -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the hb_10 kernel: size 10, its name string, the twiddle
 * list above and &GENUS. The first three numbers of the trailing
 * initializer equal the add / multiply / fused multiply-add figures quoted
 * in the generator's header comment for this variant (72, 30, 30).
 */
static const hc2hc_desc desc = { 10, "hb_10", twinstr, &GENUS, { 72, 30, 30, 0 } };
/* Registers the hb_10 kernel and its descriptor with planner p. */
void X(codelet_hb_10) (planner *p) {
X(khc2hc_register) (p, hb_10, &desc);
}
#endif

View File

@@ -0,0 +1,597 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hb_12 -include rdft/scalar/hb.h */
/*
* This function contains 118 FP additions, 68 FP multiplications,
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_12: in-place size-12 kernel over the arrays cr and ci. This is the
 * variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * For each m in [mb, me) the body reads elements 0..11 of cr and of ci
 * (element k lives at offset WS(rs, k)), combines them using the constants
 * KP500000000 and KP866025403, multiplies output k (k = 1..11) by the
 * factor pair (W[2k - 2], W[2k - 1]), and stores the twenty-four results
 * back to the same locations. After each iteration cr advances by ms, ci
 * moves back by ms and W advances by 22; W is first offset by
 * (mb - 1) * 22.
 *
 * cr, ci : data arrays, read and overwritten
 * W      : factor table, twenty-two values consumed per iteration
 * rs     : stride handed to WS() to locate element k
 * mb, me : half-open iteration range
 * ms     : per-iteration pointer step (+ms for cr, -ms for ci)
 *
 * NOTE(review): reading each store pair as "data times (Wre + i*Wim)"
 * assumes FMA(a, b, c) == a * b + c and FNMS(a, b, c) == c - a * b --
 * confirm against the project headers.
 */
static void hb_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
E T18, T20, T1b, T21, T1s, T2a, T1p, T29, TI, TN, TO, Tb, To, T1f, T23;
E T1i, T24, T1z, T2d, T1w, T2c, Tt, Ty, Tz, Tm, TD;
/* First half of the loads: even-indexed cr elements and odd-indexed ci
 * elements. */
{
E T1, TE, TM, T6, T4, T1o, TH, T17, TL, T1a, T9, T1r;
T1 = cr[0];
TE = ci[WS(rs, 11)];
TM = cr[WS(rs, 6)];
T6 = ci[WS(rs, 5)];
{
E T2, T3, TF, TG;
T2 = cr[WS(rs, 4)];
T3 = ci[WS(rs, 3)];
T4 = T2 + T3;
T1o = T2 - T3;
TF = ci[WS(rs, 7)];
TG = cr[WS(rs, 8)];
TH = TF - TG;
T17 = TF + TG;
}
{
E TJ, TK, T7, T8;
TJ = ci[WS(rs, 9)];
TK = cr[WS(rs, 10)];
TL = TJ - TK;
T1a = TJ + TK;
T7 = ci[WS(rs, 1)];
T8 = cr[WS(rs, 2)];
T9 = T7 + T8;
T1r = T7 - T8;
}
{
E T16, T19, T1q, T1n, T5, Ta;
T16 = FNMS(KP500000000, T4, T1);
T18 = FNMS(KP866025403, T17, T16);
T20 = FMA(KP866025403, T17, T16);
T19 = FNMS(KP500000000, T9, T6);
T1b = FMA(KP866025403, T1a, T19);
T21 = FNMS(KP866025403, T1a, T19);
T1q = FMA(KP500000000, TL, TM);
T1s = FNMS(KP866025403, T1r, T1q);
T2a = FMA(KP866025403, T1r, T1q);
T1n = FNMS(KP500000000, TH, TE);
T1p = FMA(KP866025403, T1o, T1n);
T29 = FNMS(KP866025403, T1o, T1n);
TI = TE + TH;
TN = TL - TM;
TO = TI - TN;
T5 = T1 + T4;
Ta = T6 + T9;
Tb = T5 + Ta;
To = T5 - Ta;
}
}
/* Second half of the loads: odd-indexed cr elements and even-indexed ci
 * elements. */
{
E Tc, Tp, Tx, Th, Tf, T1v, Ts, T1e, Tw, T1h, Tk, T1y;
Tc = cr[WS(rs, 3)];
Tp = ci[WS(rs, 8)];
Tx = cr[WS(rs, 9)];
Th = ci[WS(rs, 2)];
{
E Td, Te, Tq, Tr;
Td = ci[WS(rs, 4)];
Te = ci[0];
Tf = Td + Te;
T1v = Td - Te;
Tq = cr[WS(rs, 7)];
Tr = cr[WS(rs, 11)];
Ts = Tq + Tr;
T1e = Tq - Tr;
}
{
E Tu, Tv, Ti, Tj;
Tu = ci[WS(rs, 10)];
Tv = ci[WS(rs, 6)];
Tw = Tu + Tv;
T1h = Tv - Tu;
Ti = cr[WS(rs, 1)];
Tj = cr[WS(rs, 5)];
Tk = Ti + Tj;
T1y = Ti - Tj;
}
{
E T1d, T1g, T1x, T1u, Tg, Tl;
T1d = FNMS(KP500000000, Tf, Tc);
T1f = FMA(KP866025403, T1e, T1d);
T23 = FNMS(KP866025403, T1e, T1d);
T1g = FNMS(KP500000000, Tk, Th);
T1i = FMA(KP866025403, T1h, T1g);
T24 = FNMS(KP866025403, T1h, T1g);
T1x = FMA(KP500000000, Tw, Tx);
T1z = FNMS(KP866025403, T1y, T1x);
T2d = FMA(KP866025403, T1y, T1x);
T1u = FMA(KP500000000, Ts, Tp);
T1w = FMA(KP866025403, T1v, T1u);
T2c = FNMS(KP866025403, T1v, T1u);
Tt = Tp - Ts;
Ty = Tw - Tx;
Tz = Tt - Ty;
Tg = Tc + Tf;
Tl = Th + Tk;
Tm = Tg + Tl;
TD = Tg - Tl;
}
}
/* cr[0] is stored without any W factor; the matching ci[0] store is in the
 * element-6 block below. */
cr[0] = Tb + Tm;
/* Element 9, scaled by (W[16], W[17]). */
{
E TA, TP, TB, TQ, Tn, TC;
TA = To - Tz;
TP = TD + TO;
Tn = W[16];
TB = Tn * TA;
TQ = Tn * TP;
TC = W[17];
cr[WS(rs, 9)] = FNMS(TC, TP, TB);
ci[WS(rs, 9)] = FMA(TC, TA, TQ);
}
/* Element 3, scaled by (W[4], W[5]). */
{
E TS, TV, TT, TW, TR, TU;
TS = To + Tz;
TV = TO - TD;
TR = W[4];
TT = TR * TS;
TW = TR * TV;
TU = W[5];
cr[WS(rs, 3)] = FNMS(TU, TV, TT);
ci[WS(rs, 3)] = FMA(TU, TS, TW);
}
/* ci[0] (no W factor) and element 6, scaled by (W[10], W[11]). */
{
E T11, T12, T13, TX, TZ, T10, T14, TY;
T11 = TI + TN;
T12 = Tt + Ty;
T13 = T11 - T12;
TY = Tb - Tm;
TX = W[10];
TZ = TX * TY;
T10 = W[11];
T14 = T10 * TY;
ci[0] = T11 + T12;
ci[WS(rs, 6)] = FMA(TX, T13, T14);
cr[WS(rs, 6)] = FNMS(T10, T13, TZ);
}
/* Elements 10 and 4. */
{
E T1k, T1E, T1B, T1H;
{
E T1c, T1j, T1t, T1A;
T1c = T18 + T1b;
T1j = T1f + T1i;
T1k = T1c - T1j;
T1E = T1c + T1j;
T1t = T1p - T1s;
T1A = T1w - T1z;
T1B = T1t - T1A;
T1H = T1t + T1A;
}
{
E T15, T1l, T1m, T1C;
T15 = W[18];
T1l = T15 * T1k;
T1m = W[19];
T1C = T1m * T1k;
cr[WS(rs, 10)] = FNMS(T1m, T1B, T1l);
ci[WS(rs, 10)] = FMA(T15, T1B, T1C);
}
{
E T1D, T1F, T1G, T1I;
T1D = W[6];
T1F = T1D * T1E;
T1G = W[7];
T1I = T1G * T1E;
cr[WS(rs, 4)] = FNMS(T1G, T1H, T1F);
ci[WS(rs, 4)] = FMA(T1D, T1H, T1I);
}
}
/* Elements 2 and 8. */
{
E T26, T2i, T2f, T2l;
{
E T22, T25, T2b, T2e;
T22 = T20 + T21;
T25 = T23 + T24;
T26 = T22 - T25;
T2i = T22 + T25;
T2b = T29 - T2a;
T2e = T2c - T2d;
T2f = T2b - T2e;
T2l = T2b + T2e;
}
{
E T1Z, T27, T28, T2g;
T1Z = W[2];
T27 = T1Z * T26;
T28 = W[3];
T2g = T28 * T26;
cr[WS(rs, 2)] = FNMS(T28, T2f, T27);
ci[WS(rs, 2)] = FMA(T1Z, T2f, T2g);
}
{
E T2h, T2j, T2k, T2m;
T2h = W[14];
T2j = T2h * T2i;
T2k = W[15];
T2m = T2k * T2i;
cr[WS(rs, 8)] = FNMS(T2k, T2l, T2j);
ci[WS(rs, 8)] = FMA(T2h, T2l, T2m);
}
}
/* Elements 5 and 11. */
{
E T2q, T2y, T2v, T2B;
{
E T2o, T2p, T2t, T2u;
T2o = T20 - T21;
T2p = T2c + T2d;
T2q = T2o - T2p;
T2y = T2o + T2p;
T2t = T29 + T2a;
T2u = T23 - T24;
T2v = T2t + T2u;
T2B = T2t - T2u;
}
{
E T2r, T2w, T2n, T2s;
T2n = W[8];
T2r = T2n * T2q;
T2w = T2n * T2v;
T2s = W[9];
cr[WS(rs, 5)] = FNMS(T2s, T2v, T2r);
ci[WS(rs, 5)] = FMA(T2s, T2q, T2w);
}
{
E T2z, T2C, T2x, T2A;
T2x = W[20];
T2z = T2x * T2y;
T2C = T2x * T2B;
T2A = W[21];
cr[WS(rs, 11)] = FNMS(T2A, T2B, T2z);
ci[WS(rs, 11)] = FMA(T2A, T2y, T2C);
}
}
/* Elements 1 and 7. */
{
E T1M, T1U, T1R, T1X;
{
E T1K, T1L, T1P, T1Q;
T1K = T18 - T1b;
T1L = T1w + T1z;
T1M = T1K - T1L;
T1U = T1K + T1L;
T1P = T1p + T1s;
T1Q = T1f - T1i;
T1R = T1P + T1Q;
T1X = T1P - T1Q;
}
{
E T1N, T1S, T1J, T1O;
T1J = W[0];
T1N = T1J * T1M;
T1S = T1J * T1R;
T1O = W[1];
cr[WS(rs, 1)] = FNMS(T1O, T1R, T1N);
ci[WS(rs, 1)] = FMA(T1O, T1M, T1S);
}
{
E T1V, T1Y, T1T, T1W;
T1T = W[12];
T1V = T1T * T1U;
T1Y = T1T * T1X;
T1W = W[13];
cr[WS(rs, 7)] = FNMS(T1W, T1X, T1V);
ci[WS(rs, 7)] = FMA(T1W, T1U, T1Y);
}
}
}
}
}
/*
 * Twiddle request list referenced by desc below: one TW_FULL entry followed
 * by a TW_NEXT entry.
 * NOTE(review): presumably TW_FULL with last field 12 requests the complete
 * factor set for size 12, i.e. the 22 W values hb_12 consumes per iteration
 * -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the hb_12 kernel: size 12, its name string, the twiddle
 * list above and &GENUS. The first three numbers of the trailing
 * initializer equal the add / multiply / fused multiply-add figures quoted
 * in the generator's header comment for this variant (72, 22, 46).
 */
static const hc2hc_desc desc = { 12, "hb_12", twinstr, &GENUS, { 72, 22, 46, 0 } };
/* Registers the hb_12 kernel and its descriptor with planner p. */
void X(codelet_hb_12) (planner *p) {
X(khc2hc_register) (p, hb_12, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hb_12 -include rdft/scalar/hb.h */
/*
* This function contains 118 FP additions, 60 FP multiplications,
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
* 39 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_12: in-place size-12 kernel over the arrays cr and ci. This is the
 * variant in the #else branch, generated without -fma.
 *
 * For each m in [mb, me) the body reads elements 0..11 of cr and of ci
 * (element k lives at offset WS(rs, k)), combines them using the constants
 * KP500000000 and KP866025403, multiplies output k (k = 1..11) by the
 * factor pair (W[2k - 2], W[2k - 1]), and stores the twenty-four results
 * back to the same locations. After each iteration cr advances by ms, ci
 * moves back by ms and W advances by 22; W is first offset by
 * (mb - 1) * 22.
 *
 * cr, ci : data arrays, read and overwritten
 * W      : factor table, twenty-two values consumed per iteration
 * rs     : stride handed to WS() to locate element k
 * mb, me : half-open iteration range
 * ms     : per-iteration pointer step (+ms for cr, -ms for ci)
 *
 * NOTE(review): reading each store pair as "data times (Wre + i*Wim)"
 * assumes FMA(a, b, c) == a * b + c and FNMS(a, b, c) == c - a * b --
 * confirm against the project headers.
 */
static void hb_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
E T5, TH, T12, T1M, T1i, T1U, Tg, Tt, T19, T1X, T1p, T1P, Ta, TM, T15;
E T1N, T1l, T1V, Tl, Ty, T1c, T1Y, T1s, T1Q;
/* Four groups of six loads each; every group combines three cr/ci values
 * into a sum and a KP500000000 / KP866025403 weighted pair. */
/* Group 1: cr[0], cr[4], cr[8], ci[3], ci[7], ci[11]. */
{
E T1, TD, T4, T1g, TG, T11, T10, T1h;
T1 = cr[0];
TD = ci[WS(rs, 11)];
{
E T2, T3, TE, TF;
T2 = cr[WS(rs, 4)];
T3 = ci[WS(rs, 3)];
T4 = T2 + T3;
T1g = KP866025403 * (T2 - T3);
TE = ci[WS(rs, 7)];
TF = cr[WS(rs, 8)];
TG = TE - TF;
T11 = KP866025403 * (TE + TF);
}
T5 = T1 + T4;
TH = TD + TG;
T10 = FNMS(KP500000000, T4, T1);
T12 = T10 - T11;
T1M = T10 + T11;
T1h = FNMS(KP500000000, TG, TD);
T1i = T1g + T1h;
T1U = T1h - T1g;
}
/* Group 2: cr[3], cr[7], cr[11], ci[0], ci[4], ci[8]. */
{
E Tc, Tp, Tf, T17, Ts, T1o, T18, T1n;
Tc = cr[WS(rs, 3)];
Tp = ci[WS(rs, 8)];
{
E Td, Te, Tq, Tr;
Td = ci[WS(rs, 4)];
Te = ci[0];
Tf = Td + Te;
T17 = KP866025403 * (Td - Te);
Tq = cr[WS(rs, 7)];
Tr = cr[WS(rs, 11)];
Ts = Tq + Tr;
T1o = KP866025403 * (Tq - Tr);
}
Tg = Tc + Tf;
Tt = Tp - Ts;
T18 = FMA(KP500000000, Ts, Tp);
T19 = T17 + T18;
T1X = T18 - T17;
T1n = FNMS(KP500000000, Tf, Tc);
T1p = T1n + T1o;
T1P = T1n - T1o;
}
/* Group 3: cr[2], cr[6], cr[10], ci[1], ci[5], ci[9]. */
{
E T6, TL, T9, T1j, TK, T14, T13, T1k;
T6 = ci[WS(rs, 5)];
TL = cr[WS(rs, 6)];
{
E T7, T8, TI, TJ;
T7 = ci[WS(rs, 1)];
T8 = cr[WS(rs, 2)];
T9 = T7 + T8;
T1j = KP866025403 * (T7 - T8);
TI = ci[WS(rs, 9)];
TJ = cr[WS(rs, 10)];
TK = TI - TJ;
T14 = KP866025403 * (TI + TJ);
}
Ta = T6 + T9;
TM = TK - TL;
T13 = FNMS(KP500000000, T9, T6);
T15 = T13 + T14;
T1N = T13 - T14;
T1k = FMA(KP500000000, TK, TL);
T1l = T1j - T1k;
T1V = T1j + T1k;
}
/* Group 4: cr[1], cr[5], cr[9], ci[2], ci[6], ci[10]. */
{
E Th, Tx, Tk, T1a, Tw, T1r, T1b, T1q;
Th = ci[WS(rs, 2)];
Tx = cr[WS(rs, 9)];
{
E Ti, Tj, Tu, Tv;
Ti = cr[WS(rs, 1)];
Tj = cr[WS(rs, 5)];
Tk = Ti + Tj;
T1a = KP866025403 * (Ti - Tj);
Tu = ci[WS(rs, 10)];
Tv = ci[WS(rs, 6)];
Tw = Tu + Tv;
T1r = KP866025403 * (Tv - Tu);
}
Tl = Th + Tk;
Ty = Tw - Tx;
T1b = FMA(KP500000000, Tw, Tx);
T1c = T1a - T1b;
T1Y = T1a + T1b;
T1q = FNMS(KP500000000, Tk, Th);
T1s = T1q + T1r;
T1Q = T1q - T1r;
}
/* Element 0 (no W factor) and element 6, scaled by (W[10], W[11]). */
{
E Tb, Tm, TU, TW, TX, TY, TT, TV;
Tb = T5 + Ta;
Tm = Tg + Tl;
TU = Tb - Tm;
TW = TH + TM;
TX = Tt + Ty;
TY = TW - TX;
cr[0] = Tb + Tm;
ci[0] = TW + TX;
TT = W[10];
TV = W[11];
cr[WS(rs, 6)] = FNMS(TV, TY, TT * TU);
ci[WS(rs, 6)] = FMA(TV, TU, TT * TY);
}
/* Elements 9 and 3. */
{
E TA, TQ, TO, TS;
{
E To, Tz, TC, TN;
To = T5 - Ta;
Tz = Tt - Ty;
TA = To - Tz;
TQ = To + Tz;
TC = Tg - Tl;
TN = TH - TM;
TO = TC + TN;
TS = TN - TC;
}
{
E Tn, TB, TP, TR;
Tn = W[16];
TB = W[17];
cr[WS(rs, 9)] = FNMS(TB, TO, Tn * TA);
ci[WS(rs, 9)] = FMA(Tn, TO, TB * TA);
TP = W[4];
TR = W[5];
cr[WS(rs, 3)] = FNMS(TR, TS, TP * TQ);
ci[WS(rs, 3)] = FMA(TP, TS, TR * TQ);
}
}
/* Elements 5 and 11. */
{
E T28, T2e, T2c, T2g;
{
E T26, T27, T2a, T2b;
T26 = T1M - T1N;
T27 = T1X + T1Y;
T28 = T26 - T27;
T2e = T26 + T27;
T2a = T1U + T1V;
T2b = T1P - T1Q;
T2c = T2a + T2b;
T2g = T2a - T2b;
}
{
E T25, T29, T2d, T2f;
T25 = W[8];
T29 = W[9];
cr[WS(rs, 5)] = FNMS(T29, T2c, T25 * T28);
ci[WS(rs, 5)] = FMA(T25, T2c, T29 * T28);
T2d = W[20];
T2f = W[21];
cr[WS(rs, 11)] = FNMS(T2f, T2g, T2d * T2e);
ci[WS(rs, 11)] = FMA(T2d, T2g, T2f * T2e);
}
}
/* Elements 2 and 8. */
{
E T1S, T22, T20, T24;
{
E T1O, T1R, T1W, T1Z;
T1O = T1M + T1N;
T1R = T1P + T1Q;
T1S = T1O - T1R;
T22 = T1O + T1R;
T1W = T1U - T1V;
T1Z = T1X - T1Y;
T20 = T1W - T1Z;
T24 = T1W + T1Z;
}
{
E T1L, T1T, T21, T23;
T1L = W[2];
T1T = W[3];
cr[WS(rs, 2)] = FNMS(T1T, T20, T1L * T1S);
ci[WS(rs, 2)] = FMA(T1T, T1S, T1L * T20);
T21 = W[14];
T23 = W[15];
cr[WS(rs, 8)] = FNMS(T23, T24, T21 * T22);
ci[WS(rs, 8)] = FMA(T23, T22, T21 * T24);
}
}
/* Elements 10 and 4. */
{
E T1C, T1I, T1G, T1K;
{
E T1A, T1B, T1E, T1F;
T1A = T12 + T15;
T1B = T1p + T1s;
T1C = T1A - T1B;
T1I = T1A + T1B;
T1E = T1i + T1l;
T1F = T19 + T1c;
T1G = T1E - T1F;
T1K = T1E + T1F;
}
{
E T1z, T1D, T1H, T1J;
T1z = W[18];
T1D = W[19];
cr[WS(rs, 10)] = FNMS(T1D, T1G, T1z * T1C);
ci[WS(rs, 10)] = FMA(T1D, T1C, T1z * T1G);
T1H = W[6];
T1J = W[7];
cr[WS(rs, 4)] = FNMS(T1J, T1K, T1H * T1I);
ci[WS(rs, 4)] = FMA(T1J, T1I, T1H * T1K);
}
}
/* Elements 1 and 7. */
{
E T1e, T1w, T1u, T1y;
{
E T16, T1d, T1m, T1t;
T16 = T12 - T15;
T1d = T19 - T1c;
T1e = T16 - T1d;
T1w = T16 + T1d;
T1m = T1i - T1l;
T1t = T1p - T1s;
T1u = T1m + T1t;
T1y = T1m - T1t;
}
{
E TZ, T1f, T1v, T1x;
TZ = W[0];
T1f = W[1];
cr[WS(rs, 1)] = FNMS(T1f, T1u, TZ * T1e);
ci[WS(rs, 1)] = FMA(TZ, T1u, T1f * T1e);
T1v = W[12];
T1x = W[13];
cr[WS(rs, 7)] = FNMS(T1x, T1y, T1v * T1w);
ci[WS(rs, 7)] = FMA(T1v, T1y, T1x * T1w);
}
}
}
}
}
/*
 * Twiddle request list referenced by desc below: one TW_FULL entry followed
 * by a TW_NEXT entry.
 * NOTE(review): presumably TW_FULL with last field 12 requests the complete
 * factor set for size 12, i.e. the 22 W values hb_12 consumes per iteration
 * -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the hb_12 kernel: size 12, its name string, the twiddle
 * list above and &GENUS. The first three numbers of the trailing
 * initializer equal the add / multiply / fused multiply-add figures quoted
 * in the generator's header comment for this variant (88, 30, 30).
 */
static const hc2hc_desc desc = { 12, "hb_12", twinstr, &GENUS, { 88, 30, 30, 0 } };
/* Registers the hb_12 kernel and its descriptor with planner p. */
void X(codelet_hb_12) (planner *p) {
X(khc2hc_register) (p, hb_12, &desc);
}
#endif

View File

@@ -0,0 +1,810 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:51 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include rdft/scalar/hb.h */
/*
* This function contains 184 FP additions, 140 FP multiplications,
* (or, 72 additions, 28 multiplications, 112 fused multiply/add),
* 78 stack variables, 6 constants, and 60 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_15 (FMA variant): size-15 twiddle codelet emitted by genfft's gen_hc2hc.
 *
 * cr, ci : data arrays; elements WS(rs, 0..14) of each are read and then
 *          overwritten in place.
 * W      : twiddle table; 28 values are consumed per loop iteration.
 * rs     : stride handed to WS() to address the 15 elements.
 * mb, me : half-open range [mb, me) of the loop counter m.
 * ms     : per-iteration step; cr advances by ms while ci retreats by ms.
 *
 * Output 0 is stored without a twiddle.  For k = 1..14 the pair
 * (cr[WS(rs, k)], ci[WS(rs, k)]) is combined with (W[2k-2], W[2k-1]).
 *
 * NOTE(review): FMA/FNMS/DK/WS/MAKE_VOLATILE_STRIDE come from project
 * headers; presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm.
 */
static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* sin(2*pi/5) */
DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* sqrt(5)/4 */
DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
DK(KP618033988, +0.618033988749894848204586834365638117720309180); /* (sqrt(5)-1)/2 */
DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* sqrt(3)/2 */
DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
{
INT m;
/* W is first moved to the 28-entry twiddle group belonging to m == mb. */
for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
E T5, T11, T1C, T2U, T2f, T3f, TH, T19, T18, TS, T12, T13, T14, T3a, T3g;
E Ts, Tv, T37, T3h, T28, T2h, T21, T2g, T2V, T2W, T2X, T2Y, T2Z, T30, T31;
E T1F, T1I, T1J, T1M, T1P, T1Q, T1R;
/* Inputs cr[0], cr[5], cr[10], ci[4], ci[9], ci[14]: three-term combinations. */
{
E T1, TX, T4, T2e, T10, T1B, T1A, T2d;
T1 = cr[0];
TX = ci[WS(rs, 14)];
{
E T2, T3, TY, TZ;
T2 = cr[WS(rs, 5)];
T3 = ci[WS(rs, 4)];
T4 = T2 + T3;
T2e = T2 - T3;
TY = ci[WS(rs, 9)];
TZ = cr[WS(rs, 10)];
T10 = TY - TZ;
T1B = TY + TZ;
}
T5 = T1 + T4;
T11 = TX + T10;
T1A = FNMS(KP500000000, T4, T1);
T1C = FNMS(KP866025403, T1B, T1A);
T2U = FMA(KP866025403, T1B, T1A);
T2d = FNMS(KP500000000, T10, TX);
T2f = FMA(KP866025403, T2e, T2d);
T3f = FNMS(KP866025403, T2e, T2d);
}
/* Remaining 24 inputs, loaded three at a time and combined the same way. */
{
E Ta, T1W, T1D, Tl, T23, T1K, Tf, T1Z, T1G, TR, T1Y, T1H, Tq, T26, T1N;
E TG, T25, T1O, TM, T1V, T1E, TB, T22, T1L, T38, T39;
{
E T6, T7, T8, T9;
T6 = cr[WS(rs, 3)];
T7 = ci[WS(rs, 6)];
T8 = ci[WS(rs, 1)];
T9 = T7 + T8;
Ta = T6 + T9;
T1W = T7 - T8;
T1D = FNMS(KP500000000, T9, T6);
}
{
E Th, Ti, Tj, Tk;
Th = cr[WS(rs, 6)];
Ti = ci[WS(rs, 3)];
Tj = cr[WS(rs, 1)];
Tk = Ti + Tj;
Tl = Th + Tk;
T23 = Ti - Tj;
T1K = FNMS(KP500000000, Tk, Th);
}
{
E Tb, Tc, Td, Te;
Tb = ci[WS(rs, 2)];
Tc = cr[WS(rs, 2)];
Td = cr[WS(rs, 7)];
Te = Tc + Td;
Tf = Tb + Te;
T1Z = Tc - Td;
T1G = FNMS(KP500000000, Te, Tb);
}
{
E TQ, TN, TO, TP;
TQ = cr[WS(rs, 12)];
TN = ci[WS(rs, 12)];
TO = ci[WS(rs, 7)];
TP = TN + TO;
TR = TP - TQ;
T1Y = FMA(KP500000000, TP, TQ);
T1H = TO - TN;
}
{
E Tm, Tn, To, Tp;
Tm = ci[WS(rs, 5)];
Tn = ci[0];
To = cr[WS(rs, 4)];
Tp = Tn + To;
Tq = Tm + Tp;
T26 = Tn - To;
T1N = FNMS(KP500000000, Tp, Tm);
}
{
E TF, TC, TD, TE;
TF = cr[WS(rs, 9)];
TC = ci[WS(rs, 10)];
TD = cr[WS(rs, 14)];
TE = TC - TD;
TG = TE - TF;
T25 = FMA(KP500000000, TE, TF);
T1O = TC + TD;
}
{
E TI, TJ, TK, TL;
TI = ci[WS(rs, 11)];
TJ = cr[WS(rs, 8)];
TK = cr[WS(rs, 13)];
TL = TJ + TK;
TM = TI - TL;
T1V = FMA(KP500000000, TL, TI);
T1E = TJ - TK;
}
{
E Tx, Ty, Tz, TA;
Tx = ci[WS(rs, 8)];
Ty = ci[WS(rs, 13)];
Tz = cr[WS(rs, 11)];
TA = Ty - Tz;
TB = Tx + TA;
T22 = FNMS(KP500000000, TA, Tx);
T1L = Ty + Tz;
}
TH = TB - TG;
T19 = Ta - Tf;
T18 = Tl - Tq;
TS = TM - TR;
T12 = TM + TR;
T13 = TB + TG;
T14 = T12 + T13;
T38 = FNMS(KP866025403, T1W, T1V);
T39 = FMA(KP866025403, T1Z, T1Y);
T3a = T38 + T39;
T3g = T38 - T39;
{
E Tg, Tr, T1X, T20;
Tg = Ta + Tf;
Tr = Tl + Tq;
Ts = Tg + Tr;
Tv = Tg - Tr;
{
E T35, T36, T24, T27;
T35 = FNMS(KP866025403, T23, T22);
T36 = FMA(KP866025403, T26, T25);
T37 = T35 + T36;
T3h = T35 - T36;
T24 = FMA(KP866025403, T23, T22);
T27 = FNMS(KP866025403, T26, T25);
T28 = T24 + T27;
T2h = T24 - T27;
}
T1X = FMA(KP866025403, T1W, T1V);
T20 = FNMS(KP866025403, T1Z, T1Y);
T21 = T1X + T20;
T2g = T1X - T20;
T2V = FNMS(KP866025403, T1E, T1D);
T2W = FNMS(KP866025403, T1H, T1G);
T2X = T2V + T2W;
T2Y = FNMS(KP866025403, T1L, T1K);
T2Z = FNMS(KP866025403, T1O, T1N);
T30 = T2Y + T2Z;
T31 = T2X + T30;
T1F = FMA(KP866025403, T1E, T1D);
T1I = FMA(KP866025403, T1H, T1G);
T1J = T1F + T1I;
T1M = FMA(KP866025403, T1L, T1K);
T1P = FMA(KP866025403, T1O, T1N);
T1Q = T1M + T1P;
T1R = T1J + T1Q;
}
}
/* Output 0 is written without any twiddle factor. */
cr[0] = T5 + Ts;
ci[0] = T11 + T14;
/* Outputs 3, 9, 12 and 6. */
{
E T1a, T1q, T17, T1p, TU, T1u, T1e, T1m, T15, T16;
T1a = FNMS(KP618033988, T19, T18);
T1q = FMA(KP618033988, T18, T19);
T15 = FNMS(KP250000000, T14, T11);
T16 = T12 - T13;
T17 = FNMS(KP559016994, T16, T15);
T1p = FMA(KP559016994, T16, T15);
{
E TT, T1l, Tw, T1k, Tu;
TT = FNMS(KP618033988, TS, TH);
T1l = FMA(KP618033988, TH, TS);
Tu = FNMS(KP250000000, Ts, T5);
Tw = FNMS(KP559016994, Tv, Tu);
T1k = FMA(KP559016994, Tv, Tu);
TU = FNMS(KP951056516, TT, Tw);
T1u = FMA(KP951056516, T1l, T1k);
T1e = FMA(KP951056516, TT, Tw);
T1m = FNMS(KP951056516, T1l, T1k);
}
{
E T1b, TW, T1c, Tt, TV;
T1b = FMA(KP951056516, T1a, T17);
TW = W[5];
T1c = TW * TU;
Tt = W[4];
TV = Tt * TU;
cr[WS(rs, 3)] = FNMS(TW, T1b, TV);
ci[WS(rs, 3)] = FMA(Tt, T1b, T1c);
}
{
E T1x, T1w, T1y, T1t, T1v;
T1x = FNMS(KP951056516, T1q, T1p);
T1w = W[17];
T1y = T1w * T1u;
T1t = W[16];
T1v = T1t * T1u;
cr[WS(rs, 9)] = FNMS(T1w, T1x, T1v);
ci[WS(rs, 9)] = FMA(T1t, T1x, T1y);
}
{
E T1h, T1g, T1i, T1d, T1f;
T1h = FNMS(KP951056516, T1a, T17);
T1g = W[23];
T1i = T1g * T1e;
T1d = W[22];
T1f = T1d * T1e;
cr[WS(rs, 12)] = FNMS(T1g, T1h, T1f);
ci[WS(rs, 12)] = FMA(T1d, T1h, T1i);
}
{
E T1r, T1o, T1s, T1j, T1n;
T1r = FMA(KP951056516, T1q, T1p);
T1o = W[11];
T1s = T1o * T1m;
T1j = W[10];
T1n = T1j * T1m;
cr[WS(rs, 6)] = FNMS(T1o, T1r, T1n);
ci[WS(rs, 6)] = FMA(T1j, T1r, T1s);
}
}
/* Outputs 10, 1, 13, 7 and 4. */
{
E T2o, T2E, T2N, T2P, T2Q, T2S, T2l, T2R, T2D, T2a, T2I, T2s, T2A;
{
E T2m, T2n, T2O, T2k, T2i, T2j;
T2m = T1F - T1I;
T2n = T1M - T1P;
T2o = FMA(KP618033988, T2n, T2m);
T2E = FNMS(KP618033988, T2m, T2n);
T2O = T1C + T1R;
T2N = W[18];
T2P = T2N * T2O;
T2Q = W[19];
T2S = T2Q * T2O;
T2k = T2g - T2h;
T2i = T2g + T2h;
T2j = FNMS(KP250000000, T2i, T2f);
T2l = FMA(KP559016994, T2k, T2j);
T2R = T2f + T2i;
T2D = FNMS(KP559016994, T2k, T2j);
{
E T29, T2z, T1U, T2y, T1S, T1T;
T29 = FMA(KP618033988, T28, T21);
T2z = FNMS(KP618033988, T21, T28);
T1S = FNMS(KP250000000, T1R, T1C);
T1T = T1J - T1Q;
T1U = FMA(KP559016994, T1T, T1S);
T2y = FNMS(KP559016994, T1T, T1S);
T2a = FNMS(KP951056516, T29, T1U);
T2I = FNMS(KP951056516, T2z, T2y);
T2s = FMA(KP951056516, T29, T1U);
T2A = FMA(KP951056516, T2z, T2y);
}
}
cr[WS(rs, 10)] = FNMS(T2Q, T2R, T2P);
ci[WS(rs, 10)] = FMA(T2N, T2R, T2S);
{
E T2p, T2c, T2q, T1z, T2b;
T2p = FMA(KP951056516, T2o, T2l);
T2c = W[1];
T2q = T2c * T2a;
T1z = W[0];
T2b = T1z * T2a;
cr[WS(rs, 1)] = FNMS(T2c, T2p, T2b);
ci[WS(rs, 1)] = FMA(T1z, T2p, T2q);
}
{
E T2L, T2K, T2M, T2H, T2J;
T2L = FMA(KP951056516, T2E, T2D);
T2K = W[25];
T2M = T2K * T2I;
T2H = W[24];
T2J = T2H * T2I;
cr[WS(rs, 13)] = FNMS(T2K, T2L, T2J);
ci[WS(rs, 13)] = FMA(T2H, T2L, T2M);
}
{
E T2F, T2C, T2G, T2x, T2B;
T2F = FNMS(KP951056516, T2E, T2D);
T2C = W[13];
T2G = T2C * T2A;
T2x = W[12];
T2B = T2x * T2A;
cr[WS(rs, 7)] = FNMS(T2C, T2F, T2B);
ci[WS(rs, 7)] = FMA(T2x, T2F, T2G);
}
{
E T2v, T2u, T2w, T2r, T2t;
T2v = FNMS(KP951056516, T2o, T2l);
T2u = W[7];
T2w = T2u * T2s;
T2r = W[6];
T2t = T2r * T2s;
cr[WS(rs, 4)] = FNMS(T2u, T2v, T2t);
ci[WS(rs, 4)] = FMA(T2r, T2v, T2w);
}
}
/* Outputs 5, 2, 14, 11 and 8. */
{
E T3o, T3E, T3N, T3P, T3Q, T3S, T3l, T3R, T3D, T3c, T3I, T3s, T3A;
{
E T3m, T3n, T3O, T3k, T3i, T3j;
T3m = T2Y - T2Z;
T3n = T2V - T2W;
T3o = FNMS(KP618033988, T3n, T3m);
T3E = FMA(KP618033988, T3m, T3n);
T3O = T2U + T31;
T3N = W[8];
T3P = T3N * T3O;
T3Q = W[9];
T3S = T3Q * T3O;
T3k = T3g - T3h;
T3i = T3g + T3h;
T3j = FNMS(KP250000000, T3i, T3f);
T3l = FNMS(KP559016994, T3k, T3j);
T3R = T3f + T3i;
T3D = FMA(KP559016994, T3k, T3j);
{
E T3b, T3z, T34, T3y, T32, T33;
T3b = FNMS(KP618033988, T3a, T37);
T3z = FMA(KP618033988, T37, T3a);
T32 = FNMS(KP250000000, T31, T2U);
T33 = T2X - T30;
T34 = FNMS(KP559016994, T33, T32);
T3y = FMA(KP559016994, T33, T32);
T3c = FMA(KP951056516, T3b, T34);
T3I = FMA(KP951056516, T3z, T3y);
T3s = FNMS(KP951056516, T3b, T34);
T3A = FNMS(KP951056516, T3z, T3y);
}
}
cr[WS(rs, 5)] = FNMS(T3Q, T3R, T3P);
ci[WS(rs, 5)] = FMA(T3N, T3R, T3S);
{
E T3p, T3e, T3q, T2T, T3d;
T3p = FNMS(KP951056516, T3o, T3l);
T3e = W[3];
T3q = T3e * T3c;
T2T = W[2];
T3d = T2T * T3c;
cr[WS(rs, 2)] = FNMS(T3e, T3p, T3d);
ci[WS(rs, 2)] = FMA(T2T, T3p, T3q);
}
{
E T3L, T3K, T3M, T3H, T3J;
T3L = FNMS(KP951056516, T3E, T3D);
T3K = W[27];
T3M = T3K * T3I;
T3H = W[26];
T3J = T3H * T3I;
cr[WS(rs, 14)] = FNMS(T3K, T3L, T3J);
ci[WS(rs, 14)] = FMA(T3H, T3L, T3M);
}
{
E T3F, T3C, T3G, T3x, T3B;
T3F = FMA(KP951056516, T3E, T3D);
T3C = W[21];
T3G = T3C * T3A;
T3x = W[20];
T3B = T3x * T3A;
cr[WS(rs, 11)] = FNMS(T3C, T3F, T3B);
ci[WS(rs, 11)] = FMA(T3x, T3F, T3G);
}
{
E T3v, T3u, T3w, T3r, T3t;
T3v = FMA(KP951056516, T3o, T3l);
T3u = W[15];
T3w = T3u * T3s;
T3r = W[14];
T3t = T3r * T3s;
cr[WS(rs, 8)] = FNMS(T3u, T3v, T3t);
ci[WS(rs, 8)] = FMA(T3r, T3v, T3w);
}
}
}
}
}
/* Twiddle instruction table; referenced only through desc below. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 15 }, { TW_NEXT, 1, 0 } };

/*
 * Descriptor passed to the planner together with hb_15.  The trailing brace
 * group repeats the figures quoted in this variant's generator comment
 * (72 additions, 28 multiplications, 112 fused multiply/add).
 */
static const hc2hc_desc desc = {
     15,
     "hb_15",
     twinstr,
     &GENUS,
     { 72, 28, 112, 0 }
};

/* Hands hb_15 and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb_15)(planner *p)
{
     X(khc2hc_register)(p, hb_15, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include rdft/scalar/hb.h */
/*
* This function contains 184 FP additions, 112 FP multiplications,
* (or, 128 additions, 56 multiplications, 56 fused multiply/add),
* 75 stack variables, 6 constants, and 60 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_15 (non-FMA variant): size-15 twiddle codelet emitted by genfft's
 * gen_hc2hc.
 *
 * cr, ci : data arrays; elements WS(rs, 0..14) of each are read and then
 *          overwritten in place.
 * W      : twiddle table; 28 values are consumed per loop iteration.
 * rs     : stride handed to WS() to address the 15 elements.
 * mb, me : half-open range [mb, me) of the loop counter m.
 * ms     : per-iteration step; cr advances by ms while ci retreats by ms.
 *
 * Output 0 is stored without a twiddle.  For k = 1..14 the pair
 * (cr[WS(rs, k)], ci[WS(rs, k)]) is combined with (W[2k-2], W[2k-1]).
 *
 * NOTE(review): FMA/FNMS/DK/WS/MAKE_VOLATILE_STRIDE come from project
 * headers; presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm.
 */
static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* sqrt(5)/4 */
DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* sin(2*pi/5) */
DK(KP587785252, +0.587785252292473129168705954639072768597652438); /* sin(pi/5) */
DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* sqrt(3)/2 */
{
INT m;
/* W is first moved to the 28-entry twiddle group belonging to m == mb. */
for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
E T5, T10, T1J, T2C, T2c, T2M, TH, T18, T17, TS, T2Q, T2R, T2S, Tg, Tr;
E Ts, T11, T12, T13, T2N, T2O, T2P, T1u, T1x, T1y, T1W, T1Z, T28, T1P, T1S;
E T27, T1B, T1E, T1F, T2G, T2H, T2I, T2D, T2E, T2F;
/* Inputs cr[0], cr[5], cr[10], ci[4], ci[9], ci[14]: three-term combinations. */
{
E T1, TW, T4, T2a, TZ, T1I, T1H, T2b;
T1 = cr[0];
TW = ci[WS(rs, 14)];
{
E T2, T3, TX, TY;
T2 = cr[WS(rs, 5)];
T3 = ci[WS(rs, 4)];
T4 = T2 + T3;
T2a = KP866025403 * (T2 - T3);
TX = ci[WS(rs, 9)];
TY = cr[WS(rs, 10)];
TZ = TX - TY;
T1I = KP866025403 * (TX + TY);
}
T5 = T1 + T4;
T10 = TW + TZ;
T1H = FNMS(KP500000000, T4, T1);
T1J = T1H - T1I;
T2C = T1H + T1I;
T2b = FNMS(KP500000000, TZ, TW);
T2c = T2a + T2b;
T2M = T2b - T2a;
}
/* Remaining 24 inputs, loaded three at a time and combined the same way. */
{
E Ta, T1N, T1s, Tl, T1U, T1z, Tf, T1Q, T1v, TG, T1R, T1w, Tq, T1X, T1C;
E TM, T1V, T1A, TB, T1O, T1t, TR, T1Y, T1D;
{
E T6, T7, T8, T9;
T6 = cr[WS(rs, 3)];
T7 = ci[WS(rs, 6)];
T8 = ci[WS(rs, 1)];
T9 = T7 + T8;
Ta = T6 + T9;
T1N = KP866025403 * (T7 - T8);
T1s = FNMS(KP500000000, T9, T6);
}
{
E Th, Ti, Tj, Tk;
Th = cr[WS(rs, 6)];
Ti = ci[WS(rs, 3)];
Tj = cr[WS(rs, 1)];
Tk = Ti + Tj;
Tl = Th + Tk;
T1U = KP866025403 * (Ti - Tj);
T1z = FNMS(KP500000000, Tk, Th);
}
{
E Tb, Tc, Td, Te;
Tb = ci[WS(rs, 2)];
Tc = cr[WS(rs, 2)];
Td = cr[WS(rs, 7)];
Te = Tc + Td;
Tf = Tb + Te;
T1Q = KP866025403 * (Tc - Td);
T1v = FNMS(KP500000000, Te, Tb);
}
{
E TF, TC, TD, TE;
TF = cr[WS(rs, 12)];
TC = ci[WS(rs, 12)];
TD = ci[WS(rs, 7)];
TE = TC + TD;
TG = TE - TF;
T1R = FMA(KP500000000, TE, TF);
T1w = KP866025403 * (TD - TC);
}
{
E Tm, Tn, To, Tp;
Tm = ci[WS(rs, 5)];
Tn = ci[0];
To = cr[WS(rs, 4)];
Tp = Tn + To;
Tq = Tm + Tp;
T1X = KP866025403 * (Tn - To);
T1C = FNMS(KP500000000, Tp, Tm);
}
{
E TI, TJ, TK, TL;
TI = ci[WS(rs, 8)];
TJ = ci[WS(rs, 13)];
TK = cr[WS(rs, 11)];
TL = TJ - TK;
TM = TI + TL;
T1V = FNMS(KP500000000, TL, TI);
T1A = KP866025403 * (TJ + TK);
}
{
E Tx, Ty, Tz, TA;
Tx = ci[WS(rs, 11)];
Ty = cr[WS(rs, 8)];
Tz = cr[WS(rs, 13)];
TA = Ty + Tz;
TB = Tx - TA;
T1O = FMA(KP500000000, TA, Tx);
T1t = KP866025403 * (Ty - Tz);
}
{
E TQ, TN, TO, TP;
TQ = cr[WS(rs, 9)];
TN = ci[WS(rs, 10)];
TO = cr[WS(rs, 14)];
TP = TN - TO;
TR = TP - TQ;
T1Y = FMA(KP500000000, TP, TQ);
T1D = KP866025403 * (TN + TO);
}
TH = TB - TG;
T18 = Tl - Tq;
T17 = Ta - Tf;
TS = TM - TR;
T2Q = T1V - T1U;
T2R = T1X + T1Y;
T2S = T2Q - T2R;
Tg = Ta + Tf;
Tr = Tl + Tq;
Ts = Tg + Tr;
T11 = TB + TG;
T12 = TM + TR;
T13 = T11 + T12;
T2N = T1O - T1N;
T2O = T1Q + T1R;
T2P = T2N - T2O;
T1u = T1s + T1t;
T1x = T1v + T1w;
T1y = T1u + T1x;
T1W = T1U + T1V;
T1Z = T1X - T1Y;
T28 = T1W + T1Z;
T1P = T1N + T1O;
T1S = T1Q - T1R;
T27 = T1P + T1S;
T1B = T1z + T1A;
T1E = T1C + T1D;
T1F = T1B + T1E;
T2G = T1z - T1A;
T2H = T1C - T1D;
T2I = T2G + T2H;
T2D = T1s - T1t;
T2E = T1v - T1w;
T2F = T2D + T2E;
}
/* Output 0 is written without any twiddle factor. */
cr[0] = T5 + Ts;
ci[0] = T10 + T13;
/* Outputs 3, 9, 12 and 6. */
{
E TT, T19, T1k, T1h, T16, T1l, Tw, T1g;
TT = FNMS(KP951056516, TS, KP587785252 * TH);
T19 = FNMS(KP951056516, T18, KP587785252 * T17);
T1k = FMA(KP951056516, T17, KP587785252 * T18);
T1h = FMA(KP951056516, TH, KP587785252 * TS);
{
E T14, T15, Tu, Tv;
T14 = FNMS(KP250000000, T13, T10);
T15 = KP559016994 * (T11 - T12);
T16 = T14 - T15;
T1l = T15 + T14;
Tu = FNMS(KP250000000, Ts, T5);
Tv = KP559016994 * (Tg - Tr);
Tw = Tu - Tv;
T1g = Tv + Tu;
}
{
E TU, T1a, Tt, TV;
TU = Tw + TT;
T1a = T16 - T19;
Tt = W[4];
TV = W[5];
cr[WS(rs, 3)] = FNMS(TV, T1a, Tt * TU);
ci[WS(rs, 3)] = FMA(TV, TU, Tt * T1a);
}
{
E T1o, T1q, T1n, T1p;
T1o = T1g + T1h;
T1q = T1l - T1k;
T1n = W[16];
T1p = W[17];
cr[WS(rs, 9)] = FNMS(T1p, T1q, T1n * T1o);
ci[WS(rs, 9)] = FMA(T1p, T1o, T1n * T1q);
}
{
E T1c, T1e, T1b, T1d;
T1c = Tw - TT;
T1e = T19 + T16;
T1b = W[22];
T1d = W[23];
cr[WS(rs, 12)] = FNMS(T1d, T1e, T1b * T1c);
ci[WS(rs, 12)] = FMA(T1d, T1c, T1b * T1e);
}
{
E T1i, T1m, T1f, T1j;
T1i = T1g - T1h;
T1m = T1k + T1l;
T1f = W[10];
T1j = W[11];
cr[WS(rs, 6)] = FNMS(T1j, T1m, T1f * T1i);
ci[WS(rs, 6)] = FMA(T1j, T1i, T1f * T1m);
}
}
/* Outputs 10, 13, 1, 4 and 7. */
{
E T21, T2n, T26, T2q, T1M, T2y, T2m, T2f, T2A, T2r, T2x, T2z;
{
E T1T, T20, T24, T25;
T1T = T1P - T1S;
T20 = T1W - T1Z;
T21 = FMA(KP951056516, T1T, KP587785252 * T20);
T2n = FNMS(KP951056516, T20, KP587785252 * T1T);
T24 = T1u - T1x;
T25 = T1B - T1E;
T26 = FMA(KP951056516, T24, KP587785252 * T25);
T2q = FNMS(KP951056516, T25, KP587785252 * T24);
}
{
E T1G, T1K, T1L, T29, T2d, T2e;
T1G = KP559016994 * (T1y - T1F);
T1K = T1y + T1F;
T1L = FNMS(KP250000000, T1K, T1J);
T1M = T1G + T1L;
T2y = T1J + T1K;
T2m = T1L - T1G;
T29 = KP559016994 * (T27 - T28);
T2d = T27 + T28;
T2e = FNMS(KP250000000, T2d, T2c);
T2f = T29 + T2e;
T2A = T2c + T2d;
T2r = T2e - T29;
}
T2x = W[18];
T2z = W[19];
cr[WS(rs, 10)] = FNMS(T2z, T2A, T2x * T2y);
ci[WS(rs, 10)] = FMA(T2z, T2y, T2x * T2A);
{
E T2u, T2w, T2t, T2v;
T2u = T2m + T2n;
T2w = T2r - T2q;
T2t = W[24];
T2v = W[25];
cr[WS(rs, 13)] = FNMS(T2v, T2w, T2t * T2u);
ci[WS(rs, 13)] = FMA(T2v, T2u, T2t * T2w);
}
{
E T22, T2g, T1r, T23;
T22 = T1M - T21;
T2g = T26 + T2f;
T1r = W[0];
T23 = W[1];
cr[WS(rs, 1)] = FNMS(T23, T2g, T1r * T22);
ci[WS(rs, 1)] = FMA(T23, T22, T1r * T2g);
}
{
E T2i, T2k, T2h, T2j;
T2i = T1M + T21;
T2k = T2f - T26;
T2h = W[6];
T2j = W[7];
cr[WS(rs, 4)] = FNMS(T2j, T2k, T2h * T2i);
ci[WS(rs, 4)] = FMA(T2j, T2i, T2h * T2k);
}
{
E T2o, T2s, T2l, T2p;
T2o = T2m - T2n;
T2s = T2q + T2r;
T2l = W[12];
T2p = W[13];
cr[WS(rs, 7)] = FNMS(T2p, T2s, T2l * T2o);
ci[WS(rs, 7)] = FMA(T2p, T2o, T2l * T2s);
}
}
/* Outputs 5, 14, 2, 8 and 11. */
{
E T31, T3h, T36, T3k, T2K, T3g, T2Y, T2U, T3l, T39, T2B, T2L;
{
E T2Z, T30, T34, T35;
T2Z = T2N + T2O;
T30 = T2Q + T2R;
T31 = FNMS(KP951056516, T30, KP587785252 * T2Z);
T3h = FMA(KP951056516, T2Z, KP587785252 * T30);
T34 = T2D - T2E;
T35 = T2G - T2H;
T36 = FNMS(KP951056516, T35, KP587785252 * T34);
T3k = FMA(KP951056516, T34, KP587785252 * T35);
}
{
E T2X, T2J, T2W, T38, T2T, T37;
T2X = KP559016994 * (T2F - T2I);
T2J = T2F + T2I;
T2W = FNMS(KP250000000, T2J, T2C);
T2K = T2C + T2J;
T3g = T2X + T2W;
T2Y = T2W - T2X;
T38 = KP559016994 * (T2P - T2S);
T2T = T2P + T2S;
T37 = FNMS(KP250000000, T2T, T2M);
T2U = T2M + T2T;
T3l = T38 + T37;
T39 = T37 - T38;
}
T2B = W[8];
T2L = W[9];
cr[WS(rs, 5)] = FNMS(T2L, T2U, T2B * T2K);
ci[WS(rs, 5)] = FMA(T2L, T2K, T2B * T2U);
{
E T3o, T3q, T3n, T3p;
T3o = T3g + T3h;
T3q = T3l - T3k;
T3n = W[26];
T3p = W[27];
cr[WS(rs, 14)] = FNMS(T3p, T3q, T3n * T3o);
ci[WS(rs, 14)] = FMA(T3n, T3q, T3p * T3o);
}
{
E T32, T3a, T2V, T33;
T32 = T2Y - T31;
T3a = T36 + T39;
T2V = W[2];
T33 = W[3];
cr[WS(rs, 2)] = FNMS(T33, T3a, T2V * T32);
ci[WS(rs, 2)] = FMA(T2V, T3a, T33 * T32);
}
{
E T3c, T3e, T3b, T3d;
T3c = T2Y + T31;
T3e = T39 - T36;
T3b = W[14];
T3d = W[15];
cr[WS(rs, 8)] = FNMS(T3d, T3e, T3b * T3c);
ci[WS(rs, 8)] = FMA(T3b, T3e, T3d * T3c);
}
{
E T3i, T3m, T3f, T3j;
T3i = T3g - T3h;
T3m = T3k + T3l;
T3f = W[20];
T3j = W[21];
cr[WS(rs, 11)] = FNMS(T3j, T3m, T3f * T3i);
ci[WS(rs, 11)] = FMA(T3f, T3m, T3j * T3i);
}
}
}
}
}
/* Twiddle instruction table; referenced only through desc below. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 15 }, { TW_NEXT, 1, 0 } };

/*
 * Descriptor passed to the planner together with hb_15.  The trailing brace
 * group repeats the figures quoted in this variant's generator comment
 * (128 additions, 56 multiplications, 56 fused multiply/add).
 */
static const hc2hc_desc desc = {
     15,
     "hb_15",
     twinstr,
     &GENUS,
     { 128, 56, 56, 0 }
};

/* Hands hb_15 and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb_15)(planner *p)
{
     X(khc2hc_register)(p, hb_15, &desc);
}
#endif

View File

@@ -0,0 +1,833 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:51 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include rdft/scalar/hb.h */
/*
* This function contains 174 FP additions, 100 FP multiplications,
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
* 63 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_16 (FMA variant): size-16 twiddle codelet emitted by genfft's gen_hc2hc.
 *
 * cr, ci : data arrays; elements WS(rs, 0..15) of each are read and then
 *          overwritten in place.
 * W      : twiddle table; 30 values are consumed per loop iteration.
 * rs     : stride handed to WS() to address the 16 elements.
 * mb, me : half-open range [mb, me) of the loop counter m.
 * ms     : per-iteration step; cr advances by ms while ci retreats by ms.
 *
 * Output 0 is stored without a twiddle.  For k = 1..15 the pair
 * (cr[WS(rs, k)], ci[WS(rs, k)]) is combined with (W[2k-2], W[2k-1]).
 *
 * NOTE(review): FMA/FNMS/DK/WS/MAKE_VOLATILE_STRIDE come from project
 * headers; presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm.
 */
static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP923879532, +0.923879532511286756128183189396788286822416626); /* cos(pi/8) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* sqrt(2)/2 */
DK(KP414213562, +0.414213562373095048801688724209698078569671875); /* sqrt(2)-1 */
{
INT m;
/* W is first moved to the 30-entry twiddle group belonging to m == mb. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
E TA, T1O, T21, T1h, T2P, T2S, T3b, T3p, T3q, T3D, T1k, T1P, Tf, T3y, T2A;
E T36, TL, T22, T3s, T3t, T3z, T2F, T2U, T2K, T2V, Tu, T3E, TX, T1n, T1T;
E T24, T1W, T25, T18, T1m;
/* Even-indexed cr inputs and odd-indexed ci inputs: sums and differences. */
{
E T3, Tw, TJ, T2x, T1g, T2Q, T6, T1d, Ta, TB, Tz, T2R, TE, T2y, Td;
E TG;
{
E T1, T2, TH, TI;
T1 = cr[0];
T2 = ci[WS(rs, 7)];
T3 = T1 + T2;
Tw = T1 - T2;
TH = ci[WS(rs, 9)];
TI = cr[WS(rs, 14)];
TJ = TH + TI;
T2x = TH - TI;
}
{
E T1e, T1f, T4, T5;
T1e = ci[WS(rs, 15)];
T1f = cr[WS(rs, 8)];
T1g = T1e + T1f;
T2Q = T1e - T1f;
T4 = cr[WS(rs, 4)];
T5 = ci[WS(rs, 3)];
T6 = T4 + T5;
T1d = T4 - T5;
}
{
E T8, T9, Tx, Ty;
T8 = cr[WS(rs, 2)];
T9 = ci[WS(rs, 5)];
Ta = T8 + T9;
TB = T8 - T9;
Tx = ci[WS(rs, 11)];
Ty = cr[WS(rs, 12)];
Tz = Tx + Ty;
T2R = Tx - Ty;
}
{
E TC, TD, Tb, Tc;
TC = ci[WS(rs, 13)];
TD = cr[WS(rs, 10)];
TE = TC + TD;
T2y = TC - TD;
Tb = ci[WS(rs, 1)];
Tc = cr[WS(rs, 6)];
Td = Tb + Tc;
TG = Tb - Tc;
}
TA = Tw - Tz;
T1O = Tw + Tz;
T21 = T1g - T1d;
T1h = T1d + T1g;
T2P = Ta - Td;
T2S = T2Q - T2R;
T3b = T2S - T2P;
{
E T1i, T1j, T7, Te;
T3p = T2Q + T2R;
T3q = T2y + T2x;
T3D = T3p - T3q;
T1i = TB + TE;
T1j = TG + TJ;
T1k = T1i - T1j;
T1P = T1i + T1j;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
T3y = T7 - Te;
{
E T2w, T2z, TF, TK;
T2w = T3 - T6;
T2z = T2x - T2y;
T2A = T2w + T2z;
T36 = T2w - T2z;
TF = TB - TE;
TK = TG - TJ;
TL = TF + TK;
T22 = TF - TK;
}
}
}
/* Odd-indexed cr inputs and even-indexed ci inputs: sums and differences. */
{
E Ti, T13, T11, T2C, T16, T2D, Tl, TY, Tp, TS, TQ, T2H, TV, T2I, Ts;
E TN, T2B, T2E;
{
E Tg, Th, TZ, T10;
Tg = cr[WS(rs, 1)];
Th = ci[WS(rs, 6)];
Ti = Tg + Th;
T13 = Tg - Th;
TZ = ci[WS(rs, 14)];
T10 = cr[WS(rs, 9)];
T11 = TZ + T10;
T2C = TZ - T10;
}
{
E T14, T15, Tj, Tk;
T14 = ci[WS(rs, 10)];
T15 = cr[WS(rs, 13)];
T16 = T14 + T15;
T2D = T14 - T15;
Tj = cr[WS(rs, 5)];
Tk = ci[WS(rs, 2)];
Tl = Tj + Tk;
TY = Tj - Tk;
}
{
E Tn, To, TO, TP;
Tn = ci[0];
To = cr[WS(rs, 7)];
Tp = Tn + To;
TS = Tn - To;
TO = ci[WS(rs, 8)];
TP = cr[WS(rs, 15)];
TQ = TO + TP;
T2H = TO - TP;
}
{
E TT, TU, Tq, Tr;
TT = ci[WS(rs, 12)];
TU = cr[WS(rs, 11)];
TV = TT + TU;
T2I = TT - TU;
Tq = cr[WS(rs, 3)];
Tr = ci[WS(rs, 4)];
Ts = Tq + Tr;
TN = Tq - Tr;
}
T3s = T2C + T2D;
T3t = T2H + T2I;
T3z = T3t - T3s;
T2B = Ti - Tl;
T2E = T2C - T2D;
T2F = T2B - T2E;
T2U = T2B + T2E;
{
E T2G, T2J, Tm, Tt;
T2G = Tp - Ts;
T2J = T2H - T2I;
T2K = T2G + T2J;
T2V = T2J - T2G;
Tm = Ti + Tl;
Tt = Tp + Ts;
Tu = Tm + Tt;
T3E = Tm - Tt;
}
{
E TR, TW, T1R, T1S;
TR = TN - TQ;
TW = TS - TV;
TX = FNMS(KP414213562, TW, TR);
T1n = FMA(KP414213562, TR, TW);
T1R = T11 - TY;
T1S = T13 + T16;
T1T = FNMS(KP414213562, T1S, T1R);
T24 = FMA(KP414213562, T1R, T1S);
}
{
E T1U, T1V, T12, T17;
T1U = TN + TQ;
T1V = TS + TV;
T1W = FNMS(KP414213562, T1V, T1U);
T25 = FMA(KP414213562, T1U, T1V);
T12 = TY + T11;
T17 = T13 - T16;
T18 = FMA(KP414213562, T17, T12);
T1m = FNMS(KP414213562, T12, T17);
}
}
/* Output 0 (cr here, ci just below) is written without any twiddle factor. */
cr[0] = Tf + Tu;
/* ci[0] and output 8. */
{
E T3r, T3u, T3v, T3l, T3n, T3o, T3w, T3m;
T3r = T3p + T3q;
T3u = T3s + T3t;
T3v = T3r - T3u;
T3m = Tf - Tu;
T3l = W[14];
T3n = T3l * T3m;
T3o = W[15];
T3w = T3o * T3m;
ci[0] = T3r + T3u;
ci[WS(rs, 8)] = FMA(T3l, T3v, T3w);
cr[WS(rs, 8)] = FNMS(T3o, T3v, T3n);
}
/* Output 12. */
{
E T3A, T3F, T3B, T3G, T3x, T3C;
T3A = T3y - T3z;
T3F = T3D - T3E;
T3x = W[22];
T3B = T3x * T3A;
T3G = T3x * T3F;
T3C = W[23];
cr[WS(rs, 12)] = FNMS(T3C, T3F, T3B);
ci[WS(rs, 12)] = FMA(T3C, T3A, T3G);
}
/* Output 4. */
{
E T3I, T3L, T3J, T3M, T3H, T3K;
T3I = T3y + T3z;
T3L = T3E + T3D;
T3H = W[6];
T3J = T3H * T3I;
T3M = T3H * T3L;
T3K = W[7];
cr[WS(rs, 4)] = FNMS(T3K, T3L, T3J);
ci[WS(rs, 4)] = FMA(T3K, T3I, T3M);
}
/* Outputs 14 and 6. */
{
E T38, T3g, T3d, T3j, T37, T3c;
T37 = T2V - T2U;
T38 = FNMS(KP707106781, T37, T36);
T3g = FMA(KP707106781, T37, T36);
T3c = T2F - T2K;
T3d = FNMS(KP707106781, T3c, T3b);
T3j = FMA(KP707106781, T3c, T3b);
{
E T39, T3e, T35, T3a;
T35 = W[26];
T39 = T35 * T38;
T3e = T35 * T3d;
T3a = W[27];
cr[WS(rs, 14)] = FNMS(T3a, T3d, T39);
ci[WS(rs, 14)] = FMA(T3a, T38, T3e);
}
{
E T3h, T3k, T3f, T3i;
T3f = W[10];
T3h = T3f * T3g;
T3k = T3f * T3j;
T3i = W[11];
cr[WS(rs, 6)] = FNMS(T3i, T3j, T3h);
ci[WS(rs, 6)] = FMA(T3i, T3g, T3k);
}
}
/* Outputs 10 and 2. */
{
E T2M, T30, T2X, T33, T2L, T2T, T2W;
T2L = T2F + T2K;
T2M = FNMS(KP707106781, T2L, T2A);
T30 = FMA(KP707106781, T2L, T2A);
T2T = T2P + T2S;
T2W = T2U + T2V;
T2X = FNMS(KP707106781, T2W, T2T);
T33 = FMA(KP707106781, T2W, T2T);
{
E T2v, T2N, T2O, T2Y;
T2v = W[18];
T2N = T2v * T2M;
T2O = W[19];
T2Y = T2O * T2M;
cr[WS(rs, 10)] = FNMS(T2O, T2X, T2N);
ci[WS(rs, 10)] = FMA(T2v, T2X, T2Y);
}
{
E T2Z, T31, T32, T34;
T2Z = W[2];
T31 = T2Z * T30;
T32 = W[3];
T34 = T32 * T30;
cr[WS(rs, 2)] = FNMS(T32, T33, T31);
ci[WS(rs, 2)] = FMA(T2Z, T33, T34);
}
}
/* Outputs 11 and 3. */
{
E T1Y, T2a, T27, T2d;
{
E T1Q, T1X, T23, T26;
T1Q = FNMS(KP707106781, T1P, T1O);
T1X = T1T + T1W;
T1Y = FMA(KP923879532, T1X, T1Q);
T2a = FNMS(KP923879532, T1X, T1Q);
T23 = FMA(KP707106781, T22, T21);
T26 = T24 - T25;
T27 = FNMS(KP923879532, T26, T23);
T2d = FMA(KP923879532, T26, T23);
}
{
E T1N, T1Z, T20, T28;
T1N = W[20];
T1Z = T1N * T1Y;
T20 = W[21];
T28 = T20 * T1Y;
cr[WS(rs, 11)] = FNMS(T20, T27, T1Z);
ci[WS(rs, 11)] = FMA(T1N, T27, T28);
}
{
E T29, T2b, T2c, T2e;
T29 = W[4];
T2b = T29 * T2a;
T2c = W[5];
T2e = T2c * T2a;
cr[WS(rs, 3)] = FNMS(T2c, T2d, T2b);
ci[WS(rs, 3)] = FMA(T29, T2d, T2e);
}
}
/* Outputs 13 and 5. */
{
E T1a, T1s, T1p, T1v;
{
E TM, T19, T1l, T1o;
TM = FNMS(KP707106781, TL, TA);
T19 = TX - T18;
T1a = FNMS(KP923879532, T19, TM);
T1s = FMA(KP923879532, T19, TM);
T1l = FNMS(KP707106781, T1k, T1h);
T1o = T1m - T1n;
T1p = FNMS(KP923879532, T1o, T1l);
T1v = FMA(KP923879532, T1o, T1l);
}
{
E Tv, T1b, T1c, T1q;
Tv = W[24];
T1b = Tv * T1a;
T1c = W[25];
T1q = T1c * T1a;
cr[WS(rs, 13)] = FNMS(T1c, T1p, T1b);
ci[WS(rs, 13)] = FMA(Tv, T1p, T1q);
}
{
E T1r, T1t, T1u, T1w;
T1r = W[8];
T1t = T1r * T1s;
T1u = W[9];
T1w = T1u * T1s;
cr[WS(rs, 5)] = FNMS(T1u, T1v, T1t);
ci[WS(rs, 5)] = FMA(T1r, T1v, T1w);
}
}
/* Outputs 7 and 15. */
{
E T2i, T2q, T2n, T2t;
{
E T2g, T2h, T2l, T2m;
T2g = FMA(KP707106781, T1P, T1O);
T2h = T24 + T25;
T2i = FNMS(KP923879532, T2h, T2g);
T2q = FMA(KP923879532, T2h, T2g);
T2l = FNMS(KP707106781, T22, T21);
T2m = T1W - T1T;
T2n = FMA(KP923879532, T2m, T2l);
T2t = FNMS(KP923879532, T2m, T2l);
}
{
E T2j, T2o, T2f, T2k;
T2f = W[12];
T2j = T2f * T2i;
T2o = T2f * T2n;
T2k = W[13];
cr[WS(rs, 7)] = FNMS(T2k, T2n, T2j);
ci[WS(rs, 7)] = FMA(T2k, T2i, T2o);
}
{
E T2r, T2u, T2p, T2s;
T2p = W[28];
T2r = T2p * T2q;
T2u = T2p * T2t;
T2s = W[29];
cr[WS(rs, 15)] = FNMS(T2s, T2t, T2r);
ci[WS(rs, 15)] = FMA(T2s, T2q, T2u);
}
}
/* Outputs 9 and 1. */
{
E T1A, T1I, T1F, T1L;
{
E T1y, T1z, T1D, T1E;
T1y = FMA(KP707106781, TL, TA);
T1z = T1m + T1n;
T1A = FNMS(KP923879532, T1z, T1y);
T1I = FMA(KP923879532, T1z, T1y);
T1D = FMA(KP707106781, T1k, T1h);
T1E = T18 + TX;
T1F = FNMS(KP923879532, T1E, T1D);
T1L = FMA(KP923879532, T1E, T1D);
}
{
E T1B, T1G, T1x, T1C;
T1x = W[16];
T1B = T1x * T1A;
T1G = T1x * T1F;
T1C = W[17];
cr[WS(rs, 9)] = FNMS(T1C, T1F, T1B);
ci[WS(rs, 9)] = FMA(T1C, T1A, T1G);
}
{
E T1J, T1M, T1H, T1K;
T1H = W[0];
T1J = T1H * T1I;
T1M = T1H * T1L;
T1K = W[1];
cr[WS(rs, 1)] = FNMS(T1K, T1L, T1J);
ci[WS(rs, 1)] = FMA(T1K, T1I, T1M);
}
}
}
}
}
/* Twiddle instruction table; referenced only through desc below. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 16 }, { TW_NEXT, 1, 0 } };

/*
 * Descriptor passed to the planner together with hb_16.  The trailing brace
 * group repeats the figures quoted in this variant's generator comment
 * (104 additions, 30 multiplications, 70 fused multiply/add).
 */
static const hc2hc_desc desc = {
     16,
     "hb_16",
     twinstr,
     &GENUS,
     { 104, 30, 70, 0 }
};

/* Hands hb_16 and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hb_16)(planner *p)
{
     X(khc2hc_register)(p, hb_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include rdft/scalar/hb.h */
/*
* This function contains 174 FP additions, 84 FP multiplications,
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
* 50 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_16 (non-FMA variant): size-16 twiddle codelet emitted by genfft's
 * gen_hc2hc.
 *
 * cr, ci : data arrays; elements WS(rs, 0..15) of each are read and then
 *          overwritten in place.
 * W      : twiddle table; 30 values are consumed per loop iteration.
 * rs     : stride handed to WS() to address the 16 elements.
 * mb, me : half-open range [mb, me) of the loop counter m.
 * ms     : per-iteration step; cr advances by ms while ci retreats by ms.
 *
 * Output 0 is stored without a twiddle.  For k = 1..15 the pair
 * (cr[WS(rs, k)], ci[WS(rs, k)]) is combined with (W[2k-2], W[2k-1]).
 *
 * NOTE(review): FMA/FNMS/DK/WS/MAKE_VOLATILE_STRIDE come from project
 * headers; presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm.
 */
static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP382683432, +0.382683432365089771728459984030398866761344562); /* sin(pi/8) */
DK(KP923879532, +0.923879532511286756128183189396788286822416626); /* cos(pi/8) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* sqrt(2)/2 */
{
INT m;
/* W is first moved to the 30-entry twiddle group belonging to m == mb. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
E T7, T2K, T2W, Tw, T17, T1S, T2k, T1w, Te, TD, T1x, T10, T2n, T2L, T1Z;
E T2X, Tm, T1z, TN, T19, T2e, T2p, T2P, T2Z, Tt, T1A, TW, T1a, T27, T2q;
E T2S, T30;
/* Inputs cr[0], cr[4], cr[8], cr[12] with ci[3], ci[7], ci[11], ci[15]. */
{
E T3, T1Q, T16, T1R, T6, T2i, T13, T2j;
{
E T1, T2, T14, T15;
T1 = cr[0];
T2 = ci[WS(rs, 7)];
T3 = T1 + T2;
T1Q = T1 - T2;
T14 = ci[WS(rs, 11)];
T15 = cr[WS(rs, 12)];
T16 = T14 - T15;
T1R = T14 + T15;
}
{
E T4, T5, T11, T12;
T4 = cr[WS(rs, 4)];
T5 = ci[WS(rs, 3)];
T6 = T4 + T5;
T2i = T4 - T5;
T11 = ci[WS(rs, 15)];
T12 = cr[WS(rs, 8)];
T13 = T11 - T12;
T2j = T11 + T12;
}
T7 = T3 + T6;
T2K = T1Q + T1R;
T2W = T2j - T2i;
Tw = T3 - T6;
T17 = T13 - T16;
T1S = T1Q - T1R;
T2k = T2i + T2j;
T1w = T13 + T16;
}
/* Inputs cr[2], cr[6], cr[10], cr[14] with ci[1], ci[5], ci[9], ci[13]. */
{
E Ta, T1T, TC, T1U, Td, T1W, Tz, T1X;
{
E T8, T9, TA, TB;
T8 = cr[WS(rs, 2)];
T9 = ci[WS(rs, 5)];
Ta = T8 + T9;
T1T = T8 - T9;
TA = ci[WS(rs, 13)];
TB = cr[WS(rs, 10)];
TC = TA - TB;
T1U = TA + TB;
}
{
E Tb, Tc, Tx, Ty;
Tb = ci[WS(rs, 1)];
Tc = cr[WS(rs, 6)];
Td = Tb + Tc;
T1W = Tb - Tc;
Tx = ci[WS(rs, 9)];
Ty = cr[WS(rs, 14)];
Tz = Tx - Ty;
T1X = Tx + Ty;
}
Te = Ta + Td;
TD = Tz - TC;
T1x = TC + Tz;
T10 = Ta - Td;
{
E T2l, T2m, T1V, T1Y;
T2l = T1T + T1U;
T2m = T1W + T1X;
T2n = KP707106781 * (T2l - T2m);
T2L = KP707106781 * (T2l + T2m);
T1V = T1T - T1U;
T1Y = T1W - T1X;
T1Z = KP707106781 * (T1V + T1Y);
T2X = KP707106781 * (T1V - T1Y);
}
}
/* Inputs cr[1], cr[5], cr[9], cr[13] with ci[2], ci[6], ci[10], ci[14]. */
{
E Ti, T2b, TL, T2c, Tl, T28, TI, T29, TF, TM;
{
E Tg, Th, TJ, TK;
Tg = cr[WS(rs, 1)];
Th = ci[WS(rs, 6)];
Ti = Tg + Th;
T2b = Tg - Th;
TJ = ci[WS(rs, 10)];
TK = cr[WS(rs, 13)];
TL = TJ - TK;
T2c = TJ + TK;
}
{
E Tj, Tk, TG, TH;
Tj = cr[WS(rs, 5)];
Tk = ci[WS(rs, 2)];
Tl = Tj + Tk;
T28 = Tj - Tk;
TG = ci[WS(rs, 14)];
TH = cr[WS(rs, 9)];
TI = TG - TH;
T29 = TG + TH;
}
Tm = Ti + Tl;
T1z = TI + TL;
TF = Ti - Tl;
TM = TI - TL;
TN = TF - TM;
T19 = TF + TM;
{
E T2a, T2d, T2N, T2O;
T2a = T28 + T29;
T2d = T2b - T2c;
T2e = FMA(KP923879532, T2a, KP382683432 * T2d);
T2p = FNMS(KP382683432, T2a, KP923879532 * T2d);
T2N = T2b + T2c;
T2O = T29 - T28;
T2P = FNMS(KP923879532, T2O, KP382683432 * T2N);
T2Z = FMA(KP382683432, T2O, KP923879532 * T2N);
}
}
/* Inputs cr[3], cr[7], cr[11], cr[15] with ci[0], ci[4], ci[8], ci[12]. */
{
E Tp, T24, TU, T25, Ts, T21, TR, T22, TO, TV;
{
E Tn, To, TS, TT;
Tn = ci[0];
To = cr[WS(rs, 7)];
Tp = Tn + To;
T24 = Tn - To;
TS = ci[WS(rs, 12)];
TT = cr[WS(rs, 11)];
TU = TS - TT;
T25 = TS + TT;
}
{
E Tq, Tr, TP, TQ;
Tq = cr[WS(rs, 3)];
Tr = ci[WS(rs, 4)];
Ts = Tq + Tr;
T21 = Tq - Tr;
TP = ci[WS(rs, 8)];
TQ = cr[WS(rs, 15)];
TR = TP - TQ;
T22 = TP + TQ;
}
Tt = Tp + Ts;
T1A = TR + TU;
TO = Tp - Ts;
TV = TR - TU;
TW = TO + TV;
T1a = TV - TO;
{
E T23, T26, T2Q, T2R;
T23 = T21 - T22;
T26 = T24 - T25;
T27 = FNMS(KP382683432, T26, KP923879532 * T23);
T2q = FMA(KP382683432, T23, KP923879532 * T26);
T2Q = T24 + T25;
T2R = T21 + T22;
T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
T30 = FMA(KP382683432, T2R, KP923879532 * T2Q);
}
}
/* Output 0 (no twiddle factor) and output 8. */
{
E Tf, Tu, T1u, T1y, T1B, T1C, T1t, T1v;
Tf = T7 + Te;
Tu = Tm + Tt;
T1u = Tf - Tu;
T1y = T1w + T1x;
T1B = T1z + T1A;
T1C = T1y - T1B;
cr[0] = Tf + Tu;
ci[0] = T1y + T1B;
T1t = W[14];
T1v = W[15];
cr[WS(rs, 8)] = FNMS(T1v, T1C, T1t * T1u);
ci[WS(rs, 8)] = FMA(T1v, T1u, T1t * T1C);
}
/* Outputs 11 and 3. */
{
E T2U, T34, T32, T36;
{
E T2M, T2T, T2Y, T31;
T2M = T2K - T2L;
T2T = T2P + T2S;
T2U = T2M - T2T;
T34 = T2M + T2T;
T2Y = T2W + T2X;
T31 = T2Z - T30;
T32 = T2Y - T31;
T36 = T2Y + T31;
}
{
E T2J, T2V, T33, T35;
T2J = W[20];
T2V = W[21];
cr[WS(rs, 11)] = FNMS(T2V, T32, T2J * T2U);
ci[WS(rs, 11)] = FMA(T2V, T2U, T2J * T32);
T33 = W[4];
T35 = W[5];
cr[WS(rs, 3)] = FNMS(T35, T36, T33 * T34);
ci[WS(rs, 3)] = FMA(T35, T34, T33 * T36);
}
}
/* Outputs 7 and 15. */
{
E T3a, T3g, T3e, T3i;
{
E T38, T39, T3c, T3d;
T38 = T2K + T2L;
T39 = T2Z + T30;
T3a = T38 - T39;
T3g = T38 + T39;
T3c = T2W - T2X;
T3d = T2P - T2S;
T3e = T3c + T3d;
T3i = T3c - T3d;
}
{
E T37, T3b, T3f, T3h;
T37 = W[12];
T3b = W[13];
cr[WS(rs, 7)] = FNMS(T3b, T3e, T37 * T3a);
ci[WS(rs, 7)] = FMA(T37, T3e, T3b * T3a);
T3f = W[28];
T3h = W[29];
cr[WS(rs, 15)] = FNMS(T3h, T3i, T3f * T3g);
ci[WS(rs, 15)] = FMA(T3f, T3i, T3h * T3g);
}
}
/* Outputs 10 and 2. */
{
E TY, T1e, T1c, T1g;
{
E TE, TX, T18, T1b;
TE = Tw + TD;
TX = KP707106781 * (TN + TW);
TY = TE - TX;
T1e = TE + TX;
T18 = T10 + T17;
T1b = KP707106781 * (T19 + T1a);
T1c = T18 - T1b;
T1g = T18 + T1b;
}
{
E Tv, TZ, T1d, T1f;
Tv = W[18];
TZ = W[19];
cr[WS(rs, 10)] = FNMS(TZ, T1c, Tv * TY);
ci[WS(rs, 10)] = FMA(TZ, TY, Tv * T1c);
T1d = W[2];
T1f = W[3];
cr[WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
ci[WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
}
}
/* Outputs 14 and 6. */
{
E T1k, T1q, T1o, T1s;
{
E T1i, T1j, T1m, T1n;
T1i = Tw - TD;
T1j = KP707106781 * (T1a - T19);
T1k = T1i - T1j;
T1q = T1i + T1j;
T1m = T17 - T10;
T1n = KP707106781 * (TN - TW);
T1o = T1m - T1n;
T1s = T1m + T1n;
}
{
E T1h, T1l, T1p, T1r;
T1h = W[26];
T1l = W[27];
cr[WS(rs, 14)] = FNMS(T1l, T1o, T1h * T1k);
ci[WS(rs, 14)] = FMA(T1h, T1o, T1l * T1k);
T1p = W[10];
T1r = W[11];
cr[WS(rs, 6)] = FNMS(T1r, T1s, T1p * T1q);
ci[WS(rs, 6)] = FMA(T1p, T1s, T1r * T1q);
}
}
/* Outputs 13 and 5. */
{
E T2g, T2u, T2s, T2w;
{
E T20, T2f, T2o, T2r;
T20 = T1S - T1Z;
T2f = T27 - T2e;
T2g = T20 - T2f;
T2u = T20 + T2f;
T2o = T2k - T2n;
T2r = T2p - T2q;
T2s = T2o - T2r;
T2w = T2o + T2r;
}
{
E T1P, T2h, T2t, T2v;
T1P = W[24];
T2h = W[25];
cr[WS(rs, 13)] = FNMS(T2h, T2s, T1P * T2g);
ci[WS(rs, 13)] = FMA(T2h, T2g, T1P * T2s);
T2t = W[8];
T2v = W[9];
cr[WS(rs, 5)] = FNMS(T2v, T2w, T2t * T2u);
ci[WS(rs, 5)] = FMA(T2v, T2u, T2t * T2w);
}
}
/* Outputs 9 and 1. */
{
E T2A, T2G, T2E, T2I;
{
E T2y, T2z, T2C, T2D;
T2y = T1S + T1Z;
T2z = T2p + T2q;
T2A = T2y - T2z;
T2G = T2y + T2z;
T2C = T2k + T2n;
T2D = T2e + T27;
T2E = T2C - T2D;
T2I = T2C + T2D;
}
{
E T2x, T2B, T2F, T2H;
T2x = W[16];
T2B = W[17];
cr[WS(rs, 9)] = FNMS(T2B, T2E, T2x * T2A);
ci[WS(rs, 9)] = FMA(T2x, T2E, T2B * T2A);
T2F = W[0];
T2H = W[1];
cr[WS(rs, 1)] = FNMS(T2H, T2I, T2F * T2G);
ci[WS(rs, 1)] = FMA(T2F, T2I, T2H * T2G);
}
}
/* Outputs 12 and 4. */
{
E T1G, T1M, T1K, T1O;
{
E T1E, T1F, T1I, T1J;
T1E = T7 - Te;
T1F = T1A - T1z;
T1G = T1E - T1F;
T1M = T1E + T1F;
T1I = T1w - T1x;
T1J = Tm - Tt;
T1K = T1I - T1J;
T1O = T1J + T1I;
}
{
E T1D, T1H, T1L, T1N;
T1D = W[22];
T1H = W[23];
cr[WS(rs, 12)] = FNMS(T1H, T1K, T1D * T1G);
ci[WS(rs, 12)] = FMA(T1D, T1K, T1H * T1G);
T1L = W[6];
T1N = W[7];
cr[WS(rs, 4)] = FNMS(T1N, T1O, T1L * T1M);
ci[WS(rs, 4)] = FMA(T1L, T1O, T1N * T1M);
}
}
}
}
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 16) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_16: the value 16, the name string "hb_16", the table
 * above, &GENUS and a group of four counts.
 * NOTE(review): field meanings come from hc2hc_desc, which is declared
 * outside this chunk -- confirm there. */
static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, { 136, 46, 38, 0 } };
/* Registration entry point: passes hb_16 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_16) (planner *p) {
X(khc2hc_register) (p, hb_16, &desc);
}
#endif

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hb_2 -include rdft/scalar/hb.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 11 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_2, FMA-preferring variant.
 *
 * For every m in [mb, me) the four values cr[0], ci[0], cr[WS(rs, 1)] and
 * ci[WS(rs, 1)] are read and then overwritten in place.  The first output
 * pair is a plain sum/difference; the second pair is additionally combined
 * with the coefficient pair W[0], W[1].
 *
 * Between iterations cr moves forward by ms, ci moves backward by ms and W
 * moves forward by 2 entries.  W is pre-advanced by (mb - 1) * 2 entries.
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b;
 * both macros are defined outside this file chunk.
 */
static void hb_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += (mb - 1) * 2;
    for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
        /* All inputs are fetched before the first store. */
        E cr0 = cr[0];
        E ci0 = ci[0];
        E ci1 = ci[WS(rs, 1)];
        E cr1 = cr[WS(rs, 1)];
        E lo = cr0 - ci0;
        E hi = ci1 + cr1;

        cr[0] = cr0 + ci0;
        ci[0] = ci1 - cr1;
        {
            /* Products with lo are formed first, then fused with hi. */
            E w0 = W[0];
            E w0_lo = w0 * lo;
            E w1 = W[1];
            E w1_lo = w1 * lo;

            cr[WS(rs, 1)] = FNMS(w1, hi, w0_lo);
            ci[WS(rs, 1)] = FMA(w0, hi, w1_lo);
        }
    }
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 2) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_2: the value 2, the name string "hb_2", the table above,
 * &GENUS and four counts.  The first three counts (4, 2, 2) equal the
 * "4 additions, 2 multiplications, 2 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 2, "hb_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
/* Registration entry point: passes hb_2 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_2) (planner *p) {
X(khc2hc_register) (p, hb_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hb_2 -include rdft/scalar/hb.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 9 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_2, variant generated without the -fma flag.
 *
 * For every m in [mb, me) the four values cr[0], ci[0], cr[WS(rs, 1)] and
 * ci[WS(rs, 1)] are read and then overwritten in place.  The first output
 * pair is a plain sum/difference; the second pair is additionally combined
 * with the coefficient pair W[0], W[1].
 *
 * Between iterations cr moves forward by ms, ci moves backward by ms and W
 * moves forward by 2 entries.  W is pre-advanced by (mb - 1) * 2 entries.
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b;
 * both macros are defined outside this file chunk.
 */
static void hb_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += (mb - 1) * 2;
    for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
        /* All inputs are fetched before the first store. */
        E cr0 = cr[0];
        E ci0 = ci[0];
        E ci1 = ci[WS(rs, 1)];
        E cr1 = cr[WS(rs, 1)];
        E lo = cr0 - ci0;
        E hi = ci1 + cr1;

        cr[0] = cr0 + ci0;
        ci[0] = ci1 - cr1;
        {
            E w0 = W[0];
            E w1 = W[1];

            cr[WS(rs, 1)] = FNMS(w1, hi, w0 * lo);
            ci[WS(rs, 1)] = FMA(w1, lo, w0 * hi);
        }
    }
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 2) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_2: the value 2, the name string "hb_2", the table above,
 * &GENUS and four counts.  The first three counts (4, 2, 2) equal the
 * "4 additions, 2 multiplications, 2 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 2, "hb_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
/* Registration entry point: passes hb_2 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_2) (planner *p) {
X(khc2hc_register) (p, hb_2, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,166 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -dif -name hb_3 -include rdft/scalar/hb.h */
/*
* This function contains 16 FP additions, 14 FP multiplications,
* (or, 6 additions, 4 multiplications, 10 fused multiply/add),
* 17 stack variables, 2 constants, and 12 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_3, FMA-preferring variant.
 *
 * For every m in [mb, me) six values (cr and ci at offsets 0, WS(rs, 1) and
 * WS(rs, 2)) are read and then overwritten in place.  Output pair 0 is stored
 * without any W factor; output pairs 1 and 2 are combined with the
 * coefficient pairs (W[0], W[1]) and (W[2], W[3]) respectively.
 *
 * Between iterations cr moves forward by ms, ci moves backward by ms and W
 * moves forward by 4 entries.  W is pre-advanced by (mb - 1) * 4 entries.
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b;
 * both macros are defined outside this file chunk.
 */
static void hb_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    {
        INT m;

        W += (mb - 1) * 4;
        for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
            /* All inputs are fetched before the first store. */
            E cr0 = cr[0];
            E cr1 = cr[WS(rs, 1)];
            E ci0 = ci[0];
            E ci2 = ci[WS(rs, 2)];
            E ci1 = ci[WS(rs, 1)];
            E cr2 = cr[WS(rs, 2)];
            E sum10 = cr1 + ci0;
            E dif10 = cr1 - ci0;
            E dif12 = ci1 - cr2;
            E sum12 = ci1 + cr2;
            E mid_a = FNMS(KP500000000, sum10, cr0);
            E mid_b = FNMS(KP500000000, dif12, ci2);

            cr[0] = cr0 + sum10;
            ci[0] = ci2 + dif12;
            {
                /* Output pair 1: uses W[0], W[1]. */
                E part_b = FMA(KP866025403, dif10, mid_b);
                E part_a = FNMS(KP866025403, sum12, mid_a);
                E w0 = W[0];
                E w0_a = w0 * part_a;
                E w1 = W[1];
                E w1_a = w1 * part_a;

                cr[WS(rs, 1)] = FNMS(w1, part_b, w0_a);
                ci[WS(rs, 1)] = FMA(w0, part_b, w1_a);
            }
            {
                /* Output pair 2: same terms with the opposite signs, uses W[2], W[3]. */
                E part_b = FNMS(KP866025403, dif10, mid_b);
                E part_a = FMA(KP866025403, sum12, mid_a);
                E w2 = W[2];
                E w2_a = w2 * part_a;
                E w3 = W[3];
                E w3_a = w3 * part_a;

                cr[WS(rs, 2)] = FNMS(w3, part_b, w2_a);
                ci[WS(rs, 2)] = FMA(w2, part_b, w3_a);
            }
        }
    }
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 3) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_3: the value 3, the name string "hb_3", the table above,
 * &GENUS and four counts.  The first three counts (6, 4, 10) equal the
 * "6 additions, 4 multiplications, 10 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 3, "hb_3", twinstr, &GENUS, { 6, 4, 10, 0 } };
/* Registration entry point: passes hb_3 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_3) (planner *p) {
X(khc2hc_register) (p, hb_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 3 -dif -name hb_3 -include rdft/scalar/hb.h */
/*
* This function contains 16 FP additions, 12 FP multiplications,
* (or, 10 additions, 6 multiplications, 6 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_3, variant generated without the -fma flag.
 *
 * For every m in [mb, me) six values (cr and ci at offsets 0, WS(rs, 1) and
 * WS(rs, 2)) are read and then overwritten in place.  Output pair 0 is stored
 * without any W factor; output pairs 1 and 2 are combined with the
 * coefficient pairs (W[0], W[1]) and (W[2], W[3]) respectively.
 *
 * Between iterations cr moves forward by ms, ci moves backward by ms and W
 * moves forward by 4 entries.  W is pre-advanced by (mb - 1) * 4 entries.
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b;
 * both macros are defined outside this file chunk.
 */
static void hb_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    {
        INT m;

        W += (mb - 1) * 4;
        for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
            /* All inputs are fetched before the first store. */
            E cr0 = cr[0];
            E cr1 = cr[WS(rs, 1)];
            E ci0 = ci[0];
            E ci2 = ci[WS(rs, 2)];
            E ci1 = ci[WS(rs, 1)];
            E cr2 = cr[WS(rs, 2)];
            E sum10 = cr1 + ci0;
            E dif12 = ci1 - cr2;
            E mid_a = FNMS(KP500000000, sum10, cr0);
            E mid_b = FNMS(KP500000000, dif12, ci2);
            E scaled_b = KP866025403 * (cr1 - ci0);
            E scaled_a = KP866025403 * (ci1 + cr2);

            cr[0] = cr0 + sum10;
            ci[0] = ci2 + dif12;
            {
                /* Output pair 1: uses W[0], W[1]. */
                E part_a = mid_a - scaled_a;
                E part_b = scaled_b + mid_b;
                E w0 = W[0];
                E w1 = W[1];

                cr[WS(rs, 1)] = FNMS(w1, part_b, w0 * part_a);
                ci[WS(rs, 1)] = FMA(w0, part_b, w1 * part_a);
            }
            {
                /* Output pair 2: same terms with the opposite signs, uses W[2], W[3]. */
                E part_a = mid_a + scaled_a;
                E part_b = mid_b - scaled_b;
                E w2 = W[2];
                E w3 = W[3];

                cr[WS(rs, 2)] = FNMS(w3, part_b, w2 * part_a);
                ci[WS(rs, 2)] = FMA(w2, part_b, w3 * part_a);
            }
        }
    }
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 3) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_3: the value 3, the name string "hb_3", the table above,
 * &GENUS and four counts.  The first three counts (10, 6, 6) equal the
 * "10 additions, 6 multiplications, 6 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 3, "hb_3", twinstr, &GENUS, { 10, 6, 6, 0 } };
/* Registration entry point: passes hb_3 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_3) (planner *p) {
X(khc2hc_register) (p, hb_3, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hb_4 -include rdft/scalar/hb.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 22 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_4, FMA-preferring variant.
 *
 * For every m in [mb, me) eight values (cr and ci at offsets 0 and
 * WS(rs, 1..3)) are read and then overwritten in place.  Output pair 0 is
 * stored without any W factor; output pairs 1, 2 and 3 are combined with the
 * coefficient pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]).
 *
 * Between iterations cr moves forward by ms, ci moves backward by ms and W
 * moves forward by 6 entries.  W is pre-advanced by (mb - 1) * 6 entries.
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b;
 * both macros are defined outside this file chunk.
 */
static void hb_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += (mb - 1) * 6;
    for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
        /* All inputs are fetched before the first store. */
        E ci3 = ci[WS(rs, 3)];
        E cr2 = cr[WS(rs, 2)];
        E ci2 = ci[WS(rs, 2)];
        E cr3 = cr[WS(rs, 3)];
        E cr0 = cr[0];
        E ci1 = ci[WS(rs, 1)];
        E cr1 = cr[WS(rs, 1)];
        E ci0 = ci[0];
        E sum32 = ci3 + cr2;
        E dif32 = ci3 - cr2;
        E sum23 = ci2 + cr3;
        E dif23 = ci2 - cr3;
        E sum01 = cr0 + ci1;
        E dif01 = cr0 - ci1;
        E sum10 = cr1 + ci0;
        E dif10 = cr1 - ci0;
        E p2_a = sum01 - sum10;
        E p3_b = sum32 - dif10;
        E p3_a = dif01 + sum23;
        E p1_a = dif01 - sum23;
        E p1_b = dif10 + sum32;

        cr[0] = sum01 + sum10;
        ci[0] = dif32 + dif23;
        {
            /* Output pair 1: uses W[0], W[1]. */
            E w0 = W[0];
            E w0_a = w0 * p1_a;
            E w0_b = w0 * p1_b;
            E w1 = W[1];

            cr[WS(rs, 1)] = FNMS(w1, p1_b, w0_a);
            ci[WS(rs, 1)] = FMA(w1, p1_a, w0_b);
        }
        {
            /* Output pair 3: uses W[4], W[5]. */
            E w4 = W[4];
            E w4_a = w4 * p3_a;
            E w4_b = w4 * p3_b;
            E w5 = W[5];

            cr[WS(rs, 3)] = FNMS(w5, p3_b, w4_a);
            ci[WS(rs, 3)] = FMA(w5, p3_a, w4_b);
        }
        {
            /* Output pair 2: uses W[2], W[3]. */
            E p2_b = dif32 - dif23;
            E w3 = W[3];
            E w3_a = w3 * p2_a;
            E w2 = W[2];
            E w2_a = w2 * p2_a;

            cr[WS(rs, 2)] = FNMS(w3, p2_b, w2_a);
            ci[WS(rs, 2)] = FMA(w2, p2_b, w3_a);
        }
    }
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 4) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_4: the value 4, the name string "hb_4", the table above,
 * &GENUS and four counts.  The first three counts (16, 6, 6) equal the
 * "16 additions, 6 multiplications, 6 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 4, "hb_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
/* Registration entry point: passes hb_4 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_4) (planner *p) {
X(khc2hc_register) (p, hb_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hb_4 -include rdft/scalar/hb.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_4, variant generated without the -fma flag.
 *
 * For every m in [mb, me) eight values (cr and ci at offsets 0 and
 * WS(rs, 1..3)) are read and then overwritten in place.  Output pair 0 is
 * stored without any W factor; output pairs 1, 2 and 3 are combined with the
 * coefficient pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]).
 *
 * Between iterations cr moves forward by ms, ci moves backward by ms and W
 * moves forward by 6 entries.  W is pre-advanced by (mb - 1) * 6 entries.
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b;
 * both macros are defined outside this file chunk.
 */
static void hb_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += (mb - 1) * 6;
    for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
        /* All inputs are fetched before the first store. */
        E cr0 = cr[0];
        E ci1 = ci[WS(rs, 1)];
        E cr1 = cr[WS(rs, 1)];
        E ci0 = ci[0];
        E ci3 = ci[WS(rs, 3)];
        E cr2 = cr[WS(rs, 2)];
        E ci2 = ci[WS(rs, 2)];
        E cr3 = cr[WS(rs, 3)];
        E sum01 = cr0 + ci1;
        E dif01 = cr0 - ci1;
        E sum10 = cr1 + ci0;
        E dif10 = cr1 - ci0;
        E dif32 = ci3 - cr2;
        E sum32 = ci3 + cr2;
        E dif23 = ci2 - cr3;
        E sum23 = ci2 + cr3;

        cr[0] = sum01 + sum10;
        ci[0] = dif32 + dif23;
        {
            /* Output pair 2: uses W[2], W[3]. */
            E p2_a = sum01 - sum10;
            E p2_b = dif32 - dif23;
            E w2 = W[2];
            E w3 = W[3];

            cr[WS(rs, 2)] = FNMS(w3, p2_b, w2 * p2_a);
            ci[WS(rs, 2)] = FMA(w3, p2_a, w2 * p2_b);
        }
        {
            /* Output pair 1: uses W[0], W[1]. */
            E p1_a = dif01 - sum23;
            E p1_b = dif10 + sum32;
            E w0 = W[0];
            E w1 = W[1];

            cr[WS(rs, 1)] = FNMS(w1, p1_b, w0 * p1_a);
            ci[WS(rs, 1)] = FMA(w0, p1_b, w1 * p1_a);
        }
        {
            /* Output pair 3: uses W[4], W[5]. */
            E p3_a = dif01 + sum23;
            E p3_b = sum32 - dif10;
            E w4 = W[4];
            E w5 = W[5];

            cr[WS(rs, 3)] = FNMS(w5, p3_b, w4 * p3_a);
            ci[WS(rs, 3)] = FMA(w4, p3_b, w5 * p3_a);
        }
    }
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 4) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_4: the value 4, the name string "hb_4", the table above,
 * &GENUS and four counts.  The first three counts (16, 6, 6) equal the
 * "16 additions, 6 multiplications, 6 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 4, "hb_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
/* Registration entry point: passes hb_4 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_4) (planner *p) {
X(khc2hc_register) (p, hb_4, &desc);
}
#endif

View File

@@ -0,0 +1,274 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -dif -name hb_5 -include rdft/scalar/hb.h */
/*
* This function contains 40 FP additions, 34 FP multiplications,
* (or, 14 additions, 8 multiplications, 26 fused multiply/add),
* 27 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_5, FMA-preferring variant.
 *
 * For every m in [mb, me) ten values (cr and ci at offsets 0 and
 * WS(rs, 1..4)) are read and then overwritten in place.  Output pair 0 is
 * stored without any W factor; output pairs 1..4 are combined with the
 * coefficient pairs (W[0], W[1]) .. (W[6], W[7]).
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b,
 * and that DK(name, value) declares a constant; all three macros are defined
 * outside this file chunk.
 */
static void hb_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT m;
/* W is first advanced by (mb - 1) * 8 entries; each iteration then steps
 * cr by +ms, ci by -ms and W by 8. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
E T1, Tb, TM, Tw, T8, Ta, Tn, Tj, TH, Ts, Tq, Tr;
{
/* First input group: cr[0], cr[WS(rs, 1)], cr[WS(rs, 2)], ci[0], ci[WS(rs, 1)]. */
E T4, Tu, T7, Tv;
T1 = cr[0];
{
E T2, T3, T5, T6;
T2 = cr[WS(rs, 1)];
T3 = ci[0];
T4 = T2 + T3;
Tu = T2 - T3;
T5 = cr[WS(rs, 2)];
T6 = ci[WS(rs, 1)];
T7 = T5 + T6;
Tv = T5 - T6;
}
Tb = T4 - T7;
TM = FNMS(KP618033988, Tu, Tv);
Tw = FMA(KP618033988, Tv, Tu);
T8 = T4 + T7;
Ta = FNMS(KP250000000, T8, T1);
}
{
/* Second input group: ci[WS(rs, 2..4)], cr[WS(rs, 3)], cr[WS(rs, 4)]. */
E Tf, To, Ti, Tp;
Tn = ci[WS(rs, 4)];
{
E Td, Te, Tg, Th;
Td = ci[WS(rs, 3)];
Te = cr[WS(rs, 4)];
Tf = Td + Te;
To = Td - Te;
Tg = ci[WS(rs, 2)];
Th = cr[WS(rs, 3)];
Ti = Tg + Th;
Tp = Tg - Th;
}
Tj = FMA(KP618033988, Ti, Tf);
TH = FNMS(KP618033988, Tf, Ti);
Ts = To - Tp;
Tq = To + Tp;
Tr = FNMS(KP250000000, Tq, Tn);
}
/* Output pair 0: plain sums, no W factor.  All loads are complete here. */
cr[0] = T1 + T8;
ci[0] = Tn + Tq;
{
/* Output pairs 1 and 4 share Tc and Tt and differ in the sign of the
 * KP951056516 term. */
E Tk, TA, Tx, TD, Tc, Tt;
Tc = FMA(KP559016994, Tb, Ta);
Tk = FNMS(KP951056516, Tj, Tc);
TA = FMA(KP951056516, Tj, Tc);
Tt = FMA(KP559016994, Ts, Tr);
Tx = FMA(KP951056516, Tw, Tt);
TD = FNMS(KP951056516, Tw, Tt);
{
/* Pair 1 with W[0], W[1]. */
E T9, Tl, Tm, Ty;
T9 = W[0];
Tl = T9 * Tk;
Tm = W[1];
Ty = Tm * Tk;
cr[WS(rs, 1)] = FNMS(Tm, Tx, Tl);
ci[WS(rs, 1)] = FMA(T9, Tx, Ty);
}
{
/* Pair 4 with W[6], W[7]. */
E Tz, TB, TC, TE;
Tz = W[6];
TB = Tz * TA;
TC = W[7];
TE = TC * TA;
cr[WS(rs, 4)] = FNMS(TC, TD, TB);
ci[WS(rs, 4)] = FMA(Tz, TD, TE);
}
}
{
/* Output pairs 2 and 3 share TG and TL and differ in the sign of the
 * KP951056516 term. */
E TI, TQ, TN, TT, TG, TL;
TG = FNMS(KP559016994, Tb, Ta);
TI = FMA(KP951056516, TH, TG);
TQ = FNMS(KP951056516, TH, TG);
TL = FNMS(KP559016994, Ts, Tr);
TN = FNMS(KP951056516, TM, TL);
TT = FMA(KP951056516, TM, TL);
{
/* Pair 2 with W[2], W[3]. */
E TF, TJ, TK, TO;
TF = W[2];
TJ = TF * TI;
TK = W[3];
TO = TK * TI;
cr[WS(rs, 2)] = FNMS(TK, TN, TJ);
ci[WS(rs, 2)] = FMA(TF, TN, TO);
}
{
/* Pair 3 with W[4], W[5]. */
E TP, TR, TS, TU;
TP = W[4];
TR = TP * TQ;
TS = W[5];
TU = TS * TQ;
cr[WS(rs, 3)] = FNMS(TS, TT, TR);
ci[WS(rs, 3)] = FMA(TP, TT, TU);
}
}
}
}
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 5) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 5 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_5: the value 5, the name string "hb_5", the table above,
 * &GENUS and four counts.  The first three counts (14, 8, 26) equal the
 * "14 additions, 8 multiplications, 26 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 5, "hb_5", twinstr, &GENUS, { 14, 8, 26, 0 } };
/* Registration entry point: passes hb_5 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_5) (planner *p) {
X(khc2hc_register) (p, hb_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 5 -dif -name hb_5 -include rdft/scalar/hb.h */
/*
* This function contains 40 FP additions, 28 FP multiplications,
* (or, 26 additions, 14 multiplications, 14 fused multiply/add),
* 27 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_5, variant generated without the -fma flag.
 *
 * For every m in [mb, me) ten values (cr and ci at offsets 0 and
 * WS(rs, 1..4)) are read and then overwritten in place.  Output pair 0 is
 * stored without any W factor; output pairs 1..4 are combined with the
 * coefficient pairs (W[0], W[1]) .. (W[6], W[7]).
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b,
 * and that DK(name, value) declares a constant; all three macros are defined
 * outside this file chunk.
 */
static void hb_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
/* W is first advanced by (mb - 1) * 8 entries; each iteration then steps
 * cr by +ms, ci by -ms and W by 8. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
E T1, Tj, TG, Ts, T8, Ti, T9, Tn, TD, Tu, Tg, Tt;
{
/* First input group: cr[0], cr[WS(rs, 1)], cr[WS(rs, 2)], ci[0], ci[WS(rs, 1)]. */
E T4, Tq, T7, Tr;
T1 = cr[0];
{
E T2, T3, T5, T6;
T2 = cr[WS(rs, 1)];
T3 = ci[0];
T4 = T2 + T3;
Tq = T2 - T3;
T5 = cr[WS(rs, 2)];
T6 = ci[WS(rs, 1)];
T7 = T5 + T6;
Tr = T5 - T6;
}
Tj = KP559016994 * (T4 - T7);
TG = FMA(KP951056516, Tq, KP587785252 * Tr);
Ts = FNMS(KP951056516, Tr, KP587785252 * Tq);
T8 = T4 + T7;
Ti = FNMS(KP250000000, T8, T1);
}
{
/* Second input group: ci[WS(rs, 2..4)], cr[WS(rs, 3)], cr[WS(rs, 4)]. */
E Tc, Tl, Tf, Tm;
T9 = ci[WS(rs, 4)];
{
E Ta, Tb, Td, Te;
Ta = ci[WS(rs, 3)];
Tb = cr[WS(rs, 4)];
Tc = Ta - Tb;
Tl = Ta + Tb;
Td = ci[WS(rs, 2)];
Te = cr[WS(rs, 3)];
Tf = Td - Te;
Tm = Td + Te;
}
Tn = FNMS(KP951056516, Tm, KP587785252 * Tl);
TD = FMA(KP951056516, Tl, KP587785252 * Tm);
Tu = KP559016994 * (Tc - Tf);
Tg = Tc + Tf;
Tt = FNMS(KP250000000, Tg, T9);
}
/* Output pair 0: plain sums, no W factor.  All loads are complete here. */
cr[0] = T1 + T8;
ci[0] = T9 + Tg;
{
/* Output pairs 2 and 3 share Tk and Tv; pair 2 uses W[2], W[3] and
 * pair 3 uses W[4], W[5]. */
E To, Ty, Tw, TA, Tk, Tv;
Tk = Ti - Tj;
To = Tk - Tn;
Ty = Tk + Tn;
Tv = Tt - Tu;
Tw = Ts + Tv;
TA = Tv - Ts;
{
E Th, Tp, Tx, Tz;
Th = W[2];
Tp = W[3];
cr[WS(rs, 2)] = FNMS(Tp, Tw, Th * To);
ci[WS(rs, 2)] = FMA(Th, Tw, Tp * To);
Tx = W[4];
Tz = W[5];
cr[WS(rs, 3)] = FNMS(Tz, TA, Tx * Ty);
ci[WS(rs, 3)] = FMA(Tx, TA, Tz * Ty);
}
}
{
/* Output pairs 1 and 4 share TC and TH; pair 1 uses W[0], W[1] and
 * pair 4 uses W[6], W[7]. */
E TE, TK, TI, TM, TC, TH;
TC = Tj + Ti;
TE = TC - TD;
TK = TC + TD;
TH = Tu + Tt;
TI = TG + TH;
TM = TH - TG;
{
E TB, TF, TJ, TL;
TB = W[0];
TF = W[1];
cr[WS(rs, 1)] = FNMS(TF, TI, TB * TE);
ci[WS(rs, 1)] = FMA(TB, TI, TF * TE);
TJ = W[6];
TL = W[7];
cr[WS(rs, 4)] = FNMS(TL, TM, TJ * TK);
ci[WS(rs, 4)] = FMA(TJ, TM, TL * TK);
}
}
}
}
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 5) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 5 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_5: the value 5, the name string "hb_5", the table above,
 * &GENUS and four counts.  The first three counts (26, 14, 14) equal the
 * "26 additions, 14 multiplications, 14 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 5, "hb_5", twinstr, &GENUS, { 26, 14, 14, 0 } };
/* Registration entry point: passes hb_5 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_5) (planner *p) {
X(khc2hc_register) (p, hb_5, &desc);
}
#endif

View File

@@ -0,0 +1,292 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hb_6 -include rdft/scalar/hb.h */
/*
* This function contains 46 FP additions, 32 FP multiplications,
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
* 31 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_6, FMA-preferring variant.
 *
 * For every m in [mb, me) twelve values (cr and ci at offsets 0 and
 * WS(rs, 1..5)) are read and then overwritten in place.  Output pair 0 is
 * stored without any W factor; output pairs 1..5 are combined with the
 * coefficient pairs (W[0], W[1]) .. (W[8], W[9]).
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b,
 * and that DK(name, value) declares a constant; all three macros are defined
 * outside this file chunk.
 */
static void hb_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* W is first advanced by (mb - 1) * 10 entries; each iteration then steps
 * cr by +ms, ci by -ms and W by 10. */
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
E Td, Tn, TO, TJ, TN, Tk, Tr, T3, TC, Ts, TQ, Ta, Tm, TF, TG;
{
/* First input group: cr and ci at WS(rs, 3), WS(rs, 4) and WS(rs, 5). */
E Tb, Tc, Tg, TH, Tj, TI;
Tb = ci[WS(rs, 5)];
Tc = cr[WS(rs, 3)];
Td = Tb - Tc;
{
E Te, Tf, Th, Ti;
Te = ci[WS(rs, 3)];
Tf = cr[WS(rs, 5)];
Tg = Te - Tf;
TH = Te + Tf;
Th = ci[WS(rs, 4)];
Ti = cr[WS(rs, 4)];
Tj = Th - Ti;
TI = Th + Ti;
}
Tn = Tj - Tg;
TO = TH - TI;
TJ = TH + TI;
TN = Tb + Tc;
Tk = Tg + Tj;
Tr = FNMS(KP500000000, Tk, Td);
}
{
/* Second input group: cr and ci at 0, WS(rs, 1) and WS(rs, 2). */
E T6, TD, T9, TE, T1, T2;
T1 = cr[0];
T2 = ci[WS(rs, 2)];
T3 = T1 + T2;
TC = T1 - T2;
{
E T4, T5, T7, T8;
T4 = cr[WS(rs, 2)];
T5 = ci[0];
T6 = T4 + T5;
TD = T4 - T5;
T7 = ci[WS(rs, 1)];
T8 = cr[WS(rs, 1)];
T9 = T7 + T8;
TE = T7 - T8;
}
Ts = T6 - T9;
TQ = TD - TE;
Ta = T6 + T9;
Tm = FNMS(KP500000000, Ta, T3);
TF = TD + TE;
TG = FNMS(KP500000000, TF, TC);
}
/* Output pair 0: plain sums, no W factor.  All loads are complete here. */
cr[0] = T3 + Ta;
ci[0] = Td + Tk;
{
/* Output pair 2 with W[2], W[3]. */
E To, Tt, Tp, Tu, Tl, Tq;
To = FNMS(KP866025403, Tn, Tm);
Tt = FNMS(KP866025403, Ts, Tr);
Tl = W[2];
Tp = Tl * To;
Tu = Tl * Tt;
Tq = W[3];
cr[WS(rs, 2)] = FNMS(Tq, Tt, Tp);
ci[WS(rs, 2)] = FMA(Tq, To, Tu);
}
{
/* Output pair 3 with W[4], W[5]. */
E T13, TZ, T11, T12, T14, T10;
T13 = TN + TO;
T10 = TC + TF;
TZ = W[4];
T11 = TZ * T10;
T12 = W[5];
T14 = T12 * T10;
cr[WS(rs, 3)] = FNMS(T12, T13, T11);
ci[WS(rs, 3)] = FMA(TZ, T13, T14);
}
{
/* Output pair 4 with W[6], W[7]; same terms as pair 2 with the
 * KP866025403 products added instead of subtracted. */
E Tw, Tz, Tx, TA, Tv, Ty;
Tw = FMA(KP866025403, Tn, Tm);
Tz = FMA(KP866025403, Ts, Tr);
Tv = W[6];
Tx = Tv * Tw;
TA = Tv * Tz;
Ty = W[7];
cr[WS(rs, 4)] = FNMS(Ty, Tz, Tx);
ci[WS(rs, 4)] = FMA(Ty, Tw, TA);
}
{
/* Output pairs 1 (W[0], W[1]) and 5 (W[8], W[9]); both are computed
 * before either is stored. */
E TR, TX, TT, TV, TW, TY, TB, TL, TM, TS, TP, TU, TK;
TP = FNMS(KP500000000, TO, TN);
TR = FMA(KP866025403, TQ, TP);
TX = FNMS(KP866025403, TQ, TP);
TU = FMA(KP866025403, TJ, TG);
TT = W[8];
TV = TT * TU;
TW = W[9];
TY = TW * TU;
TK = FNMS(KP866025403, TJ, TG);
TB = W[0];
TL = TB * TK;
TM = W[1];
TS = TM * TK;
cr[WS(rs, 1)] = FNMS(TM, TR, TL);
ci[WS(rs, 1)] = FMA(TB, TR, TS);
cr[WS(rs, 5)] = FNMS(TW, TX, TV);
ci[WS(rs, 5)] = FMA(TT, TX, TY);
}
}
}
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 6) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 6 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_6: the value 6, the name string "hb_6", the table above,
 * &GENUS and four counts.  The first three counts (24, 10, 22) equal the
 * "24 additions, 10 multiplications, 22 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 6, "hb_6", twinstr, &GENUS, { 24, 10, 22, 0 } };
/* Registration entry point: passes hb_6 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_6) (planner *p) {
X(khc2hc_register) (p, hb_6, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hb_6 -include rdft/scalar/hb.h */
/*
* This function contains 46 FP additions, 28 FP multiplications,
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
* 27 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_6, variant generated without the -fma flag.
 *
 * For every m in [mb, me) twelve values (cr and ci at offsets 0 and
 * WS(rs, 1..5)) are read and then overwritten in place.  Output pair 0 is
 * stored without any W factor; output pairs 1..5 are combined with the
 * coefficient pairs (W[0], W[1]) .. (W[8], W[9]).
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b,
 * and that DK(name, value) declares a constant; all three macros are defined
 * outside this file chunk.
 */
static void hb_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* W is first advanced by (mb - 1) * 10 entries; each iteration then steps
 * cr by +ms, ci by -ms and W by 10. */
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
E T3, Ty, Ta, TO, Tr, TB, Td, TE, Tk, TL, Tn, TH;
{
/* Load phase: every input is read inside this block, before any store. */
E T1, T2, Tb, Tc;
T1 = cr[0];
T2 = ci[WS(rs, 2)];
T3 = T1 + T2;
Ty = T1 - T2;
{
E T6, Tz, T9, TA;
{
E T4, T5, T7, T8;
T4 = cr[WS(rs, 2)];
T5 = ci[0];
T6 = T4 + T5;
Tz = T4 - T5;
T7 = ci[WS(rs, 1)];
T8 = cr[WS(rs, 1)];
T9 = T7 + T8;
TA = T7 - T8;
}
Ta = T6 + T9;
TO = KP866025403 * (Tz - TA);
Tr = KP866025403 * (T6 - T9);
TB = Tz + TA;
}
Tb = ci[WS(rs, 5)];
Tc = cr[WS(rs, 3)];
Td = Tb - Tc;
TE = Tb + Tc;
{
E Tg, TG, Tj, TF;
{
E Te, Tf, Th, Ti;
Te = ci[WS(rs, 3)];
Tf = cr[WS(rs, 5)];
Tg = Te - Tf;
TG = Te + Tf;
Th = ci[WS(rs, 4)];
Ti = cr[WS(rs, 4)];
Tj = Th - Ti;
TF = Th + Ti;
}
Tk = Tg + Tj;
TL = KP866025403 * (TG + TF);
Tn = KP866025403 * (Tj - Tg);
TH = TF - TG;
}
}
/* Output pair 0: plain sums, no W factor. */
cr[0] = T3 + Ta;
ci[0] = Td + Tk;
{
/* Output pair 3 with W[4], W[5]. */
E TC, TI, Tx, TD;
TC = Ty + TB;
TI = TE - TH;
Tx = W[4];
TD = W[5];
cr[WS(rs, 3)] = FNMS(TD, TI, Tx * TC);
ci[WS(rs, 3)] = FMA(TD, TC, Tx * TI);
}
{
/* Output pairs 2 (W[2], W[3]) and 4 (W[6], W[7]) share Tm and Tq. */
E To, Tu, Ts, Tw, Tm, Tq;
Tm = FNMS(KP500000000, Ta, T3);
To = Tm - Tn;
Tu = Tm + Tn;
Tq = FNMS(KP500000000, Tk, Td);
Ts = Tq - Tr;
Tw = Tr + Tq;
{
E Tl, Tp, Tt, Tv;
Tl = W[2];
Tp = W[3];
cr[WS(rs, 2)] = FNMS(Tp, Ts, Tl * To);
ci[WS(rs, 2)] = FMA(Tl, Ts, Tp * To);
Tt = W[6];
Tv = W[7];
cr[WS(rs, 4)] = FNMS(Tv, Tw, Tt * Tu);
ci[WS(rs, 4)] = FMA(Tt, Tw, Tv * Tu);
}
}
{
/* Output pairs 1 (W[0], W[1]) and 5 (W[8], W[9]) share TK and TP. */
E TM, TS, TQ, TU, TK, TP;
TK = FNMS(KP500000000, TB, Ty);
TM = TK - TL;
TS = TK + TL;
TP = FMA(KP500000000, TH, TE);
TQ = TO + TP;
TU = TP - TO;
{
E TJ, TN, TR, TT;
TJ = W[0];
TN = W[1];
cr[WS(rs, 1)] = FNMS(TN, TQ, TJ * TM);
ci[WS(rs, 1)] = FMA(TN, TM, TJ * TQ);
TR = W[8];
TT = W[9];
cr[WS(rs, 5)] = FNMS(TT, TU, TR * TS);
ci[WS(rs, 5)] = FMA(TT, TS, TR * TU);
}
}
}
}
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 6) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 6 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_6: the value 6, the name string "hb_6", the table above,
 * &GENUS and four counts.  The first three counts (32, 14, 14) equal the
 * "32 additions, 14 multiplications, 14 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 6, "hb_6", twinstr, &GENUS, { 32, 14, 14, 0 } };
/* Registration entry point: passes hb_6 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_6) (planner *p) {
X(khc2hc_register) (p, hb_6, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,356 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -dif -name hb_7 -include rdft/scalar/hb.h */
/*
* This function contains 72 FP additions, 66 FP multiplications,
* (or, 18 additions, 12 multiplications, 54 fused multiply/add),
* 41 stack variables, 6 constants, and 28 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_7, FMA-preferring variant.
 *
 * For every m in [mb, me) fourteen values (cr and ci at offsets 0 and
 * WS(rs, 1..6)) are read and then overwritten in place.  Output pair 0 is
 * stored without any W factor; output pairs 1..6 are combined with the
 * coefficient pairs (W[0], W[1]) .. (W[10], W[11]).
 *
 * NOTE(review): assumes FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b,
 * and that DK(name, value) declares a constant; all three macros are defined
 * outside this file chunk.
 */
static void hb_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
{
INT m;
/* W is first advanced by (mb - 1) * 12 entries; each iteration then steps
 * cr by +ms, ci by -ms and W by 12. */
for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
E T1, T4, TC, T7, TB, Ta, TA, TD, TZ, T1l, T1b, TP, Td, Tt, Tw;
E Tv, Tu, Tp, Ty, T1j, T1e, TX, TS;
T1 = cr[0];
{
/* First input group: cr[WS(rs, 1..3)] paired with ci[0], ci[WS(rs, 1)],
 * ci[WS(rs, 2)]; sums in T4, T7, Ta and differences in TC, TB, TA. */
E T2, T3, T1a, TO, Tc;
T2 = cr[WS(rs, 1)];
T3 = ci[0];
T4 = T2 + T3;
TC = T2 - T3;
{
E T5, T6, T8, T9;
T5 = cr[WS(rs, 2)];
T6 = ci[WS(rs, 1)];
T7 = T5 + T6;
TB = T5 - T6;
T8 = cr[WS(rs, 3)];
T9 = ci[WS(rs, 2)];
Ta = T8 + T9;
TA = T8 - T9;
}
TD = FNMS(KP554958132, TC, TB);
TZ = FMA(KP554958132, TB, TA);
T1l = FMA(KP554958132, TA, TC);
T1a = FNMS(KP356895867, T7, T4);
T1b = FNMS(KP692021471, T1a, Ta);
TO = FNMS(KP356895867, T4, Ta);
TP = FNMS(KP692021471, TO, T7);
Tc = FNMS(KP356895867, Ta, T7);
Td = FNMS(KP692021471, Tc, T4);
}
Tt = ci[WS(rs, 6)];
{
/* Second input group: ci[WS(rs, 3..5)] paired with cr[WS(rs, 4..6)];
 * sums in Th, Tk, Tn and differences in Tw, Tv, Tu. */
E Th, Tk, Tn, Tf, Tg;
Tf = ci[WS(rs, 3)];
Tg = cr[WS(rs, 4)];
Th = Tf + Tg;
Tw = Tf - Tg;
{
E Ti, Tj, Tl, Tm;
Ti = ci[WS(rs, 4)];
Tj = cr[WS(rs, 5)];
Tk = Ti + Tj;
Tv = Ti - Tj;
Tl = ci[WS(rs, 5)];
Tm = cr[WS(rs, 6)];
Tn = Tl + Tm;
Tu = Tl - Tm;
}
{
E To, Tx, T1i, T1d, TW, TR;
To = FNMS(KP554958132, Tn, Tk);
Tp = FNMS(KP801937735, To, Th);
Tx = FNMS(KP356895867, Tw, Tv);
Ty = FNMS(KP692021471, Tx, Tu);
T1i = FNMS(KP356895867, Tv, Tu);
T1j = FNMS(KP692021471, T1i, Tw);
T1d = FMA(KP554958132, Th, Tn);
T1e = FMA(KP801937735, T1d, Tk);
TW = FNMS(KP356895867, Tu, Tw);
TX = FNMS(KP692021471, TW, Tv);
TR = FMA(KP554958132, Tk, Th);
TS = FNMS(KP801937735, TR, Tn);
}
}
/* Output pair 0: plain sums, no W factor.  All loads are complete here. */
cr[0] = T1 + T4 + T7 + Ta;
ci[0] = Tt + Tu + Tv + Tw;
{
/* Output pairs 3 (W[4], W[5]) and 4 (W[6], W[7]) share Te, Tz and TE and
 * differ in the sign of the KP974927912 term. */
E Tq, TI, TF, TL, Te, Tz, TE;
Te = FNMS(KP900968867, Td, T1);
Tq = FNMS(KP974927912, Tp, Te);
TI = FMA(KP974927912, Tp, Te);
Tz = FNMS(KP900968867, Ty, Tt);
TE = FNMS(KP801937735, TD, TA);
TF = FMA(KP974927912, TE, Tz);
TL = FNMS(KP974927912, TE, Tz);
{
E Tb, Tr, Ts, TG;
Tb = W[4];
Tr = Tb * Tq;
Ts = W[5];
TG = Ts * Tq;
cr[WS(rs, 3)] = FNMS(Ts, TF, Tr);
ci[WS(rs, 3)] = FMA(Tb, TF, TG);
}
{
E TH, TJ, TK, TM;
TH = W[6];
TJ = TH * TI;
TK = W[7];
TM = TK * TI;
cr[WS(rs, 4)] = FNMS(TK, TL, TJ);
ci[WS(rs, 4)] = FMA(TH, TL, TM);
}
}
{
/* Output pairs 2 (W[2], W[3]) and 5 (W[8], W[9]) share TQ, TY and T10 and
 * differ in the sign of the KP974927912 term. */
E TT, T14, T11, T17, TQ, TY, T10;
TQ = FNMS(KP900968867, TP, T1);
TT = FNMS(KP974927912, TS, TQ);
T14 = FMA(KP974927912, TS, TQ);
TY = FNMS(KP900968867, TX, Tt);
T10 = FNMS(KP801937735, TZ, TC);
T11 = FMA(KP974927912, T10, TY);
T17 = FNMS(KP974927912, T10, TY);
{
E TN, TU, TV, T12;
TN = W[2];
TU = TN * TT;
TV = W[3];
T12 = TV * TT;
cr[WS(rs, 2)] = FNMS(TV, T11, TU);
ci[WS(rs, 2)] = FMA(TN, T11, T12);
}
{
E T13, T15, T16, T18;
T13 = W[8];
T15 = T13 * T14;
T16 = W[9];
T18 = T16 * T14;
cr[WS(rs, 5)] = FNMS(T16, T17, T15);
ci[WS(rs, 5)] = FMA(T13, T17, T18);
}
}
{
/* Output pairs 1 (W[0], W[1]) and 6 (W[10], W[11]) share T1c, T1k and T1m
 * and differ in the sign of the KP974927912 term. */
E T1f, T1q, T1n, T1t, T1c, T1k, T1m;
T1c = FNMS(KP900968867, T1b, T1);
T1f = FNMS(KP974927912, T1e, T1c);
T1q = FMA(KP974927912, T1e, T1c);
T1k = FNMS(KP900968867, T1j, Tt);
T1m = FMA(KP801937735, T1l, TB);
T1n = FMA(KP974927912, T1m, T1k);
T1t = FNMS(KP974927912, T1m, T1k);
{
E T19, T1g, T1h, T1o;
T19 = W[0];
T1g = T19 * T1f;
T1h = W[1];
T1o = T1h * T1f;
cr[WS(rs, 1)] = FNMS(T1h, T1n, T1g);
ci[WS(rs, 1)] = FMA(T19, T1n, T1o);
}
{
E T1p, T1r, T1s, T1u;
T1p = W[10];
T1r = T1p * T1q;
T1s = W[11];
T1u = T1s * T1q;
cr[WS(rs, 6)] = FNMS(T1s, T1t, T1r);
ci[WS(rs, 6)] = FMA(T1p, T1t, T1u);
}
}
}
}
}
/* Instruction table referenced by desc below: one TW_FULL entry with
 * arguments (1, 7) followed by one TW_NEXT entry. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for hb_7: the value 7, the name string "hb_7", the table above,
 * &GENUS and four counts.  The first three counts (18, 12, 54) equal the
 * "18 additions, 12 multiplications, 54 fused multiply/add" figures in the
 * generator comment of this variant. */
static const hc2hc_desc desc = { 7, "hb_7", twinstr, &GENUS, { 18, 12, 54, 0 } };
/* Registration entry point: passes hb_7 and its descriptor to
 * X(khc2hc_register) together with the planner p. */
void X(codelet_hb_7) (planner *p) {
X(khc2hc_register) (p, hb_7, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -dif -name hb_7 -include rdft/scalar/hb.h */
/*
* This function contains 72 FP additions, 60 FP multiplications,
* (or, 36 additions, 24 multiplications, 36 fused multiply/add),
* 36 stack variables, 6 constants, and 28 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_7 (variant generated without -fma): in-place size-7 butterfly over the
 * split arrays cr/ci, followed by a complex multiplication of outputs 1..6 by
 * twiddle factors read from W.  Per the generator command line above this is
 * the -sign 1, -dif hc2hc codelet for n = 7.
 *
 * cr, ci  data arrays, read and overwritten in place; element k is accessed
 *         as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..6.
 * W       twiddle table; 12 values are consumed per iteration, as the pairs
 *         (W[2k-2], W[2k-1]) for output k = 1..6.  The pointer is first
 *         advanced by (mb - 1) * 12.
 * rs      stride argument passed to WS().
 * mb, me  the loop runs for m = mb, ..., me - 1.
 * ms      per iteration cr advances by +ms and ci by -ms.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FNMA(a,b,c) = -(a*b) - c; these macros are not
 * defined in this chunk -- confirm against the FFTW internal header.
 */
static void hb_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: 0.2225... = cos(3*pi/7), 0.9009... = cos(pi/7),
 * 0.6234... = cos(2*pi/7), 0.7818... = sin(2*pi/7),
 * 0.9749... = sin(3*pi/7), 0.4338... = sin(pi/7). */
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
E T1, T4, T7, Ta, Tx, TI, TV, TQ, TE, Tm, Tb, Te, Th, Tk, Tq;
E TF, TR, TU, TJ, Tt;
{
/* First half of the inputs: cr[0..3] paired with ci[0..2].  Sums go to
 * T4/T7/Ta, differences to Tu/Tw/Tv. */
E Tu, Tw, Tv, T2, T3;
T1 = cr[0];
T2 = cr[WS(rs, 1)];
T3 = ci[0];
T4 = T2 + T3;
Tu = T2 - T3;
{
E T5, T6, T8, T9;
T5 = cr[WS(rs, 2)];
T6 = ci[WS(rs, 1)];
T7 = T5 + T6;
Tw = T5 - T6;
T8 = cr[WS(rs, 3)];
T9 = ci[WS(rs, 2)];
Ta = T8 + T9;
Tv = T8 - T9;
}
/* Tx, TI, TV: sine-weighted combinations of the differences. */
Tx = FMA(KP433883739, Tu, KP974927912 * Tv) - (KP781831482 * Tw);
TI = FMA(KP781831482, Tu, KP974927912 * Tw) + (KP433883739 * Tv);
TV = FNMS(KP781831482, Tv, KP974927912 * Tu) - (KP433883739 * Tw);
/* TQ, TE, Tm: T1 plus cosine-weighted combinations of the sums. */
TQ = FMA(KP623489801, Ta, T1) + FNMA(KP900968867, T7, KP222520933 * T4);
TE = FMA(KP623489801, T4, T1) + FNMA(KP900968867, Ta, KP222520933 * T7);
Tm = FMA(KP623489801, T7, T1) + FNMA(KP222520933, Ta, KP900968867 * T4);
}
{
/* Second half of the inputs: ci[3..6] paired with cr[4..6].  Here the
 * differences go to Te/Th/Tk and the sums to Tp/Tn/To. */
E Tp, Tn, To, Tc, Td;
Tb = ci[WS(rs, 6)];
Tc = ci[WS(rs, 5)];
Td = cr[WS(rs, 6)];
Te = Tc - Td;
Tp = Tc + Td;
{
E Tf, Tg, Ti, Tj;
Tf = ci[WS(rs, 4)];
Tg = cr[WS(rs, 5)];
Th = Tf - Tg;
Tn = Tf + Tg;
Ti = ci[WS(rs, 3)];
Tj = cr[WS(rs, 4)];
Tk = Ti - Tj;
To = Ti + Tj;
}
/* Same weight pattern as above, applied to the second half. */
Tq = FNMS(KP974927912, To, KP781831482 * Tn) - (KP433883739 * Tp);
TF = FMA(KP781831482, Tp, KP974927912 * Tn) + (KP433883739 * To);
TR = FMA(KP433883739, Tn, KP781831482 * To) - (KP974927912 * Tp);
TU = FMA(KP623489801, Tk, Tb) + FNMA(KP900968867, Th, KP222520933 * Te);
TJ = FMA(KP623489801, Te, Tb) + FNMA(KP900968867, Tk, KP222520933 * Th);
Tt = FMA(KP623489801, Th, Tb) + FNMA(KP222520933, Tk, KP900968867 * Te);
}
/* Output 0 is the plain sum and is stored without a twiddle factor. */
cr[0] = T1 + T4 + T7 + Ta;
ci[0] = Tb + Te + Th + Tk;
/* Each block below forms one (re, im) pair and stores
 * cr[k] = Wr*re - Wi*im, ci[k] = Wr*im + Wi*re. */
{
/* Output 4, twiddle (W[6], W[7]). */
E Tr, Ty, Tl, Ts;
Tr = Tm - Tq;
Ty = Tt - Tx;
Tl = W[6];
Ts = W[7];
cr[WS(rs, 4)] = FNMS(Ts, Ty, Tl * Tr);
ci[WS(rs, 4)] = FMA(Tl, Ty, Ts * Tr);
}
{
/* Output 2, twiddle (W[2], W[3]). */
E TY, T10, TX, TZ;
TY = TQ + TR;
T10 = TV + TU;
TX = W[2];
TZ = W[3];
cr[WS(rs, 2)] = FNMS(TZ, T10, TX * TY);
ci[WS(rs, 2)] = FMA(TX, T10, TZ * TY);
}
{
/* Output 3, twiddle (W[4], W[5]). */
E TA, TC, Tz, TB;
TA = Tm + Tq;
TC = Tx + Tt;
Tz = W[4];
TB = W[5];
cr[WS(rs, 3)] = FNMS(TB, TC, Tz * TA);
ci[WS(rs, 3)] = FMA(Tz, TC, TB * TA);
}
{
/* Output 6, twiddle (W[10], W[11]). */
E TM, TO, TL, TN;
TM = TE + TF;
TO = TJ - TI;
TL = W[10];
TN = W[11];
cr[WS(rs, 6)] = FNMS(TN, TO, TL * TM);
ci[WS(rs, 6)] = FMA(TL, TO, TN * TM);
}
{
/* Output 5, twiddle (W[8], W[9]). */
E TS, TW, TP, TT;
TS = TQ - TR;
TW = TU - TV;
TP = W[8];
TT = W[9];
cr[WS(rs, 5)] = FNMS(TT, TW, TP * TS);
ci[WS(rs, 5)] = FMA(TP, TW, TT * TS);
}
{
/* Output 1, twiddle (W[0], W[1]). */
E TG, TK, TD, TH;
TG = TE - TF;
TK = TI + TJ;
TD = W[0];
TH = W[1];
cr[WS(rs, 1)] = FNMS(TH, TK, TD * TG);
ci[WS(rs, 1)] = FMA(TD, TK, TH * TG);
}
}
}
}
/*
 * Twiddle program for this hb_7 variant: a single TW_FULL entry with
 * arguments (1, 7), terminated by a TW_NEXT entry.
 * NOTE(review): the meaning of TW_FULL/TW_NEXT is defined outside this
 * chunk -- presumably "all twiddle factors for a size-7 transform"; confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: the value 7 (presumably the radix), the
 * codelet name "hb_7", the twiddle program above, the genus, and the tuple
 * { 36, 24, 36, 0 }, whose first three entries equal the additions,
 * multiplications and fused multiply/adds listed in the generator's
 * operation-count comment for this variant.
 */
static const hc2hc_desc desc = { 7, "hb_7", twinstr, &GENUS, { 36, 24, 36, 0 } };
/* Registers hb_7 together with its descriptor on planner p. */
void X(codelet_hb_7) (planner *p) {
X(khc2hc_register) (p, hb_7, &desc);
}
#endif

View File

@@ -0,0 +1,373 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hb_8 -include rdft/scalar/hb.h */
/*
* This function contains 66 FP additions, 36 FP multiplications,
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
* 33 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_8 (variant generated with -fma): in-place size-8 butterfly over the
 * split arrays cr/ci, followed by a complex multiplication of outputs 1..7 by
 * twiddle factors read from W.  Per the generator command line above this is
 * the -sign 1, -dif hc2hc codelet for n = 8.
 *
 * cr, ci  data arrays, read and overwritten in place; element k is accessed
 *         as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..7.
 * W       twiddle table; 14 values are consumed per iteration, as the pairs
 *         (W[2k-2], W[2k-1]) for output k = 1..7.  The pointer is first
 *         advanced by (mb - 1) * 14.
 * rs      stride argument passed to WS().
 * mb, me  the loop runs for m = mb, ..., me - 1.
 * ms      per iteration cr advances by +ms and ci by -ms.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; these macros are not defined in this chunk --
 * confirm against the FFTW internal header.
 */
static void hb_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically 1/sqrt(2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
E T7, T1i, T1n, Tk, TD, TV, T1b, TQ, Te, T1e, T1o, T1j, TE, TF, TR;
E Tv, TW;
{
/* Inputs cr[0], cr[2], cr[4], cr[6] with ci[3], ci[1], ci[7], ci[5]:
 * pairwise sums/differences, then one more level of sums/differences. */
E T3, Tg, TC, T19, T6, Tz, Tj, T1a;
{
E T1, T2, TA, TB;
T1 = cr[0];
T2 = ci[WS(rs, 3)];
T3 = T1 + T2;
Tg = T1 - T2;
TA = ci[WS(rs, 7)];
TB = cr[WS(rs, 4)];
TC = TA + TB;
T19 = TA - TB;
}
{
E T4, T5, Th, Ti;
T4 = cr[WS(rs, 2)];
T5 = ci[WS(rs, 1)];
T6 = T4 + T5;
Tz = T4 - T5;
Th = ci[WS(rs, 5)];
Ti = cr[WS(rs, 6)];
Tj = Th + Ti;
T1a = Th - Ti;
}
T7 = T3 + T6;
T1i = T3 - T6;
T1n = T19 - T1a;
Tk = Tg - Tj;
TD = Tz + TC;
TV = TC - Tz;
T1b = T19 + T1a;
TQ = Tg + Tj;
}
{
/* Inputs cr[1], cr[3], cr[5], cr[7] with ci[2], ci[0], ci[6], ci[4],
 * combined the same way. */
E Ta, Tl, Tt, T1d, Td, Tq, To, T1c, Tp, Tu;
{
E T8, T9, Tr, Ts;
T8 = cr[WS(rs, 1)];
T9 = ci[WS(rs, 2)];
Ta = T8 + T9;
Tl = T8 - T9;
Tr = ci[WS(rs, 4)];
Ts = cr[WS(rs, 7)];
Tt = Tr + Ts;
T1d = Tr - Ts;
}
{
E Tb, Tc, Tm, Tn;
Tb = ci[0];
Tc = cr[WS(rs, 3)];
Td = Tb + Tc;
Tq = Tb - Tc;
Tm = ci[WS(rs, 6)];
Tn = cr[WS(rs, 5)];
To = Tm + Tn;
T1c = Tm - Tn;
}
Te = Ta + Td;
T1e = T1c + T1d;
T1o = Ta - Td;
T1j = T1d - T1c;
TE = Tl + To;
TF = Tq + Tt;
TR = TE + TF;
Tp = Tl - To;
Tu = Tq - Tt;
Tv = Tp + Tu;
TW = Tp - Tu;
}
/* Output 0 is stored without a twiddle factor. */
cr[0] = T7 + Te;
ci[0] = T1b + T1e;
/* Each block below forms one (re, im) pair and stores
 * cr[k] = Wr*re - Wi*im, ci[k] = Wr*im + Wi*re. */
{
/* Output 3, twiddle (W[4], W[5]). */
E TS, TX, TT, TY, TP, TU;
TS = FNMS(KP707106781, TR, TQ);
TX = FMA(KP707106781, TW, TV);
TP = W[4];
TT = TP * TS;
TY = TP * TX;
TU = W[5];
cr[WS(rs, 3)] = FNMS(TU, TX, TT);
ci[WS(rs, 3)] = FMA(TU, TS, TY);
}
{
/* Output 2, twiddle (W[2], W[3]). */
E T1s, T1v, T1t, T1w, T1r, T1u;
T1s = T1i + T1j;
T1v = T1o + T1n;
T1r = W[2];
T1t = T1r * T1s;
T1w = T1r * T1v;
T1u = W[3];
cr[WS(rs, 2)] = FNMS(T1u, T1v, T1t);
ci[WS(rs, 2)] = FMA(T1u, T1s, T1w);
}
{
/* Output 7, twiddle (W[12], W[13]). */
E T10, T13, T11, T14, TZ, T12;
T10 = FMA(KP707106781, TR, TQ);
T13 = FNMS(KP707106781, TW, TV);
TZ = W[12];
T11 = TZ * T10;
T14 = TZ * T13;
T12 = W[13];
cr[WS(rs, 7)] = FNMS(T12, T13, T11);
ci[WS(rs, 7)] = FMA(T12, T10, T14);
}
{
/* Output 4, twiddle (W[6], W[7]). */
E T1f, T15, T17, T18, T1g, T16;
T1f = T1b - T1e;
T16 = T7 - Te;
T15 = W[6];
T17 = T15 * T16;
T18 = W[7];
T1g = T18 * T16;
cr[WS(rs, 4)] = FNMS(T18, T1f, T17);
ci[WS(rs, 4)] = FMA(T15, T1f, T1g);
}
{
/* Output 6, twiddle (W[10], W[11]). */
E T1k, T1p, T1l, T1q, T1h, T1m;
T1k = T1i - T1j;
T1p = T1n - T1o;
T1h = W[10];
T1l = T1h * T1k;
T1q = T1h * T1p;
T1m = W[11];
cr[WS(rs, 6)] = FNMS(T1m, T1p, T1l);
ci[WS(rs, 6)] = FMA(T1m, T1k, T1q);
}
{
/* Outputs 5 and 1, twiddles (W[8], W[9]) and (W[0], W[1]). */
E TH, TN, TJ, TL, TM, TO, Tf, Tx, Ty, TI, TG, TK, Tw;
TG = TE - TF;
TH = FNMS(KP707106781, TG, TD);
TN = FMA(KP707106781, TG, TD);
TK = FMA(KP707106781, Tv, Tk);
TJ = W[0];
TL = TJ * TK;
TM = W[1];
TO = TM * TK;
Tw = FNMS(KP707106781, Tv, Tk);
Tf = W[8];
Tx = Tf * Tw;
Ty = W[9];
TI = Ty * Tw;
cr[WS(rs, 5)] = FNMS(Ty, TH, Tx);
ci[WS(rs, 5)] = FMA(Tf, TH, TI);
cr[WS(rs, 1)] = FNMS(TM, TN, TL);
ci[WS(rs, 1)] = FMA(TJ, TN, TO);
}
}
}
}
/*
 * Twiddle program for this hb_8 variant: a single TW_FULL entry with
 * arguments (1, 8), terminated by a TW_NEXT entry.
 * NOTE(review): the meaning of TW_FULL/TW_NEXT is defined outside this
 * chunk -- presumably "all twiddle factors for a size-8 transform"; confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: the value 8 (presumably the radix), the
 * codelet name "hb_8", the twiddle program above, the genus, and the tuple
 * { 44, 14, 22, 0 }, whose first three entries equal the additions,
 * multiplications and fused multiply/adds listed in the generator's
 * operation-count comment for this variant.
 */
static const hc2hc_desc desc = { 8, "hb_8", twinstr, &GENUS, { 44, 14, 22, 0 } };
/* Registers hb_8 together with its descriptor on planner p. */
void X(codelet_hb_8) (planner *p) {
X(khc2hc_register) (p, hb_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hb_8 -include rdft/scalar/hb.h */
/*
* This function contains 66 FP additions, 32 FP multiplications,
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
* 30 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_8 (variant generated without -fma): in-place size-8 butterfly over the
 * split arrays cr/ci, followed by a complex multiplication of outputs 1..7 by
 * twiddle factors read from W.  Per the generator command line above this is
 * the -sign 1, -dif hc2hc codelet for n = 8.
 *
 * cr, ci  data arrays, read and overwritten in place; element k is accessed
 *         as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..7.
 * W       twiddle table; 14 values are consumed per iteration, as the pairs
 *         (W[2k-2], W[2k-1]) for output k = 1..7.  The pointer is first
 *         advanced by (mb - 1) * 14.
 * rs      stride argument passed to WS().
 * mb, me  the loop runs for m = mb, ..., me - 1.
 * ms      per iteration cr advances by +ms and ci by -ms.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; these macros are not defined in this chunk --
 * confirm against the FFTW internal header.
 */
static void hb_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically 1/sqrt(2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
E TD;
{
/* Inputs cr[0], cr[2], cr[4], cr[6] with ci[3], ci[1], ci[7], ci[5]:
 * pairwise sums/differences, then one more level of sums/differences. */
E T3, TK, Tn, TL, T6, TW, Tk, TX;
{
E T1, T2, Tl, Tm;
T1 = cr[0];
T2 = ci[WS(rs, 3)];
T3 = T1 + T2;
TK = T1 - T2;
Tl = ci[WS(rs, 5)];
Tm = cr[WS(rs, 6)];
Tn = Tl - Tm;
TL = Tl + Tm;
}
{
E T4, T5, Ti, Tj;
T4 = cr[WS(rs, 2)];
T5 = ci[WS(rs, 1)];
T6 = T4 + T5;
TW = T4 - T5;
Ti = ci[WS(rs, 7)];
Tj = cr[WS(rs, 4)];
Tk = Ti - Tj;
TX = Ti + Tj;
}
T7 = T3 + T6;
T18 = TK + TL;
T1c = TX - TW;
To = Tk + Tn;
Ty = T3 - T6;
TM = TK - TL;
TY = TW + TX;
TC = Tk - Tn;
}
{
/* Inputs cr[1], cr[3], cr[5], cr[7] with ci[2], ci[0], ci[6], ci[4],
 * combined the same way. */
E Ta, TN, Tu, TR, Td, TQ, Tr, TO;
{
E T8, T9, Ts, Tt;
T8 = cr[WS(rs, 1)];
T9 = ci[WS(rs, 2)];
Ta = T8 + T9;
TN = T8 - T9;
Ts = ci[WS(rs, 4)];
Tt = cr[WS(rs, 7)];
Tu = Ts - Tt;
TR = Ts + Tt;
}
{
E Tb, Tc, Tp, Tq;
Tb = ci[0];
Tc = cr[WS(rs, 3)];
Td = Tb + Tc;
TQ = Tb - Tc;
Tp = ci[WS(rs, 6)];
Tq = cr[WS(rs, 5)];
Tr = Tp - Tq;
TO = Tp + Tq;
}
Te = Ta + Td;
TZ = TN + TO;
T10 = TQ + TR;
Tv = Tr + Tu;
Tz = Tu - Tr;
TP = TN - TO;
TS = TQ - TR;
TD = Ta - Td;
}
/* Output 0 is stored without a twiddle factor. */
cr[0] = T7 + Te;
ci[0] = To + Tv;
/* Each block below forms (re, im) pairs and stores
 * cr[k] = Wr*re - Wi*im, ci[k] = Wr*im + Wi*re. */
{
/* Output 4, twiddle (W[6], W[7]). */
E Tg, Tw, Tf, Th;
Tg = T7 - Te;
Tw = To - Tv;
Tf = W[6];
Th = W[7];
cr[WS(rs, 4)] = FNMS(Th, Tw, Tf * Tg);
ci[WS(rs, 4)] = FMA(Th, Tg, Tf * Tw);
}
{
/* Output 2, twiddle (W[2], W[3]). */
E TG, TI, TF, TH;
TG = Ty + Tz;
TI = TD + TC;
TF = W[2];
TH = W[3];
cr[WS(rs, 2)] = FNMS(TH, TI, TF * TG);
ci[WS(rs, 2)] = FMA(TF, TI, TH * TG);
}
{
/* Output 6, twiddle (W[10], W[11]). */
E TA, TE, Tx, TB;
TA = Ty - Tz;
TE = TC - TD;
Tx = W[10];
TB = W[11];
cr[WS(rs, 6)] = FNMS(TB, TE, Tx * TA);
ci[WS(rs, 6)] = FMA(Tx, TE, TB * TA);
}
{
/* Outputs 3 and 7, twiddles (W[4], W[5]) and (W[12], W[13]). */
E T1a, T1g, T1e, T1i, T19, T1d;
T19 = KP707106781 * (TZ + T10);
T1a = T18 - T19;
T1g = T18 + T19;
T1d = KP707106781 * (TP - TS);
T1e = T1c + T1d;
T1i = T1c - T1d;
{
E T17, T1b, T1f, T1h;
T17 = W[4];
T1b = W[5];
cr[WS(rs, 3)] = FNMS(T1b, T1e, T17 * T1a);
ci[WS(rs, 3)] = FMA(T17, T1e, T1b * T1a);
T1f = W[12];
T1h = W[13];
cr[WS(rs, 7)] = FNMS(T1h, T1i, T1f * T1g);
ci[WS(rs, 7)] = FMA(T1f, T1i, T1h * T1g);
}
}
{
/* Outputs 5 and 1, twiddles (W[8], W[9]) and (W[0], W[1]). */
E TU, T14, T12, T16, TT, T11;
TT = KP707106781 * (TP + TS);
TU = TM - TT;
T14 = TM + TT;
T11 = KP707106781 * (TZ - T10);
T12 = TY - T11;
T16 = TY + T11;
{
E TJ, TV, T13, T15;
TJ = W[8];
TV = W[9];
cr[WS(rs, 5)] = FNMS(TV, T12, TJ * TU);
ci[WS(rs, 5)] = FMA(TV, TU, TJ * T12);
T13 = W[0];
T15 = W[1];
cr[WS(rs, 1)] = FNMS(T15, T16, T13 * T14);
ci[WS(rs, 1)] = FMA(T15, T14, T13 * T16);
}
}
}
}
}
/*
 * Twiddle program for this hb_8 variant: a single TW_FULL entry with
 * arguments (1, 8), terminated by a TW_NEXT entry.
 * NOTE(review): the meaning of TW_FULL/TW_NEXT is defined outside this
 * chunk -- presumably "all twiddle factors for a size-8 transform"; confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: the value 8 (presumably the radix), the
 * codelet name "hb_8", the twiddle program above, the genus, and the tuple
 * { 52, 18, 14, 0 }, whose first three entries equal the additions,
 * multiplications and fused multiply/adds listed in the generator's
 * operation-count comment for this variant.
 */
static const hc2hc_desc desc = { 8, "hb_8", twinstr, &GENUS, { 52, 18, 14, 0 } };
/* Registers hb_8 together with its descriptor on planner p. */
void X(codelet_hb_8) (planner *p) {
X(khc2hc_register) (p, hb_8, &desc);
}
#endif

View File

@@ -0,0 +1,497 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:50 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -dif -name hb_9 -include rdft/scalar/hb.h */
/*
* This function contains 96 FP additions, 88 FP multiplications,
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
* 53 stack variables, 10 constants, and 36 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_9 (variant generated with -fma): in-place size-9 butterfly over the
 * split arrays cr/ci, followed by a complex multiplication of outputs 1..8 by
 * twiddle factors read from W.  Per the generator command line above this is
 * the -sign 1, -dif hc2hc codelet for n = 9.
 *
 * cr, ci  data arrays, read and overwritten in place; element k is accessed
 *         as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..8.
 * W       twiddle table; 16 values are consumed per iteration, as the pairs
 *         (W[2k-2], W[2k-1]) for output k = 1..8.  The pointer is first
 *         advanced by (mb - 1) * 16.
 * rs      stride argument passed to WS().
 * mb, me  the loop runs for m = mb, ..., me - 1.
 * ms      per iteration cr advances by +ms and ci by -ms.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c; these macros are not
 * defined in this chunk -- confirm against the FFTW internal header.
 */
static void hb_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: 0.9848... = cos(pi/18) and 0.4924... is half of it;
 * 0.8390... = tan(2*pi/9), 0.1763... = tan(pi/18), 0.3639... = tan(pi/9),
 * 0.8660... = sqrt(3)/2.  The remaining constants are generator-derived
 * ratios used by the FMA factorisation. */
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
E T5, Tl, TQ, T1y, T1b, T1J, Tg, TE, Tw, Tz, T1E, T1L, T1B, T1K, T14;
E T1d, TX, T1c;
{
/* Size-3 combination of cr[0], cr[3], ci[2] and of ci[8], ci[5], cr[6]. */
E T1, Th, T4, T1a, Tk, TP, TO, T19;
T1 = cr[0];
Th = ci[WS(rs, 8)];
{
E T2, T3, Ti, Tj;
T2 = cr[WS(rs, 3)];
T3 = ci[WS(rs, 2)];
T4 = T2 + T3;
T1a = T2 - T3;
Ti = ci[WS(rs, 5)];
Tj = cr[WS(rs, 6)];
Tk = Ti - Tj;
TP = Ti + Tj;
}
T5 = T1 + T4;
Tl = Th + Tk;
TO = FNMS(KP500000000, T4, T1);
TQ = FNMS(KP866025403, TP, TO);
T1y = FMA(KP866025403, TP, TO);
T19 = FNMS(KP500000000, Tk, Th);
T1b = FMA(KP866025403, T1a, T19);
T1J = FNMS(KP866025403, T1a, T19);
}
{
/* Size-3 combinations of the remaining inputs (cr[1], cr[4], ci[1],
 * ci[7], ci[4], cr[7] and cr[2], ci[3], ci[0], ci[6], cr[5], cr[8]),
 * followed by the rotations that feed the output stages below. */
E T6, T9, TY, T12, Tm, Tp, TZ, T11, Tb, Te, TS, TU, Tr, Tu, TR;
E TV;
{
E T7, T8, Tn, To;
T6 = cr[WS(rs, 1)];
T7 = cr[WS(rs, 4)];
T8 = ci[WS(rs, 1)];
T9 = T7 + T8;
TY = FNMS(KP500000000, T9, T6);
T12 = T7 - T8;
Tm = ci[WS(rs, 7)];
Tn = ci[WS(rs, 4)];
To = cr[WS(rs, 7)];
Tp = Tn - To;
TZ = Tn + To;
T11 = FMS(KP500000000, Tp, Tm);
}
{
E Tc, Td, Ts, Tt;
Tb = cr[WS(rs, 2)];
Tc = ci[WS(rs, 3)];
Td = ci[0];
Te = Tc + Td;
TS = Td - Tc;
TU = FNMS(KP500000000, Te, Tb);
Tr = ci[WS(rs, 6)];
Ts = cr[WS(rs, 5)];
Tt = cr[WS(rs, 8)];
Tu = Ts + Tt;
TR = FMA(KP500000000, Tu, Tr);
TV = Ts - Tt;
}
{
E Ta, Tf, T1z, T1A;
Ta = T6 + T9;
Tf = Tb + Te;
Tg = Ta + Tf;
TE = Ta - Tf;
{
E Tq, Tv, T1C, T1D;
Tq = Tm + Tp;
Tv = Tr - Tu;
Tw = Tq + Tv;
Tz = Tv - Tq;
T1C = FNMS(KP866025403, TV, TU);
T1D = FMA(KP866025403, TS, TR);
T1E = FMA(KP363970234, T1D, T1C);
T1L = FNMS(KP363970234, T1C, T1D);
}
T1z = FMA(KP866025403, T12, T11);
T1A = FMA(KP866025403, TZ, TY);
T1B = FMA(KP176326980, T1A, T1z);
T1K = FNMS(KP176326980, T1z, T1A);
{
E T10, T13, TT, TW;
T10 = FNMS(KP866025403, TZ, TY);
T13 = FNMS(KP866025403, T12, T11);
T14 = FMA(KP839099631, T13, T10);
T1d = FNMS(KP839099631, T10, T13);
TT = FNMS(KP866025403, TS, TR);
TW = FMA(KP866025403, TV, TU);
TX = FNMS(KP176326980, TW, TT);
T1c = FMA(KP176326980, TT, TW);
}
}
}
/* Output 0 is stored without a twiddle factor. */
cr[0] = T5 + Tg;
ci[0] = Tl + Tw;
/* Each block below forms (re, im) pairs and stores
 * cr[k] = Wr*re - Wi*im, ci[k] = Wr*im + Wi*re. */
{
/* Outputs 6 and 3, twiddles (W[10], W[11]) and (W[4], W[5]). */
E TA, TI, TF, TL, Ty, TD;
Ty = FNMS(KP500000000, Tg, T5);
TA = FNMS(KP866025403, Tz, Ty);
TI = FMA(KP866025403, Tz, Ty);
TD = FNMS(KP500000000, Tw, Tl);
TF = FNMS(KP866025403, TE, TD);
TL = FMA(KP866025403, TE, TD);
{
E TB, TG, Tx, TC;
Tx = W[10];
TB = Tx * TA;
TG = Tx * TF;
TC = W[11];
cr[WS(rs, 6)] = FNMS(TC, TF, TB);
ci[WS(rs, 6)] = FMA(TC, TA, TG);
}
{
E TJ, TM, TH, TK;
TH = W[4];
TJ = TH * TI;
TM = TH * TL;
TK = W[5];
cr[WS(rs, 3)] = FNMS(TK, TL, TJ);
ci[WS(rs, 3)] = FMA(TK, TI, TM);
}
}
{
/* Outputs 1, 4 and 7, twiddles (W[0], W[1]), (W[6], W[7]), (W[12], W[13]). */
E T16, T1s, T1k, T1f, T1v, T1p;
{
E T1j, T15, T1i, T1o, T1e, T1n;
T1j = FMA(KP777861913, T1d, T1c);
T15 = FNMS(KP777861913, T14, TX);
T1i = FMA(KP492403876, T15, TQ);
T16 = FNMS(KP984807753, T15, TQ);
T1s = FMA(KP852868531, T1j, T1i);
T1k = FNMS(KP852868531, T1j, T1i);
T1o = FMA(KP777861913, T14, TX);
T1e = FNMS(KP777861913, T1d, T1c);
T1n = FNMS(KP492403876, T1e, T1b);
T1f = FMA(KP984807753, T1e, T1b);
T1v = FMA(KP852868531, T1o, T1n);
T1p = FNMS(KP852868531, T1o, T1n);
}
{
E TN, T17, T18, T1g;
TN = W[0];
T17 = TN * T16;
T18 = W[1];
T1g = T18 * T16;
cr[WS(rs, 1)] = FNMS(T18, T1f, T17);
ci[WS(rs, 1)] = FMA(TN, T1f, T1g);
}
{
E T1t, T1w, T1r, T1u;
T1r = W[6];
T1t = T1r * T1s;
T1w = T1r * T1v;
T1u = W[7];
cr[WS(rs, 4)] = FNMS(T1u, T1v, T1t);
ci[WS(rs, 4)] = FMA(T1u, T1s, T1w);
}
{
E T1l, T1q, T1h, T1m;
T1h = W[12];
T1l = T1h * T1k;
T1q = T1h * T1p;
T1m = W[13];
cr[WS(rs, 7)] = FNMS(T1m, T1p, T1l);
ci[WS(rs, 7)] = FMA(T1m, T1k, T1q);
}
}
{
/* Outputs 2, 8 and 5, twiddles (W[2], W[3]), (W[14], W[15]), (W[8], W[9]). */
E T1W, T1N, T1V, T1G, T20, T1S;
T1W = FMA(KP954188894, T1E, T1B);
{
E T1M, T1R, T1F, T1Q;
T1M = FNMS(KP954188894, T1L, T1K);
T1N = FMA(KP984807753, T1M, T1J);
T1V = FNMS(KP492403876, T1M, T1J);
T1R = FMA(KP954188894, T1L, T1K);
T1F = FNMS(KP954188894, T1E, T1B);
T1Q = FNMS(KP492403876, T1F, T1y);
T1G = FMA(KP984807753, T1F, T1y);
T20 = FMA(KP852868531, T1R, T1Q);
T1S = FNMS(KP852868531, T1R, T1Q);
}
{
E T1H, T1O, T1x, T1I;
T1x = W[2];
T1H = T1x * T1G;
T1O = T1x * T1N;
T1I = W[3];
cr[WS(rs, 2)] = FNMS(T1I, T1N, T1H);
ci[WS(rs, 2)] = FMA(T1I, T1G, T1O);
}
{
E T23, T22, T24, T1Z, T21;
T23 = FNMS(KP852868531, T1W, T1V);
T22 = W[15];
T24 = T22 * T20;
T1Z = W[14];
T21 = T1Z * T20;
cr[WS(rs, 8)] = FNMS(T22, T23, T21);
ci[WS(rs, 8)] = FMA(T1Z, T23, T24);
}
{
E T1X, T1U, T1Y, T1P, T1T;
T1X = FMA(KP852868531, T1W, T1V);
T1U = W[9];
T1Y = T1U * T1S;
T1P = W[8];
T1T = T1P * T1S;
cr[WS(rs, 5)] = FNMS(T1U, T1X, T1T);
ci[WS(rs, 5)] = FMA(T1P, T1X, T1Y);
}
}
}
}
}
/*
 * Twiddle program for this hb_9 variant: a single TW_FULL entry with
 * arguments (1, 9), terminated by a TW_NEXT entry.
 * NOTE(review): the meaning of TW_FULL/TW_NEXT is defined outside this
 * chunk -- presumably "all twiddle factors for a size-9 transform"; confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: the value 9 (presumably the radix), the
 * codelet name "hb_9", the twiddle program above, the genus, and the tuple
 * { 24, 16, 72, 0 }, whose first three entries equal the additions,
 * multiplications and fused multiply/adds listed in the generator's
 * operation-count comment for this variant.
 */
static const hc2hc_desc desc = { 9, "hb_9", twinstr, &GENUS, { 24, 16, 72, 0 } };
/* Registers hb_9 together with its descriptor on planner p. */
void X(codelet_hb_9) (planner *p) {
X(khc2hc_register) (p, hb_9, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -dif -name hb_9 -include rdft/scalar/hb.h */
/*
* This function contains 96 FP additions, 72 FP multiplications,
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
* 53 stack variables, 8 constants, and 36 memory accesses
*/
#include "rdft/scalar/hb.h"
/*
 * hb_9 (variant generated without -fma): in-place size-9 butterfly over the
 * split arrays cr/ci, followed by a complex multiplication of outputs 1..8 by
 * twiddle factors read from W.  Per the generator command line above this is
 * the -sign 1, -dif hc2hc codelet for n = 9.
 *
 * cr, ci  data arrays, read and overwritten in place; element k is accessed
 *         as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..8.
 * W       twiddle table; 16 values are consumed per iteration, as the pairs
 *         (W[2k-2], W[2k-1]) for output k = 1..8.  The pointer is first
 *         advanced by (mb - 1) * 16.
 * rs      stride argument passed to WS().
 * mb, me  the loop runs for m = mb, ..., me - 1.
 * ms      per iteration cr advances by +ms and ci by -ms.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; these macros are not defined in this chunk --
 * confirm against the FFTW internal header.
 */
static void hb_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically, with 2*pi/9 = 40 degrees: 0.9848... = sin(80 deg),
 * 0.1736... = cos(80 deg), 0.3420... = sin(20 deg), 0.9396... = cos(20 deg),
 * 0.6427... = sin(40 deg), 0.7660... = cos(40 deg), 0.8660... = sqrt(3)/2. */
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
E T5, Tl, TM, T1o, T16, T1y, Ta, Tf, Tg, Tq, Tv, Tw, TT, T17, T1u;
E T1A, T1r, T1z, T10, T18;
{
/* Size-3 combination of cr[0], cr[3], ci[2] and of ci[8], ci[5], cr[6]. */
E T1, Th, T4, T14, Tk, TL, TK, T15;
T1 = cr[0];
Th = ci[WS(rs, 8)];
{
E T2, T3, Ti, Tj;
T2 = cr[WS(rs, 3)];
T3 = ci[WS(rs, 2)];
T4 = T2 + T3;
T14 = KP866025403 * (T2 - T3);
Ti = ci[WS(rs, 5)];
Tj = cr[WS(rs, 6)];
Tk = Ti - Tj;
TL = KP866025403 * (Ti + Tj);
}
T5 = T1 + T4;
Tl = Th + Tk;
TK = FNMS(KP500000000, T4, T1);
TM = TK - TL;
T1o = TK + TL;
T15 = FNMS(KP500000000, Tk, Th);
T16 = T14 + T15;
T1y = T15 - T14;
}
{
/* Size-3 combinations of the remaining inputs (cr[1], cr[4], ci[1],
 * ci[7], ci[4], cr[7] and cr[2], ci[3], ci[0], ci[6], cr[5], cr[8]),
 * followed by the rotations that feed the output stages below. */
E T6, T9, TN, TQ, Tm, Tp, TO, TR, Tb, Te, TU, TX, Tr, Tu, TV;
E TY;
{
E T7, T8, Tn, To;
T6 = cr[WS(rs, 1)];
T7 = cr[WS(rs, 4)];
T8 = ci[WS(rs, 1)];
T9 = T7 + T8;
TN = FNMS(KP500000000, T9, T6);
TQ = KP866025403 * (T7 - T8);
Tm = ci[WS(rs, 7)];
Tn = ci[WS(rs, 4)];
To = cr[WS(rs, 7)];
Tp = Tn - To;
TO = KP866025403 * (Tn + To);
TR = FNMS(KP500000000, Tp, Tm);
}
{
E Tc, Td, Ts, Tt;
Tb = cr[WS(rs, 2)];
Tc = ci[WS(rs, 3)];
Td = ci[0];
Te = Tc + Td;
TU = FNMS(KP500000000, Te, Tb);
TX = KP866025403 * (Tc - Td);
Tr = ci[WS(rs, 6)];
Ts = cr[WS(rs, 5)];
Tt = cr[WS(rs, 8)];
Tu = Ts + Tt;
TV = KP866025403 * (Ts - Tt);
TY = FMA(KP500000000, Tu, Tr);
}
{
E TP, TS, T1s, T1t;
Ta = T6 + T9;
Tf = Tb + Te;
Tg = Ta + Tf;
Tq = Tm + Tp;
Tv = Tr - Tu;
Tw = Tq + Tv;
TP = TN - TO;
TS = TQ + TR;
TT = FNMS(KP642787609, TS, KP766044443 * TP);
T17 = FMA(KP766044443, TS, KP642787609 * TP);
T1s = TU - TV;
T1t = TY - TX;
T1u = FMA(KP939692620, T1s, KP342020143 * T1t);
T1A = FNMS(KP939692620, T1t, KP342020143 * T1s);
{
E T1p, T1q, TW, TZ;
T1p = TN + TO;
T1q = TR - TQ;
T1r = FNMS(KP984807753, T1q, KP173648177 * T1p);
T1z = FMA(KP173648177, T1q, KP984807753 * T1p);
TW = TU + TV;
TZ = TX + TY;
T10 = FNMS(KP984807753, TZ, KP173648177 * TW);
T18 = FMA(KP984807753, TW, KP173648177 * TZ);
}
}
}
/* Output 0 is stored without a twiddle factor. */
cr[0] = T5 + Tg;
ci[0] = Tl + Tw;
/* Each block below forms (re, im) pairs and stores
 * cr[k] = Wr*re - Wi*im, ci[k] = Wr*im + Wi*re. */
{
/* Outputs 6 and 3, twiddles (W[10], W[11]) and (W[4], W[5]). */
E TA, TG, TE, TI;
{
E Ty, Tz, TC, TD;
Ty = FNMS(KP500000000, Tg, T5);
Tz = KP866025403 * (Tv - Tq);
TA = Ty - Tz;
TG = Ty + Tz;
TC = FNMS(KP500000000, Tw, Tl);
TD = KP866025403 * (Ta - Tf);
TE = TC - TD;
TI = TD + TC;
}
{
E Tx, TB, TF, TH;
Tx = W[10];
TB = W[11];
cr[WS(rs, 6)] = FNMS(TB, TE, Tx * TA);
ci[WS(rs, 6)] = FMA(Tx, TE, TB * TA);
TF = W[4];
TH = W[5];
cr[WS(rs, 3)] = FNMS(TH, TI, TF * TG);
ci[WS(rs, 3)] = FMA(TF, TI, TH * TG);
}
}
{
/* Outputs 1, 4 and 7, twiddles (W[0], W[1]), (W[6], W[7]), (W[12], W[13]). */
E T1d, T1h, T12, T1c, T1a, T1g, T11, T19, TJ, T13;
T1d = KP866025403 * (T18 - T17);
T1h = KP866025403 * (TT - T10);
T11 = TT + T10;
T12 = TM + T11;
T1c = FNMS(KP500000000, T11, TM);
T19 = T17 + T18;
T1a = T16 + T19;
T1g = FNMS(KP500000000, T19, T16);
TJ = W[0];
T13 = W[1];
cr[WS(rs, 1)] = FNMS(T13, T1a, TJ * T12);
ci[WS(rs, 1)] = FMA(T13, T12, TJ * T1a);
{
E T1k, T1m, T1j, T1l;
T1k = T1c + T1d;
T1m = T1h + T1g;
T1j = W[6];
T1l = W[7];
cr[WS(rs, 4)] = FNMS(T1l, T1m, T1j * T1k);
ci[WS(rs, 4)] = FMA(T1j, T1m, T1l * T1k);
}
{
E T1e, T1i, T1b, T1f;
T1e = T1c - T1d;
T1i = T1g - T1h;
T1b = W[12];
T1f = W[13];
cr[WS(rs, 7)] = FNMS(T1f, T1i, T1b * T1e);
ci[WS(rs, 7)] = FMA(T1b, T1i, T1f * T1e);
}
}
{
/* Outputs 2, 5 and 8, twiddles (W[2], W[3]), (W[8], W[9]), (W[14], W[15]). */
E T1F, T1J, T1w, T1E, T1C, T1I, T1v, T1B, T1n, T1x;
T1F = KP866025403 * (T1A - T1z);
T1J = KP866025403 * (T1r + T1u);
T1v = T1r - T1u;
T1w = T1o + T1v;
T1E = FNMS(KP500000000, T1v, T1o);
T1B = T1z + T1A;
T1C = T1y + T1B;
T1I = FNMS(KP500000000, T1B, T1y);
T1n = W[2];
T1x = W[3];
cr[WS(rs, 2)] = FNMS(T1x, T1C, T1n * T1w);
ci[WS(rs, 2)] = FMA(T1n, T1C, T1x * T1w);
{
E T1M, T1O, T1L, T1N;
T1M = T1F + T1E;
T1O = T1I + T1J;
T1L = W[8];
T1N = W[9];
cr[WS(rs, 5)] = FNMS(T1N, T1O, T1L * T1M);
ci[WS(rs, 5)] = FMA(T1N, T1M, T1L * T1O);
}
{
E T1G, T1K, T1D, T1H;
T1G = T1E - T1F;
T1K = T1I - T1J;
T1D = W[14];
T1H = W[15];
cr[WS(rs, 8)] = FNMS(T1H, T1K, T1D * T1G);
ci[WS(rs, 8)] = FMA(T1H, T1G, T1D * T1K);
}
}
}
}
}
/*
 * Twiddle program for this hb_9 variant: a single TW_FULL entry with
 * arguments (1, 9), terminated by a TW_NEXT entry.
 * NOTE(review): the meaning of TW_FULL/TW_NEXT is defined outside this
 * chunk -- presumably "all twiddle factors for a size-9 transform"; confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: the value 9 (presumably the radix), the
 * codelet name "hb_9", the twiddle program above, the genus, and the tuple
 * { 60, 36, 36, 0 }, whose first three entries equal the additions,
 * multiplications and fused multiply/adds listed in the generator's
 * operation-count comment for this variant.
 */
static const hc2hc_desc desc = { 9, "hb_9", twinstr, &GENUS, { 60, 36, 36, 0 } };
/* Registers hb_9 together with its descriptor on planner p. */
void X(codelet_hb_9) (planner *p) {
X(khc2hc_register) (p, hb_9, &desc);
}
#endif

View File

@@ -0,0 +1,858 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:09 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include rdft/scalar/hc2cb.h */
/*
* This function contains 196 FP additions, 134 FP multiplications,
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
* 93 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb2_16 (variant generated with -fma): in-place size-16 butterfly over
 * the four arrays Rp/Ip/Rm/Im, followed by twiddle multiplication of every
 * output except the (Rp[0], Rm[0]) pair.  Per the generator command line
 * above this is the -sign 1, -dif hc2c codelet for n = 16 built with
 * -twiddle-log3 -precompute-twiddles.
 *
 * Rp, Ip, Rm, Im  data arrays, read and overwritten in place; element k is
 *                 accessed as X[WS(rs, k)] for k = 0..7.
 * W               twiddle table; only W[0..7] are read per iteration and all
 *                 other twiddle factors are derived from them by products.
 *                 The pointer is first advanced by (mb - 1) * 8.
 * rs              stride argument passed to WS().
 * mb, me          the loop runs for m = mb, ..., me - 1.
 * ms              per iteration Rp and Ip advance by +ms, Rm and Im by -ms.
 *
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; these macros are not defined in this chunk --
 * confirm against the FFTW internal header.
 */
static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: 0.9238... = cos(pi/8), 0.7071... = 1/sqrt(2),
 * 0.4142... = tan(pi/8) = sqrt(2) - 1. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
E Tv, Tw, T2z, T2C, TB, TF, Ty, Tz, T1V, TA, T2G, T3Q, T3C, T3g, T3L;
E T30, T3m, T3z, T3w, T3s, T1X, T1Y, T2u, T2c, T2p, TE, TG, T1G, T1o, T1D;
{
/* Twiddle precomputation.  The four stored pairs are (Tv, Ty) = W[0..1],
 * (Tw, Tz) = W[2..3], (TB, TF) = W[4..5] and (T2z, T2C) = W[6..7].
 * Every other pair is a complex product or conjugate product of these,
 * e.g. TA + i*TE = (Tv + i*Ty) * (Tw + i*Tz) and
 * T1V + i*T1X = (Tw + i*Tz) * conj(Tv + i*Ty). */
E T3f, T3l, T2F, T3r, T2Z, T3v, TD, Tx;
Tv = W[0];
Tw = W[2];
Tx = Tv * Tw;
T2z = W[6];
T3f = Tv * T2z;
T2C = W[7];
T3l = Tv * T2C;
TB = W[4];
T2F = Tv * TB;
T3r = Tw * TB;
TF = W[5];
T2Z = Tv * TF;
T3v = Tw * TF;
Ty = W[1];
Tz = W[3];
TD = Tv * Tz;
T1V = FMA(Ty, Tz, Tx);
TA = FNMS(Ty, Tz, Tx);
T2G = FNMS(Ty, TF, T2F);
T3Q = FMA(Tz, TB, T3v);
T3C = FNMS(Ty, TB, T2Z);
T3g = FMA(Ty, T2C, T3f);
T3L = FNMS(Tz, TF, T3r);
T30 = FMA(Ty, TB, T2Z);
T3m = FNMS(Ty, T2z, T3l);
T3z = FMA(Ty, TF, T2F);
T3w = FNMS(Tz, TB, T3v);
T3s = FMA(Tz, TF, T3r);
{
/* Second-level products built on the derived pairs (T1V, T1X) and
 * (TA, TE) combined with (TB, TF). */
E T1W, T2b, TC, T1n;
T1W = T1V * TB;
T2b = T1V * TF;
T1X = FNMS(Ty, Tw, TD);
T1Y = FNMS(T1X, TF, T1W);
T2u = FNMS(T1X, TB, T2b);
T2c = FMA(T1X, TB, T2b);
T2p = FMA(T1X, TF, T1W);
TC = TA * TB;
T1n = TA * TF;
TE = FMA(Ty, Tw, TD);
TG = FNMS(TE, TF, TC);
T1G = FNMS(TE, TB, T1n);
T1o = FMA(TE, TB, T1n);
T1D = FMA(TE, TF, TC);
}
}
{
E TL, T1Z, T2d, T1t, T31, T34, T3n, T3D, T3E, T3R, T1w, T20, Tf, T3M, T2L;
E T3h, TW, T2e, T3G, T3H, T3N, T2Q, T36, T2V, T37, Tu, T3S, T18, T1z, T24;
E T2g, T27, T2h, T1j, T1y;
{
/* Even-indexed inputs (elements 0, 2, 4, 6 of Rp/Ip paired with elements
 * 7, 5, 3, 1 of Rm/Im): pairwise sums/differences, then a second level. */
E T3, TH, T1s, T32, T6, T1p, TK, T33, Ta, TM, TP, T2J, Td, TR, TU;
E T2I;
{
E T1, T2, T1q, T1r;
T1 = Rp[0];
T2 = Rm[WS(rs, 7)];
T3 = T1 + T2;
TH = T1 - T2;
T1q = Ip[0];
T1r = Im[WS(rs, 7)];
T1s = T1q + T1r;
T32 = T1q - T1r;
}
{
E T4, T5, TI, TJ;
T4 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 3)];
T6 = T4 + T5;
T1p = T4 - T5;
TI = Ip[WS(rs, 4)];
TJ = Im[WS(rs, 3)];
TK = TI + TJ;
T33 = TI - TJ;
}
{
E T8, T9, TN, TO;
T8 = Rp[WS(rs, 2)];
T9 = Rm[WS(rs, 5)];
Ta = T8 + T9;
TM = T8 - T9;
TN = Ip[WS(rs, 2)];
TO = Im[WS(rs, 5)];
TP = TN + TO;
T2J = TN - TO;
}
{
E Tb, Tc, TS, TT;
Tb = Rm[WS(rs, 1)];
Tc = Rp[WS(rs, 6)];
Td = Tb + Tc;
TR = Tb - Tc;
TS = Ip[WS(rs, 6)];
TT = Im[WS(rs, 1)];
TU = TS + TT;
T2I = TS - TT;
}
TL = TH - TK;
T1Z = TH + TK;
T2d = T1s - T1p;
T1t = T1p + T1s;
T31 = Ta - Td;
T34 = T32 - T33;
T3n = T34 - T31;
{
E T1u, T1v, T7, Te;
T3D = T32 + T33;
T3E = T2J + T2I;
T3R = T3D - T3E;
T1u = TM + TP;
T1v = TR + TU;
T1w = T1u - T1v;
T20 = T1u + T1v;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
T3M = T7 - Te;
{
E T2H, T2K, TQ, TV;
T2H = T3 - T6;
T2K = T2I - T2J;
T2L = T2H + T2K;
T3h = T2H - T2K;
TQ = TM - TP;
TV = TR - TU;
TW = TQ + TV;
T2e = TQ - TV;
}
}
}
{
/* Odd-indexed inputs (elements 1, 3, 5, 7 of Rp/Ip paired with elements
 * 6, 4, 2, 0 of Rm/Im), combined the same way and then rotated with the
 * tan(pi/8) constant. */
E Ti, T1e, T1c, T2N, Tl, T19, T1h, T2O, Tp, T13, T11, T2S, Ts, TY, T16;
E T2T, T2M, T2P;
{
E Tg, Th, T1a, T1b;
Tg = Rp[WS(rs, 1)];
Th = Rm[WS(rs, 6)];
Ti = Tg + Th;
T1e = Tg - Th;
T1a = Ip[WS(rs, 1)];
T1b = Im[WS(rs, 6)];
T1c = T1a + T1b;
T2N = T1a - T1b;
}
{
E Tj, Tk, T1f, T1g;
Tj = Rp[WS(rs, 5)];
Tk = Rm[WS(rs, 2)];
Tl = Tj + Tk;
T19 = Tj - Tk;
T1f = Ip[WS(rs, 5)];
T1g = Im[WS(rs, 2)];
T1h = T1f + T1g;
T2O = T1f - T1g;
}
{
E Tn, To, TZ, T10;
Tn = Rm[0];
To = Rp[WS(rs, 7)];
Tp = Tn + To;
T13 = Tn - To;
TZ = Ip[WS(rs, 7)];
T10 = Im[0];
T11 = TZ + T10;
T2S = TZ - T10;
}
{
E Tq, Tr, T14, T15;
Tq = Rp[WS(rs, 3)];
Tr = Rm[WS(rs, 4)];
Ts = Tq + Tr;
TY = Tq - Tr;
T14 = Ip[WS(rs, 3)];
T15 = Im[WS(rs, 4)];
T16 = T14 + T15;
T2T = T14 - T15;
}
T3G = T2N + T2O;
T3H = T2S + T2T;
T3N = T3H - T3G;
T2M = Ti - Tl;
T2P = T2N - T2O;
T2Q = T2M - T2P;
T36 = T2M + T2P;
{
E T2R, T2U, Tm, Tt;
T2R = Tp - Ts;
T2U = T2S - T2T;
T2V = T2R + T2U;
T37 = T2U - T2R;
Tm = Ti + Tl;
Tt = Tp + Ts;
Tu = Tm + Tt;
T3S = Tm - Tt;
}
{
E T12, T17, T22, T23;
T12 = TY - T11;
T17 = T13 - T16;
T18 = FNMS(KP414213562, T17, T12);
T1z = FMA(KP414213562, T12, T17);
T22 = T1c - T19;
T23 = T1e + T1h;
T24 = FNMS(KP414213562, T23, T22);
T2g = FMA(KP414213562, T22, T23);
}
{
E T25, T26, T1d, T1i;
T25 = TY + T11;
T26 = T13 + T16;
T27 = FNMS(KP414213562, T26, T25);
T2h = FMA(KP414213562, T25, T26);
T1d = T19 + T1c;
T1i = T1e - T1h;
T1j = FMA(KP414213562, T1i, T1d);
T1y = FNMS(KP414213562, T1d, T1i);
}
}
/* (Rp[0], Rm[0]) is the only output pair stored without a twiddle
 * factor; Rm[0] is written inside the next block. */
Rp[0] = Tf + Tu;
/* In every block below a pair (re, im) is multiplied by a twiddle
 * (Wr, Wi): first array gets Wr*re - Wi*im, second gets Wr*im + Wi*re. */
{
/* Rm[0], and (Rp[4], Rm[4]) with twiddle (T3z, T3C). */
E T3B, T3K, T3F, T3I, T3J, T3A;
T3A = Tf - Tu;
T3B = T3z * T3A;
T3K = T3C * T3A;
T3F = T3D + T3E;
T3I = T3G + T3H;
T3J = T3F - T3I;
Rm[0] = T3F + T3I;
Rm[WS(rs, 4)] = FMA(T3z, T3J, T3K);
Rp[WS(rs, 4)] = FNMS(T3C, T3J, T3B);
}
{
/* (Rp[6], Rm[6]) with twiddle (T3L, T3Q). */
E T3O, T3P, T3T, T3U;
T3O = T3M - T3N;
T3P = T3L * T3O;
T3T = T3R - T3S;
T3U = T3L * T3T;
Rp[WS(rs, 6)] = FNMS(T3Q, T3T, T3P);
Rm[WS(rs, 6)] = FMA(T3Q, T3O, T3U);
}
{
/* (Rp[2], Rm[2]) with twiddle (TA, TE). */
E T3V, T3W, T3X, T3Y;
T3V = T3M + T3N;
T3W = TA * T3V;
T3X = T3S + T3R;
T3Y = TA * T3X;
Rp[WS(rs, 2)] = FNMS(TE, T3X, T3W);
Rm[WS(rs, 2)] = FMA(TE, T3V, T3Y);
}
{
/* (Rp[7], Rm[7]) with twiddle (T3g, T3m) and (Rp[3], Rm[3]) with
 * twiddle (T3s, T3w). */
E T3j, T3t, T3p, T3x, T3i, T3o;
T3i = T37 - T36;
T3j = FNMS(KP707106781, T3i, T3h);
T3t = FMA(KP707106781, T3i, T3h);
T3o = T2Q - T2V;
T3p = FNMS(KP707106781, T3o, T3n);
T3x = FMA(KP707106781, T3o, T3n);
{
E T3k, T3q, T3u, T3y;
T3k = T3g * T3j;
Rp[WS(rs, 7)] = FNMS(T3m, T3p, T3k);
T3q = T3g * T3p;
Rm[WS(rs, 7)] = FMA(T3m, T3j, T3q);
T3u = T3s * T3t;
Rp[WS(rs, 3)] = FNMS(T3w, T3x, T3u);
T3y = T3s * T3x;
Rm[WS(rs, 3)] = FMA(T3w, T3t, T3y);
}
}
{
/* (Rp[5], Rm[5]) with twiddle (T2G, T30) and (Rp[1], Rm[1]) with
 * twiddle (T1V, T1X). */
E T2X, T3b, T39, T3d, T2W, T35, T38;
T2W = T2Q + T2V;
T2X = FNMS(KP707106781, T2W, T2L);
T3b = FMA(KP707106781, T2W, T2L);
T35 = T31 + T34;
T38 = T36 + T37;
T39 = FNMS(KP707106781, T38, T35);
T3d = FMA(KP707106781, T38, T35);
{
E T2Y, T3a, T3c, T3e;
T2Y = T2G * T2X;
Rp[WS(rs, 5)] = FNMS(T30, T39, T2Y);
T3a = T30 * T2X;
Rm[WS(rs, 5)] = FMA(T2G, T39, T3a);
T3c = T1V * T3b;
Rp[WS(rs, 1)] = FNMS(T1X, T3d, T3c);
T3e = T1X * T3b;
Rm[WS(rs, 1)] = FMA(T1V, T3d, T3e);
}
}
{
/* (Ip[5], Im[5]) with twiddle (T1Y, T2c) and (Ip[1], Im[1]) with
 * twiddle (Tw, Tz). */
E T29, T2l, T2j, T2n;
{
E T21, T28, T2f, T2i;
T21 = FNMS(KP707106781, T20, T1Z);
T28 = T24 + T27;
T29 = FMA(KP923879532, T28, T21);
T2l = FNMS(KP923879532, T28, T21);
T2f = FMA(KP707106781, T2e, T2d);
T2i = T2g - T2h;
T2j = FNMS(KP923879532, T2i, T2f);
T2n = FMA(KP923879532, T2i, T2f);
}
{
E T2a, T2k, T2m, T2o;
T2a = T1Y * T29;
Ip[WS(rs, 5)] = FNMS(T2c, T2j, T2a);
T2k = T2c * T29;
Im[WS(rs, 5)] = FMA(T1Y, T2j, T2k);
T2m = Tw * T2l;
Ip[WS(rs, 1)] = FNMS(Tz, T2n, T2m);
T2o = Tz * T2l;
Im[WS(rs, 1)] = FMA(Tw, T2n, T2o);
}
}
{
/* (Ip[6], Im[6]) with twiddle (TG, T1o) and (Ip[2], Im[2]) with
 * twiddle (T1D, T1G). */
E T1l, T1E, T1B, T1H;
{
E TX, T1k, T1x, T1A;
TX = FNMS(KP707106781, TW, TL);
T1k = T18 - T1j;
T1l = FNMS(KP923879532, T1k, TX);
T1E = FMA(KP923879532, T1k, TX);
T1x = FNMS(KP707106781, T1w, T1t);
T1A = T1y - T1z;
T1B = FNMS(KP923879532, T1A, T1x);
T1H = FMA(KP923879532, T1A, T1x);
}
{
E T1m, T1C, T1F, T1I;
T1m = TG * T1l;
Ip[WS(rs, 6)] = FNMS(T1o, T1B, T1m);
T1C = T1o * T1l;
Im[WS(rs, 6)] = FMA(TG, T1B, T1C);
T1F = T1D * T1E;
Ip[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
T1I = T1G * T1E;
Im[WS(rs, 2)] = FMA(T1D, T1H, T1I);
}
}
{
/* (Ip[3], Im[3]) with twiddle (T2p, T2u) and (Ip[7], Im[7]) with
 * twiddle (T2z, T2C). */
E T2s, T2A, T2x, T2D;
{
E T2q, T2r, T2v, T2w;
T2q = FMA(KP707106781, T20, T1Z);
T2r = T2g + T2h;
T2s = FNMS(KP923879532, T2r, T2q);
T2A = FMA(KP923879532, T2r, T2q);
T2v = FNMS(KP707106781, T2e, T2d);
T2w = T27 - T24;
T2x = FMA(KP923879532, T2w, T2v);
T2D = FNMS(KP923879532, T2w, T2v);
}
{
E T2t, T2y, T2B, T2E;
T2t = T2p * T2s;
Ip[WS(rs, 3)] = FNMS(T2u, T2x, T2t);
T2y = T2p * T2x;
Im[WS(rs, 3)] = FMA(T2u, T2s, T2y);
T2B = T2z * T2A;
Ip[WS(rs, 7)] = FNMS(T2C, T2D, T2B);
T2E = T2z * T2D;
Im[WS(rs, 7)] = FMA(T2C, T2A, T2E);
}
}
{
/* (Ip[4], Im[4]) with twiddle (TB, TF) and (Ip[0], Im[0]) with
 * twiddle (Tv, Ty). */
E T1L, T1R, T1P, T1T;
{
E T1J, T1K, T1N, T1O;
T1J = FMA(KP707106781, TW, TL);
T1K = T1y + T1z;
T1L = FNMS(KP923879532, T1K, T1J);
T1R = FMA(KP923879532, T1K, T1J);
T1N = FMA(KP707106781, T1w, T1t);
T1O = T1j + T18;
T1P = FNMS(KP923879532, T1O, T1N);
T1T = FMA(KP923879532, T1O, T1N);
}
{
E T1M, T1Q, T1S, T1U;
T1M = TB * T1L;
Ip[WS(rs, 4)] = FNMS(TF, T1P, T1M);
T1Q = TB * T1P;
Im[WS(rs, 4)] = FMA(TF, T1L, T1Q);
T1S = Tv * T1R;
Ip[0] = FNMS(Ty, T1T, T1S);
T1U = Tv * T1T;
Im[0] = FMA(Ty, T1R, T1U);
}
}
}
}
}
}
/*
 * Twiddle instruction list for hc2cb2_16: four TW_CEXP entries (last field
 * 1, 3, 9, 15) followed by a TW_NEXT terminator entry.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file; presumably each TW_CEXP entry contributes one complex twiddle (two
 * reals) to W -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 16, the codelet name string, the twiddle
 * instruction list above, the GENUS object, and a four-entry count table.
 * NOTE(review): the count table presumably holds { additions,
 * multiplications, fused multiply/adds, other } -- confirm against the
 * hc2c_desc declaration.
 */
static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, { 104, 42, 92, 0 } };
/*
 * Registration entry point: hands the hc2cb2_16 kernel and its descriptor to
 * khc2c_register for planner p, with the HC2C_VIA_RDFT argument.
 */
void X(codelet_hc2cb2_16) (planner *p) {
X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include rdft/scalar/hc2cb.h */
/*
* This function contains 196 FP additions, 108 FP multiplications,
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
* 80 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb2_16 (variant compiled when FMA is not preferred): size-16 twiddle
 * codelet emitted by genfft (see the "Generated by" line above).
 *
 * Each loop iteration reads eight values from each of Rp, Ip, Rm and Im
 * (element k lives at offset WS(rs, k)), combines them with add/subtract
 * butterflies and the three constants below, multiplies the results by
 * twiddle factors derived from W[0..7], and stores 32 results back into the
 * same four arrays (in place).  The only result stored without a twiddle
 * multiplication is the Rp[0]/Rm[0] pair.
 *
 * Rp, Ip  advanced by +ms after every iteration.
 * Rm, Im  advanced by -ms after every iteration.
 * W       twiddle table; offset by (mb - 1) * 8 before the first iteration
 *         and advanced by 8 entries per iteration.
 * rs      stride handed to WS() to locate element k.
 * mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 */
static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically sin(pi/8), cos(pi/8) and sqrt(1/2). */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
/*
 * Twiddle set-up.  Four pairs are read from W: (Tv, Ty) = W[0..1],
 * (T1l, T1n) = W[2..3], (Tw, Tz) = W[4..5], (T1v, T1w) = W[6..7].
 * All other twiddle pairs used for the outputs are built here from
 * sums/differences of pairwise products of those values.
 * NOTE(review): presumably these pairs are the (re, im) parts of the
 * twiddles listed in twinstr (1, 3, 9, 15) and the derived pairs are
 * complex products / conjugate products of them -- confirm.
 */
{
E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
{
E T1m, T1s, T1o, T1r;
Tv = W[0];
Ty = W[1];
T1l = W[2];
T1n = W[3];
T1m = Tv * T1l;
T1s = Ty * T1l;
T1o = Ty * T1n;
T1r = Tv * T1n;
T1p = T1m + T1o;
T1t = T1r - T1s;
T27 = T1r + T1s;
T25 = T1m - T1o;
Tz = W[5];
TA = Ty * Tz;
T1J = T1l * Tz;
T15 = Tv * Tz;
T1G = T1n * Tz;
Tw = W[4];
Tx = Tv * Tw;
T1K = T1n * Tw;
T16 = Ty * Tw;
T1F = T1l * Tw;
}
TB = Tx - TA;
T21 = T1J + T1K;
T1P = T15 - T16;
T1H = T1F + T1G;
T1X = T1F - T1G;
T17 = T15 + T16;
T1L = T1J - T1K;
T1N = Tx + TA;
T1v = W[6];
T1w = W[7];
T1x = FMA(Tv, T1v, Ty * T1w);
T1B = FNMS(Ty, T1v, Tv * T1w);
{
/* Pairs built from (T25, T27) and (Tw, Tz). */
E T2D, T2E, T29, T2a;
T2D = T25 * Tz;
T2E = T27 * Tw;
T2F = T2D + T2E;
T2T = T2D - T2E;
T29 = T25 * Tw;
T2a = T27 * Tz;
T2b = T29 - T2a;
T2R = T29 + T2a;
}
{
/* Pairs built from (T1p, T1t) and (Tw, Tz). */
E T3h, T3i, T33, T34;
T3h = T1p * Tz;
T3i = T1t * Tw;
T3j = T3h + T3i;
T3x = T3h - T3i;
T33 = T1p * Tw;
T34 = T1t * Tz;
T35 = T33 - T34;
T3t = T33 + T34;
}
}
{
E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
E T3e, T3o;
/*
 * Input stage: the next four blocks load all 32 inputs (every load
 * precedes the first store further down) and form pairwise sums and
 * differences.
 */
{
/* Inputs Rp[0], Rm[7], Ip[0], Im[7], Rp[4], Rm[3], Ip[4], Im[3]. */
E T3, T2c, T1b, T2H, T6, T2G, T1e, T2d;
{
E T1, T2, T19, T1a;
T1 = Rp[0];
T2 = Rm[WS(rs, 7)];
T3 = T1 + T2;
T2c = T1 - T2;
T19 = Ip[0];
T1a = Im[WS(rs, 7)];
T1b = T19 - T1a;
T2H = T19 + T1a;
}
{
E T4, T5, T1c, T1d;
T4 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 3)];
T6 = T4 + T5;
T2G = T4 - T5;
T1c = Ip[WS(rs, 4)];
T1d = Im[WS(rs, 3)];
T1e = T1c - T1d;
T2d = T1c + T1d;
}
T7 = T3 + T6;
T36 = T2c + T2d;
T3k = T2H - T2G;
TC = T3 - T6;
T1f = T1b - T1e;
T2e = T2c - T2d;
T2I = T2G + T2H;
T1Q = T1b + T1e;
}
{
/* Inputs Rp[2], Rm[5], Ip[2], Im[5], Rm[1], Rp[6], Ip[6], Im[1]. */
E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
{
E T8, T9, TG, TH;
T8 = Rp[WS(rs, 2)];
T9 = Rm[WS(rs, 5)];
Ta = T8 + T9;
T2f = T8 - T9;
TG = Ip[WS(rs, 2)];
TH = Im[WS(rs, 5)];
TI = TG - TH;
T2g = TG + TH;
}
{
E Tb, Tc, TD, TE;
Tb = Rm[WS(rs, 1)];
Tc = Rp[WS(rs, 6)];
Td = Tb + Tc;
T2i = Tb - Tc;
TD = Ip[WS(rs, 6)];
TE = Im[WS(rs, 1)];
TF = TD - TE;
T2j = TD + TE;
}
Te = Ta + Td;
TJ = TF - TI;
T1R = TI + TF;
T18 = Ta - Td;
{
/* Sums/differences scaled by KP707106781. */
E T2J, T2K, T2h, T2k;
T2J = T2f + T2g;
T2K = T2i + T2j;
T2L = KP707106781 * (T2J - T2K);
T37 = KP707106781 * (T2J + T2K);
T2h = T2f - T2g;
T2k = T2i - T2j;
T2l = KP707106781 * (T2h + T2k);
T3l = KP707106781 * (T2h - T2k);
}
}
{
/* Inputs Rp[1], Rm[6], Ip[1], Im[6], Rp[5], Rm[2], Ip[5], Im[2]. */
E Ti, T2x, TO, T2v, Tl, T2u, TR, T2y, TL, TS;
{
E Tg, Th, TM, TN;
Tg = Rp[WS(rs, 1)];
Th = Rm[WS(rs, 6)];
Ti = Tg + Th;
T2x = Tg - Th;
TM = Ip[WS(rs, 1)];
TN = Im[WS(rs, 6)];
TO = TM - TN;
T2v = TM + TN;
}
{
E Tj, Tk, TP, TQ;
Tj = Rp[WS(rs, 5)];
Tk = Rm[WS(rs, 2)];
Tl = Tj + Tk;
T2u = Tj - Tk;
TP = Ip[WS(rs, 5)];
TQ = Im[WS(rs, 2)];
TR = TP - TQ;
T2y = TP + TQ;
}
Tm = Ti + Tl;
T1T = TO + TR;
TL = Ti - Tl;
TS = TO - TR;
TT = TL - TS;
T1h = TL + TS;
{
/* Combinations weighted by KP923879532 and KP382683432. */
E T2w, T2z, T39, T3a;
T2w = T2u + T2v;
T2z = T2x - T2y;
T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
T39 = T2x + T2y;
T3a = T2v - T2u;
T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
T3n = FMA(KP382683432, T3a, KP923879532 * T39);
}
}
{
/* Inputs Rm[0], Rp[7], Ip[7], Im[0], Rp[3], Rm[4], Ip[3], Im[4]. */
E Tp, T2q, TX, T2o, Ts, T2n, T10, T2r, TU, T11;
{
E Tn, To, TV, TW;
Tn = Rm[0];
To = Rp[WS(rs, 7)];
Tp = Tn + To;
T2q = Tn - To;
TV = Ip[WS(rs, 7)];
TW = Im[0];
TX = TV - TW;
T2o = TV + TW;
}
{
E Tq, Tr, TY, TZ;
Tq = Rp[WS(rs, 3)];
Tr = Rm[WS(rs, 4)];
Ts = Tq + Tr;
T2n = Tq - Tr;
TY = Ip[WS(rs, 3)];
TZ = Im[WS(rs, 4)];
T10 = TY - TZ;
T2r = TY + TZ;
}
Tt = Tp + Ts;
T1U = TX + T10;
TU = Tp - Ts;
T11 = TX - T10;
T12 = TU + T11;
T1i = T11 - TU;
{
/* Combinations weighted by KP923879532 and KP382683432. */
E T2p, T2s, T3c, T3d;
T2p = T2n - T2o;
T2s = T2q - T2r;
T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
T3c = T2q + T2r;
T3d = T2n + T2o;
T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
}
}
/*
 * Output stage: every block below finishes two result pairs and stores
 * them.  Each stored pair other than Rp[0]/Rm[0] is a two-term
 * combination of a data pair with one twiddle pair from the set-up
 * section.
 */
{
/* Rp[0]/Rm[0] (no twiddle) and Rp[4]/Rm[4] (twiddle pair T1N, T1P). */
E Tf, Tu, T1O, T1S, T1V, T1W;
Tf = T7 + Te;
Tu = Tm + Tt;
T1O = Tf - Tu;
T1S = T1Q + T1R;
T1V = T1T + T1U;
T1W = T1S - T1V;
Rp[0] = Tf + Tu;
Rm[0] = T1S + T1V;
Rp[WS(rs, 4)] = FNMS(T1P, T1W, T1N * T1O);
Rm[WS(rs, 4)] = FMA(T1P, T1O, T1N * T1W);
}
{
/* Ip[5]/Im[5] (T35, T3j) and Ip[1]/Im[1] (T1l, T1n). */
E T3g, T3r, T3q, T3s;
{
E T38, T3f, T3m, T3p;
T38 = T36 - T37;
T3f = T3b + T3e;
T3g = T38 - T3f;
T3r = T38 + T3f;
T3m = T3k + T3l;
T3p = T3n - T3o;
T3q = T3m - T3p;
T3s = T3m + T3p;
}
Ip[WS(rs, 5)] = FNMS(T3j, T3q, T35 * T3g);
Im[WS(rs, 5)] = FMA(T3j, T3g, T35 * T3q);
Ip[WS(rs, 1)] = FNMS(T1n, T3s, T1l * T3r);
Im[WS(rs, 1)] = FMA(T1n, T3r, T1l * T3s);
}
{
/* Ip[3]/Im[3] (T3t, T3x) and Ip[7]/Im[7] (T1v, T1w). */
E T3w, T3B, T3A, T3C;
{
E T3u, T3v, T3y, T3z;
T3u = T36 + T37;
T3v = T3n + T3o;
T3w = T3u - T3v;
T3B = T3u + T3v;
T3y = T3k - T3l;
T3z = T3b - T3e;
T3A = T3y + T3z;
T3C = T3y - T3z;
}
Ip[WS(rs, 3)] = FNMS(T3x, T3A, T3t * T3w);
Im[WS(rs, 3)] = FMA(T3t, T3A, T3x * T3w);
Ip[WS(rs, 7)] = FNMS(T1w, T3C, T1v * T3B);
Im[WS(rs, 7)] = FMA(T1v, T3C, T1w * T3B);
}
{
/* Rp[5]/Rm[5] (TB, T17) and Rp[1]/Rm[1] (T1p, T1t). */
E T14, T1q, T1k, T1u;
{
E TK, T13, T1g, T1j;
TK = TC + TJ;
T13 = KP707106781 * (TT + T12);
T14 = TK - T13;
T1q = TK + T13;
T1g = T18 + T1f;
T1j = KP707106781 * (T1h + T1i);
T1k = T1g - T1j;
T1u = T1g + T1j;
}
Rp[WS(rs, 5)] = FNMS(T17, T1k, TB * T14);
Rm[WS(rs, 5)] = FMA(T17, T14, TB * T1k);
Rp[WS(rs, 1)] = FNMS(T1t, T1u, T1p * T1q);
Rm[WS(rs, 1)] = FMA(T1t, T1q, T1p * T1u);
}
{
/* Rp[7]/Rm[7] (T1x, T1B) and Rp[3]/Rm[3] (T1H, T1L). */
E T1A, T1I, T1E, T1M;
{
E T1y, T1z, T1C, T1D;
T1y = TC - TJ;
T1z = KP707106781 * (T1i - T1h);
T1A = T1y - T1z;
T1I = T1y + T1z;
T1C = T1f - T18;
T1D = KP707106781 * (TT - T12);
T1E = T1C - T1D;
T1M = T1C + T1D;
}
Rp[WS(rs, 7)] = FNMS(T1B, T1E, T1x * T1A);
Rm[WS(rs, 7)] = FMA(T1x, T1E, T1B * T1A);
Rp[WS(rs, 3)] = FNMS(T1L, T1M, T1H * T1I);
Rm[WS(rs, 3)] = FMA(T1H, T1M, T1L * T1I);
}
{
/* Ip[6]/Im[6] (T2b, T2F) and Ip[2]/Im[2] (T2R, T2T). */
E T2C, T2S, T2Q, T2U;
{
E T2m, T2B, T2M, T2P;
T2m = T2e - T2l;
T2B = T2t - T2A;
T2C = T2m - T2B;
T2S = T2m + T2B;
T2M = T2I - T2L;
T2P = T2N - T2O;
T2Q = T2M - T2P;
T2U = T2M + T2P;
}
Ip[WS(rs, 6)] = FNMS(T2F, T2Q, T2b * T2C);
Im[WS(rs, 6)] = FMA(T2F, T2C, T2b * T2Q);
Ip[WS(rs, 2)] = FNMS(T2T, T2U, T2R * T2S);
Im[WS(rs, 2)] = FMA(T2T, T2S, T2R * T2U);
}
{
/* Ip[4]/Im[4] (Tw, Tz) and Ip[0]/Im[0] (Tv, Ty). */
E T2X, T31, T30, T32;
{
E T2V, T2W, T2Y, T2Z;
T2V = T2e + T2l;
T2W = T2N + T2O;
T2X = T2V - T2W;
T31 = T2V + T2W;
T2Y = T2I + T2L;
T2Z = T2A + T2t;
T30 = T2Y - T2Z;
T32 = T2Y + T2Z;
}
Ip[WS(rs, 4)] = FNMS(Tz, T30, Tw * T2X);
Im[WS(rs, 4)] = FMA(Tw, T30, Tz * T2X);
Ip[0] = FNMS(Ty, T32, Tv * T31);
Im[0] = FMA(Tv, T32, Ty * T31);
}
{
/* Rp[6]/Rm[6] (T1X, T21) and Rp[2]/Rm[2] (T25, T27). */
E T20, T26, T24, T28;
{
E T1Y, T1Z, T22, T23;
T1Y = T7 - Te;
T1Z = T1U - T1T;
T20 = T1Y - T1Z;
T26 = T1Y + T1Z;
T22 = T1Q - T1R;
T23 = Tm - Tt;
T24 = T22 - T23;
T28 = T23 + T22;
}
Rp[WS(rs, 6)] = FNMS(T21, T24, T1X * T20);
Rm[WS(rs, 6)] = FMA(T1X, T24, T21 * T20);
Rp[WS(rs, 2)] = FNMS(T27, T28, T25 * T26);
Rm[WS(rs, 2)] = FMA(T25, T28, T27 * T26);
}
}
}
}
}
/*
 * Twiddle instruction list for hc2cb2_16: four TW_CEXP entries (last field
 * 1, 3, 9, 15) followed by a TW_NEXT terminator entry.  The kernel reads
 * eight W entries per iteration (W[0..7]), i.e. two per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined outside
 * this file -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 16, the codelet name string, the twiddle
 * instruction list above, the GENUS object, and a count table whose first
 * three entries equal the additions / multiplications / fused multiply-add
 * counts quoted in the generator comment for this variant (156, 68, 40).
 */
static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, { 156, 68, 40, 0 } };
/*
 * Registration entry point: hands the hc2cb2_16 kernel and its descriptor to
 * khc2c_register for planner p, with the HC2C_VIA_RDFT argument.
 */
void X(codelet_hc2cb2_16) (planner *p) {
X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,194 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:09 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hc2cb2_4 -include rdft/scalar/hc2cb.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 33 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb2_4 (variant compiled when FMA is preferred): size-4 twiddle codelet.
 *
 * Per iteration: load two values from each of Rp, Ip, Rm, Im (element 0 and
 * element WS(rs, 1)), form sums/differences, scale three of the four result
 * pairs by twiddle pairs derived from W[0..3], and store all eight results
 * back in place.  All loads are performed before the first store.
 *
 * Rp, Ip move by +ms and Rm, Im by -ms after each iteration; W starts at
 * (mb - 1) * 4 and moves by 4; the loop covers m = mb .. me - 1.
 */
static void hc2cb2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
          /* The two twiddle pairs stored in the table. */
          E wa_re = W[0];
          E wa_im = W[1];
          E wb_re = W[2];
          E wb_im = W[3];

          /* Third twiddle pair, built from the two stored ones. */
          E wa_re_wb_re = wa_re * wb_re;
          E wa_re_wb_im = wa_re * wb_im;
          E wc_re = FMA(wa_im, wb_im, wa_re_wb_re);
          E wc_im = FNMS(wa_im, wb_re, wa_re_wb_im);

          /* Fetch every input up front. */
          E rp0 = Rp[0];
          E rp1 = Rp[WS(rs, 1)];
          E rm0 = Rm[0];
          E rm1 = Rm[WS(rs, 1)];
          E ip0 = Ip[0];
          E ip1 = Ip[WS(rs, 1)];
          E im0 = Im[0];
          E im1 = Im[WS(rs, 1)];

          /* First butterfly level. */
          E re_sum0 = rp0 + rm1;
          E re_dif0 = rp0 - rm1;
          E re_sum1 = rp1 + rm0;
          E re_dif1 = rp1 - rm0;
          E im_sum0 = ip0 + im1;
          E im_dif0 = ip0 - im1;
          E im_sum1 = ip1 + im0;
          E im_dif1 = ip1 - im0;

          /* Second level: the three pairs that still need a twiddle. */
          E y1_re = re_dif0 - im_sum1;
          E y1_im = re_dif1 + im_sum0;
          E y2_re = re_sum0 - re_sum1;
          E y2_im = im_dif0 - im_dif1;
          E y3_re = re_dif0 + im_sum1;
          E y3_im = im_sum0 - re_dif1;

          /* Leading products consumed by the FMA/FNMS calls below. */
          E wa_re_y1_re = wa_re * y1_re;
          E wa_re_y1_im = wa_re * y1_im;
          E wb_re_y3_im = wb_re * y3_im;
          E wb_re_y3_re = wb_re * y3_re;
          E wc_re_y2_re = wc_re * y2_re;
          E wc_im_y2_re = wc_im * y2_re;

          /* Stores, in the same order as the generated code. */
          Rp[0] = re_sum0 + re_sum1;
          Rm[0] = im_dif0 + im_dif1;
          Ip[0] = FNMS(wa_im, y1_im, wa_re_y1_re);
          Im[0] = FMA(wa_im, y1_re, wa_re_y1_im);
          Im[WS(rs, 1)] = FMA(wb_im, y3_re, wb_re_y3_im);
          Ip[WS(rs, 1)] = FNMS(wb_im, y3_im, wb_re_y3_re);
          Rp[WS(rs, 1)] = FNMS(wc_im, y2_im, wc_re_y2_re);
          Rm[WS(rs, 1)] = FMA(wc_re, y2_im, wc_im_y2_re);
     }
}
/*
 * Twiddle instruction list for hc2cb2_4: two TW_CEXP entries (last field 1
 * and 3) followed by a TW_NEXT terminator entry.  The kernel reads four W
 * entries per iteration (W[0..3]), i.e. two per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined outside
 * this file -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 4, the codelet name string, the twiddle
 * instruction list above, the GENUS object, and a count table whose first
 * three entries equal the additions / multiplications / fused multiply-add
 * counts quoted in the generator comment for this variant (16, 8, 8).
 */
static const hc2c_desc desc = { 4, "hc2cb2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
/*
 * Registration entry point: hands the hc2cb2_4 kernel and its descriptor to
 * khc2c_register for planner p, with the HC2C_VIA_RDFT argument.
 */
void X(codelet_hc2cb2_4) (planner *p) {
X(khc2c_register) (p, hc2cb2_4, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 4 -dif -name hc2cb2_4 -include rdft/scalar/hc2cb.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb2_4 (variant compiled when FMA is not preferred): size-4 twiddle
 * codelet.
 *
 * Per iteration: load two values from each of Rp, Ip, Rm, Im (element 0 and
 * element WS(rs, 1)), form sums/differences, scale three of the four result
 * pairs by twiddle pairs derived from W[0..3], and store all eight results
 * back in place.  All loads are performed before the first store.
 *
 * Rp, Ip move by +ms and Rm, Im by -ms after each iteration; W starts at
 * (mb - 1) * 4 and moves by 4; the loop covers m = mb .. me - 1.
 */
static void hc2cb2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
          /* The two twiddle pairs stored in the table. */
          E wa_re = W[0];
          E wa_im = W[1];
          E wb_re = W[2];
          E wb_im = W[3];

          /* Third twiddle pair, built from the two stored ones. */
          E wc_re = FMA(wa_re, wb_re, wa_im * wb_im);
          E wc_im = FNMS(wa_im, wb_re, wa_re * wb_im);

          /* Fetch every input up front. */
          E rp0 = Rp[0];
          E rm1 = Rm[WS(rs, 1)];
          E ip0 = Ip[0];
          E im1 = Im[WS(rs, 1)];
          E rp1 = Rp[WS(rs, 1)];
          E rm0 = Rm[0];
          E ip1 = Ip[WS(rs, 1)];
          E im0 = Im[0];

          /* First butterfly level. */
          E re_sum0 = rp0 + rm1;
          E re_dif0 = rp0 - rm1;
          E im_dif0 = ip0 - im1;
          E im_sum0 = ip0 + im1;
          E re_sum1 = rp1 + rm0;
          E re_dif1 = rp1 - rm0;
          E im_dif1 = ip1 - im0;
          E im_sum1 = ip1 + im0;

          /* Second level: the three pairs that still need a twiddle. */
          E y2_re = re_sum0 - re_sum1;
          E y2_im = im_dif0 - im_dif1;
          E y1_re = re_dif0 - im_sum1;
          E y1_im = re_dif1 + im_sum0;
          E y3_re = re_dif0 + im_sum1;
          E y3_im = im_sum0 - re_dif1;

          /* Stores, in the same order as the generated code. */
          Rp[0] = re_sum0 + re_sum1;
          Rm[0] = im_dif0 + im_dif1;
          Rp[WS(rs, 1)] = FNMS(wc_im, y2_im, wc_re * y2_re);
          Rm[WS(rs, 1)] = FMA(wc_im, y2_re, wc_re * y2_im);
          Ip[0] = FNMS(wa_im, y1_im, wa_re * y1_re);
          Im[0] = FMA(wa_re, y1_im, wa_im * y1_re);
          Ip[WS(rs, 1)] = FNMS(wb_im, y3_im, wb_re * y3_re);
          Im[WS(rs, 1)] = FMA(wb_re, y3_im, wb_im * y3_re);
     }
}
/*
 * Twiddle instruction list for hc2cb2_4: two TW_CEXP entries (last field 1
 * and 3) followed by a TW_NEXT terminator entry.  The kernel reads four W
 * entries per iteration (W[0..3]), i.e. two per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined outside
 * this file -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 4, the codelet name string, the twiddle
 * instruction list above, the GENUS object, and a count table whose first
 * three entries equal the additions / multiplications / fused multiply-add
 * counts quoted in the generator comment for this variant (16, 8, 8).
 */
static const hc2c_desc desc = { 4, "hc2cb2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
/*
 * Registration entry point: hands the hc2cb2_4 kernel and its descriptor to
 * khc2c_register for planner p, with the HC2C_VIA_RDFT argument.
 */
void X(codelet_hc2cb2_4) (planner *p) {
X(khc2c_register) (p, hc2cb2_4, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,387 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:09 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hc2cb2_8 -include rdft/scalar/hc2cb.h */
/*
* This function contains 74 FP additions, 50 FP multiplications,
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
* 47 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb2_8 (variant compiled when FMA is preferred): size-8 twiddle codelet
 * emitted by genfft (see the "Generated by" line above).
 *
 * Each loop iteration reads four values from each of Rp, Ip, Rm and Im
 * (element k lives at offset WS(rs, k)), combines them with add/subtract
 * butterflies and the constant below, multiplies the results by twiddle
 * factors derived from W[0..5], and stores 16 results back into the same
 * four arrays (in place).  The only result stored without a twiddle
 * multiplication is the Rp[0]/Rm[0] pair.
 *
 * Rp, Ip  advanced by +ms after every iteration.
 * Rm, Im  advanced by -ms after every iteration.
 * W       twiddle table; offset by (mb - 1) * 6 before the first iteration
 *         and advanced by 6 entries per iteration.
 * rs      stride handed to WS() to locate element k.
 * mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 */
static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically sqrt(1/2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
E Tf, Tg, Tl, Tp, Ti, Tj, Tk, T1b, T1u, T1e, T1o, To, Tq, TK;
/*
 * Twiddle set-up.  Three pairs are read from W: (Tf, Ti) = W[0..1],
 * (Tg, Tj) = W[2..3], (Tl, Tp) = W[4..5].  The remaining twiddle pairs
 * (Tk, To), (T1b, T1e), (T1o, T1u) and (Tq, TK) are built from them with
 * FMA/FNMS.
 * NOTE(review): presumably the stored pairs are the (re, im) parts of
 * the twiddles listed in twinstr (1, 3, 7) -- confirm.
 */
{
E Th, T1n, T1t, Tn, Tm, TJ;
Tf = W[0];
Tg = W[2];
Th = Tf * Tg;
Tl = W[4];
T1n = Tf * Tl;
Tp = W[5];
T1t = Tf * Tp;
Ti = W[1];
Tj = W[3];
Tn = Tf * Tj;
Tk = FMA(Ti, Tj, Th);
T1b = FNMS(Ti, Tj, Th);
T1u = FNMS(Ti, Tl, T1t);
T1e = FMA(Ti, Tg, Tn);
T1o = FMA(Ti, Tp, T1n);
Tm = Tk * Tl;
TJ = Tk * Tp;
To = FNMS(Ti, Tg, Tn);
Tq = FMA(To, Tp, Tm);
TK = FNMS(To, Tl, TJ);
}
{
E T7, T1p, T1v, Tv, TP, T13, T1h, TZ, Te, T1k, T1w, T1q, TQ, TR, T10;
E TG, T14;
/*
 * Input stage: the next two blocks load all 16 inputs (every load
 * precedes the first store further down) and form pairwise sums and
 * differences.
 */
{
/* Inputs Rp[0], Rm[3], Ip[0], Im[3], Rp[2], Rm[1], Ip[2], Im[1]. */
E T3, Tr, TO, T1f, T6, TL, Tu, T1g;
{
E T1, T2, TM, TN;
T1 = Rp[0];
T2 = Rm[WS(rs, 3)];
T3 = T1 + T2;
Tr = T1 - T2;
TM = Ip[0];
TN = Im[WS(rs, 3)];
TO = TM + TN;
T1f = TM - TN;
}
{
E T4, T5, Ts, Tt;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 1)];
T6 = T4 + T5;
TL = T4 - T5;
Ts = Ip[WS(rs, 2)];
Tt = Im[WS(rs, 1)];
Tu = Ts + Tt;
T1g = Ts - Tt;
}
T7 = T3 + T6;
T1p = T3 - T6;
T1v = T1f - T1g;
Tv = Tr - Tu;
TP = TL + TO;
T13 = TO - TL;
T1h = T1f + T1g;
TZ = Tr + Tu;
}
{
/* Inputs Rp[1], Rm[2], Ip[1], Im[2], Rm[0], Rp[3], Ip[3], Im[0]. */
E Ta, Tw, Tz, T1i, Td, TB, TE, T1j, TA, TF;
{
E T8, T9, Tx, Ty;
T8 = Rp[WS(rs, 1)];
T9 = Rm[WS(rs, 2)];
Ta = T8 + T9;
Tw = T8 - T9;
Tx = Ip[WS(rs, 1)];
Ty = Im[WS(rs, 2)];
Tz = Tx + Ty;
T1i = Tx - Ty;
}
{
E Tb, Tc, TC, TD;
Tb = Rm[0];
Tc = Rp[WS(rs, 3)];
Td = Tb + Tc;
TB = Tb - Tc;
TC = Ip[WS(rs, 3)];
TD = Im[0];
TE = TC + TD;
T1j = TC - TD;
}
Te = Ta + Td;
T1k = T1i + T1j;
T1w = Ta - Td;
T1q = T1j - T1i;
TQ = Tw + Tz;
TR = TB + TE;
T10 = TQ + TR;
TA = Tw - Tz;
TF = TB - TE;
TG = TA + TF;
T14 = TA - TF;
}
/* Output stage.  Rp[0]/Rm[0] are stored without a twiddle factor. */
Rp[0] = T7 + Te;
Rm[0] = T1h + T1k;
{
/* Ip[1]/Im[1], using the stored pair (Tg, Tj). */
E T11, T12, T15, T16;
T11 = FNMS(KP707106781, T10, TZ);
T12 = Tg * T11;
T15 = FMA(KP707106781, T14, T13);
T16 = Tg * T15;
Ip[WS(rs, 1)] = FNMS(Tj, T15, T12);
Im[WS(rs, 1)] = FMA(Tj, T11, T16);
}
{
/* Rp[1]/Rm[1], using the derived pair (Tk, To). */
E T1z, T1A, T1B, T1C;
T1z = T1p + T1q;
T1A = Tk * T1z;
T1B = T1w + T1v;
T1C = Tk * T1B;
Rp[WS(rs, 1)] = FNMS(To, T1B, T1A);
Rm[WS(rs, 1)] = FMA(To, T1z, T1C);
}
{
/* Ip[3]/Im[3], using the stored pair (Tl, Tp). */
E T17, T18, T19, T1a;
T17 = FMA(KP707106781, T10, TZ);
T18 = Tl * T17;
T19 = FNMS(KP707106781, T14, T13);
T1a = Tl * T19;
Ip[WS(rs, 3)] = FNMS(Tp, T19, T18);
Im[WS(rs, 3)] = FMA(Tp, T17, T1a);
}
{
/* Rp[2]/Rm[2], using the derived pair (T1b, T1e). */
E T1l, T1d, T1m, T1c;
T1l = T1h - T1k;
T1c = T7 - Te;
T1d = T1b * T1c;
T1m = T1e * T1c;
Rp[WS(rs, 2)] = FNMS(T1e, T1l, T1d);
Rm[WS(rs, 2)] = FMA(T1b, T1l, T1m);
}
{
/* Rp[3]/Rm[3], using the derived pair (T1o, T1u). */
E T1r, T1s, T1x, T1y;
T1r = T1p - T1q;
T1s = T1o * T1r;
T1x = T1v - T1w;
T1y = T1o * T1x;
Rp[WS(rs, 3)] = FNMS(T1u, T1x, T1s);
Rm[WS(rs, 3)] = FMA(T1u, T1r, T1y);
}
{
/* Ip[2]/Im[2] (derived pair Tq, TK) and Ip[0]/Im[0] (stored pair Tf, Ti). */
E TT, TX, TW, TY, TI, TU, TS, TV, TH;
TS = TQ - TR;
TT = FNMS(KP707106781, TS, TP);
TX = FMA(KP707106781, TS, TP);
TV = FMA(KP707106781, TG, Tv);
TW = Tf * TV;
TY = Ti * TV;
TH = FNMS(KP707106781, TG, Tv);
TI = Tq * TH;
TU = TK * TH;
Ip[WS(rs, 2)] = FNMS(TK, TT, TI);
Im[WS(rs, 2)] = FMA(Tq, TT, TU);
Ip[0] = FNMS(Ti, TX, TW);
Im[0] = FMA(Tf, TX, TY);
}
}
}
}
}
/*
 * Twiddle instruction list for hc2cb2_8: three TW_CEXP entries (last field
 * 1, 3, 7) followed by a TW_NEXT terminator entry.  The kernel reads six W
 * entries per iteration (W[0..5]), i.e. two per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined outside
 * this file -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 8, the codelet name string, the twiddle
 * instruction list above, the GENUS object, and a count table whose first
 * three entries equal the additions / multiplications / fused multiply-add
 * counts quoted in the generator comment for this variant (44, 20, 30).
 */
static const hc2c_desc desc = { 8, "hc2cb2_8", twinstr, &GENUS, { 44, 20, 30, 0 } };
/*
 * Registration entry point: hands the hc2cb2_8 kernel and its descriptor to
 * khc2c_register for planner p, with the HC2C_VIA_RDFT argument.
 */
void X(codelet_hc2cb2_8) (planner *p) {
X(khc2c_register) (p, hc2cb2_8, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hc2cb2_8 -include rdft/scalar/hc2cb.h */
/*
* This function contains 74 FP additions, 44 FP multiplications,
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
* 46 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb2_8 (variant compiled when FMA is not preferred): size-8 twiddle
 * codelet emitted by genfft (see the "Generated by" line above).
 *
 * Each loop iteration reads four values from each of Rp, Ip, Rm and Im
 * (element k lives at offset WS(rs, k)), combines them with add/subtract
 * butterflies and the constant below, multiplies the results by twiddle
 * factors derived from W[0..5], and stores 16 results back into the same
 * four arrays (in place).  The only result stored without a twiddle
 * multiplication is the Rp[0]/Rm[0] pair.
 *
 * Rp, Ip  advanced by +ms after every iteration.
 * Rm, Im  advanced by -ms after every iteration.
 * W       twiddle table; offset by (mb - 1) * 6 before the first iteration
 *         and advanced by 6 entries per iteration.
 * rs      stride handed to WS() to locate element k.
 * mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 */
static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically sqrt(1/2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
E Tf, Ti, Tg, Tj, Tl, Tp, TP, TR, TF, TG, TH, T15, TL, TT;
/*
 * Twiddle set-up.  Three pairs are read from W: (Tf, Ti) = W[0..1],
 * (Tg, Tj) = W[2..3], (TF, TG) = W[4..5].  The remaining twiddle pairs
 * (Tl, Tp), (TP, TR), (TH, TL) and (TT, T15) are built from sums and
 * differences of pairwise products of those values.
 * NOTE(review): presumably the stored pairs are the (re, im) parts of
 * the twiddles listed in twinstr (1, 3, 7) -- confirm.
 */
{
E Th, To, Tk, Tn;
Tf = W[0];
Ti = W[1];
Tg = W[2];
Tj = W[3];
Th = Tf * Tg;
To = Ti * Tg;
Tk = Ti * Tj;
Tn = Tf * Tj;
Tl = Th - Tk;
Tp = Tn + To;
TP = Th + Tk;
TR = Tn - To;
TF = W[4];
TG = W[5];
TH = FMA(Tf, TF, Ti * TG);
T15 = FNMS(TR, TF, TP * TG);
TL = FNMS(Ti, TF, Tf * TG);
TT = FMA(TP, TF, TR * TG);
}
{
E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
E TN, Tm, TE;
/*
 * Input stage: the next two blocks load all 16 inputs (every load
 * precedes the first store further down) and form pairwise sums and
 * differences.
 */
{
/* Inputs Rp[0], Rm[3], Ip[0], Im[3], Rp[2], Rm[1], Ip[2], Im[1]. */
E T3, TU, Ts, T17, T6, T16, Tv, TV;
{
E T1, T2, Tq, Tr;
T1 = Rp[0];
T2 = Rm[WS(rs, 3)];
T3 = T1 + T2;
TU = T1 - T2;
Tq = Ip[0];
Tr = Im[WS(rs, 3)];
Ts = Tq - Tr;
T17 = Tq + Tr;
}
{
E T4, T5, Tt, Tu;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 1)];
T6 = T4 + T5;
T16 = T4 - T5;
Tt = Ip[WS(rs, 2)];
Tu = Im[WS(rs, 1)];
Tv = Tt - Tu;
TV = Tt + Tu;
}
T7 = T3 + T6;
T1f = TU + TV;
T1i = T17 - T16;
Tw = Ts + Tv;
TI = T3 - T6;
TW = TU - TV;
T18 = T16 + T17;
TM = Ts - Tv;
}
{
/* Inputs Rp[1], Rm[2], Ip[1], Im[2], Rm[0], Rp[3], Ip[3], Im[0]. */
E Ta, TX, Tz, TY, Td, T10, TC, T11;
{
E T8, T9, Tx, Ty;
T8 = Rp[WS(rs, 1)];
T9 = Rm[WS(rs, 2)];
Ta = T8 + T9;
TX = T8 - T9;
Tx = Ip[WS(rs, 1)];
Ty = Im[WS(rs, 2)];
Tz = Tx - Ty;
TY = Tx + Ty;
}
{
E Tb, Tc, TA, TB;
Tb = Rm[0];
Tc = Rp[WS(rs, 3)];
Td = Tb + Tc;
T10 = Tb - Tc;
TA = Ip[WS(rs, 3)];
TB = Im[0];
TC = TA - TB;
T11 = TA + TB;
}
Te = Ta + Td;
T19 = TX + TY;
T1a = T10 + T11;
TD = Tz + TC;
TJ = TC - Tz;
TZ = TX - TY;
T12 = T10 - T11;
TN = Ta - Td;
}
/* Output stage.  Rp[0]/Rm[0] are stored without a twiddle factor. */
Rp[0] = T7 + Te;
Rm[0] = Tw + TD;
Tm = T7 - Te;
TE = Tw - TD;
/* Rp[2]/Rm[2], using the derived pair (Tl, Tp). */
Rp[WS(rs, 2)] = FNMS(Tp, TE, Tl * Tm);
Rm[WS(rs, 2)] = FMA(Tp, Tm, Tl * TE);
{
/* Rp[1]/Rm[1] (derived pair TP, TR) and Rp[3]/Rm[3] (derived pair TH, TL). */
E TQ, TS, TK, TO;
TQ = TI + TJ;
TS = TN + TM;
Rp[WS(rs, 1)] = FNMS(TR, TS, TP * TQ);
Rm[WS(rs, 1)] = FMA(TP, TS, TR * TQ);
TK = TI - TJ;
TO = TM - TN;
Rp[WS(rs, 3)] = FNMS(TL, TO, TH * TK);
Rm[WS(rs, 3)] = FMA(TH, TO, TL * TK);
}
{
/* Ip[1]/Im[1] (stored pair Tg, Tj) and Ip[3]/Im[3] (stored pair TF, TG). */
E T1h, T1l, T1k, T1m, T1g, T1j;
T1g = KP707106781 * (T19 + T1a);
T1h = T1f - T1g;
T1l = T1f + T1g;
T1j = KP707106781 * (TZ - T12);
T1k = T1i + T1j;
T1m = T1i - T1j;
Ip[WS(rs, 1)] = FNMS(Tj, T1k, Tg * T1h);
Im[WS(rs, 1)] = FMA(Tg, T1k, Tj * T1h);
Ip[WS(rs, 3)] = FNMS(TG, T1m, TF * T1l);
Im[WS(rs, 3)] = FMA(TF, T1m, TG * T1l);
}
{
/* Ip[2]/Im[2] (derived pair TT, T15) and Ip[0]/Im[0] (stored pair Tf, Ti). */
E T14, T1d, T1c, T1e, T13, T1b;
T13 = KP707106781 * (TZ + T12);
T14 = TW - T13;
T1d = TW + T13;
T1b = KP707106781 * (T19 - T1a);
T1c = T18 - T1b;
T1e = T18 + T1b;
Ip[WS(rs, 2)] = FNMS(T15, T1c, TT * T14);
Im[WS(rs, 2)] = FMA(T15, T14, TT * T1c);
Ip[0] = FNMS(Ti, T1e, Tf * T1d);
Im[0] = FMA(Ti, T1d, Tf * T1e);
}
}
}
}
}
/*
 * Twiddle instruction list for hc2cb2_8: three TW_CEXP entries (last field
 * 1, 3, 7) followed by a TW_NEXT terminator entry.  The kernel reads six W
 * entries per iteration (W[0..5]), i.e. two per TW_CEXP entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined outside
 * this file -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 8, the codelet name string, the twiddle
 * instruction list above, the GENUS object, and a count table whose first
 * three entries equal the additions / multiplications / fused multiply-add
 * counts quoted in the generator comment for this variant (56, 26, 18).
 */
static const hc2c_desc desc = { 8, "hc2cb2_8", twinstr, &GENUS, { 56, 26, 18, 0 } };
/*
 * Registration entry point: hands the hc2cb2_8 kernel and its descriptor to
 * khc2c_register for planner p, with the HC2C_VIA_RDFT argument.
 */
void X(codelet_hc2cb2_8) (planner *p) {
X(khc2c_register) (p, hc2cb2_8, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,513 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cb_10 -include rdft/scalar/hc2cb.h */
/*
* This function contains 102 FP additions, 72 FP multiplications,
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
* 47 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_10 (variant compiled when FMA is preferred): size-10 twiddle codelet
 * emitted by genfft (see the "Generated by" line above).
 *
 * Each loop iteration reads five values from each of Rp, Ip, Rm and Im
 * (element k lives at offset WS(rs, k)), combines them using the four
 * constants below, multiplies the results by twiddle pairs read directly
 * from W[0..17], and stores 20 results back into the same four arrays (in
 * place).  The only result stored without a twiddle multiplication is the
 * Rp[0]/Rm[0] pair.
 *
 * Rp, Ip  advanced by +ms after every iteration.
 * Rm, Im  advanced by -ms after every iteration.
 * W       twiddle table; offset by (mb - 1) * 18 before the first iteration
 *         and advanced by 18 entries per iteration.
 * rs      stride handed to WS() to locate element k.
 * mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 */
static void hc2cb_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically sin(2*pi/5), sqrt(5)/4, 1/4 and (sqrt(5) - 1)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
E TH, T1B, TB, T11, T1E, T1G, TK, TM, T1x, T1V, T3, T1g, Tl, T1I, T1J;
E TO, TP, T1p, Ti, Tk, T1n, T1o, TF, TG;
/* Input stage (all loads precede the first store): Ip[0] and Im[4]. */
TF = Ip[0];
TG = Im[WS(rs, 4)];
TH = TF - TG;
T1B = TF + TG;
{
/* Remaining Ip/Im inputs: Ip[4], Im[0], Ip[3], Im[1], Ip[1], Im[3], Ip[2], Im[2]. */
E Tp, T1u, Tz, T1s, Ts, T1v, Tw, T1r;
{
E Tn, To, Tx, Ty;
Tn = Ip[WS(rs, 4)];
To = Im[0];
Tp = Tn - To;
T1u = Tn + To;
Tx = Ip[WS(rs, 3)];
Ty = Im[WS(rs, 1)];
Tz = Tx - Ty;
T1s = Tx + Ty;
}
{
E Tq, Tr, Tu, Tv;
Tq = Ip[WS(rs, 1)];
Tr = Im[WS(rs, 3)];
Ts = Tq - Tr;
T1v = Tq + Tr;
Tu = Ip[WS(rs, 2)];
Tv = Im[WS(rs, 2)];
Tw = Tu - Tv;
T1r = Tu + Tv;
}
{
E Tt, TA, T1C, T1D;
Tt = Tp - Ts;
TA = Tw - Tz;
TB = FNMS(KP618033988, TA, Tt);
T11 = FMA(KP618033988, Tt, TA);
T1C = T1r - T1s;
T1D = T1u - T1v;
T1E = T1C + T1D;
T1G = T1C - T1D;
}
{
E TI, TJ, T1t, T1w;
TI = Tw + Tz;
TJ = Tp + Ts;
TK = TI + TJ;
TM = TI - TJ;
T1t = T1r + T1s;
T1w = T1u + T1v;
T1x = FMA(KP618033988, T1w, T1t);
T1V = FNMS(KP618033988, T1t, T1w);
}
}
{
/* Rp/Rm inputs: Rp[0], Rm[4], Rp[4], Rm[0], Rm[3], Rp[1], Rp[2], Rm[2], Rm[1], Rp[3]. */
E Td, T1k, Tg, T1l, Th, T1m, T6, T1h, T9, T1i, Ta, T1j, T1, T2;
T1 = Rp[0];
T2 = Rm[WS(rs, 4)];
T3 = T1 + T2;
T1g = T1 - T2;
{
E Tb, Tc, Te, Tf;
Tb = Rp[WS(rs, 4)];
Tc = Rm[0];
Td = Tb + Tc;
T1k = Tb - Tc;
Te = Rm[WS(rs, 3)];
Tf = Rp[WS(rs, 1)];
Tg = Te + Tf;
T1l = Te - Tf;
}
Th = Td + Tg;
T1m = T1k + T1l;
{
E T4, T5, T7, T8;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 2)];
T6 = T4 + T5;
T1h = T4 - T5;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 3)];
T9 = T7 + T8;
T1i = T7 - T8;
}
Ta = T6 + T9;
T1j = T1h + T1i;
Tl = Ta - Th;
T1I = T1h - T1i;
T1J = T1k - T1l;
TO = Td - Tg;
TP = T6 - T9;
T1p = T1j - T1m;
Ti = Ta + Th;
Tk = FNMS(KP250000000, Ti, T3);
T1n = T1j + T1m;
T1o = FNMS(KP250000000, T1n, T1g);
}
/* Output stage.  Rp[0]/Rm[0] are stored without a twiddle factor. */
Rp[0] = T3 + Ti;
Rm[0] = TH + TK;
{
/* Ip[2]/Im[2], using the twiddle pair W[8], W[9]. */
E T2d, T29, T2b, T2c, T2e, T2a;
T2d = T1B + T1E;
T2a = T1g + T1n;
T29 = W[8];
T2b = T29 * T2a;
T2c = W[9];
T2e = T2c * T2a;
Ip[WS(rs, 2)] = FNMS(T2c, T2d, T2b);
Im[WS(rs, 2)] = FMA(T29, T2d, T2e);
}
{
/* The four remaining Rp/Rm pairs (elements 1, 3, 4, 2). */
E TQ, T16, TC, TU, TN, T15, T12, T1a, Tm, TL, T10;
TQ = FNMS(KP618033988, TP, TO);
T16 = FMA(KP618033988, TO, TP);
Tm = FNMS(KP559016994, Tl, Tk);
TC = FMA(KP951056516, TB, Tm);
TU = FNMS(KP951056516, TB, Tm);
TL = FNMS(KP250000000, TK, TH);
TN = FNMS(KP559016994, TM, TL);
T15 = FMA(KP559016994, TM, TL);
T10 = FMA(KP559016994, Tl, Tk);
T12 = FMA(KP951056516, T11, T10);
T1a = FNMS(KP951056516, T11, T10);
{
/* Rp[1]/Rm[1], twiddle pair W[2], W[3]. */
E TR, TE, TS, Tj, TD;
TR = FNMS(KP951056516, TQ, TN);
TE = W[3];
TS = TE * TC;
Tj = W[2];
TD = Tj * TC;
Rp[WS(rs, 1)] = FNMS(TE, TR, TD);
Rm[WS(rs, 1)] = FMA(Tj, TR, TS);
}
{
/* Rp[3]/Rm[3], twiddle pair W[10], W[11]. */
E T1d, T1c, T1e, T19, T1b;
T1d = FMA(KP951056516, T16, T15);
T1c = W[11];
T1e = T1c * T1a;
T19 = W[10];
T1b = T19 * T1a;
Rp[WS(rs, 3)] = FNMS(T1c, T1d, T1b);
Rm[WS(rs, 3)] = FMA(T19, T1d, T1e);
}
{
/* Rp[4]/Rm[4], twiddle pair W[14], W[15]. */
E TX, TW, TY, TT, TV;
TX = FMA(KP951056516, TQ, TN);
TW = W[15];
TY = TW * TU;
TT = W[14];
TV = TT * TU;
Rp[WS(rs, 4)] = FNMS(TW, TX, TV);
Rm[WS(rs, 4)] = FMA(TT, TX, TY);
}
{
/* Rp[2]/Rm[2], twiddle pair W[6], W[7]. */
E T17, T14, T18, TZ, T13;
T17 = FNMS(KP951056516, T16, T15);
T14 = W[7];
T18 = T14 * T12;
TZ = W[6];
T13 = TZ * T12;
Rp[WS(rs, 2)] = FNMS(T14, T17, T13);
Rm[WS(rs, 2)] = FMA(TZ, T17, T18);
}
}
{
/* The four remaining Ip/Im pairs (elements 0, 3, 4, 1). */
E T1K, T20, T1y, T1O, T1H, T1Z, T1W, T24, T1q, T1F, T1U;
T1K = FMA(KP618033988, T1J, T1I);
T20 = FNMS(KP618033988, T1I, T1J);
T1q = FMA(KP559016994, T1p, T1o);
T1y = FNMS(KP951056516, T1x, T1q);
T1O = FMA(KP951056516, T1x, T1q);
T1F = FNMS(KP250000000, T1E, T1B);
T1H = FMA(KP559016994, T1G, T1F);
T1Z = FNMS(KP559016994, T1G, T1F);
T1U = FNMS(KP559016994, T1p, T1o);
T1W = FNMS(KP951056516, T1V, T1U);
T24 = FMA(KP951056516, T1V, T1U);
{
/* Ip[0]/Im[0], twiddle pair W[0], W[1]. */
E T1L, T1A, T1M, T1f, T1z;
T1L = FMA(KP951056516, T1K, T1H);
T1A = W[1];
T1M = T1A * T1y;
T1f = W[0];
T1z = T1f * T1y;
Ip[0] = FNMS(T1A, T1L, T1z);
Im[0] = FMA(T1f, T1L, T1M);
}
{
/* Ip[3]/Im[3], twiddle pair W[12], W[13]. */
E T27, T26, T28, T23, T25;
T27 = FNMS(KP951056516, T20, T1Z);
T26 = W[13];
T28 = T26 * T24;
T23 = W[12];
T25 = T23 * T24;
Ip[WS(rs, 3)] = FNMS(T26, T27, T25);
Im[WS(rs, 3)] = FMA(T23, T27, T28);
}
{
/* Ip[4]/Im[4], twiddle pair W[16], W[17]. */
E T1R, T1Q, T1S, T1N, T1P;
T1R = FNMS(KP951056516, T1K, T1H);
T1Q = W[17];
T1S = T1Q * T1O;
T1N = W[16];
T1P = T1N * T1O;
Ip[WS(rs, 4)] = FNMS(T1Q, T1R, T1P);
Im[WS(rs, 4)] = FMA(T1N, T1R, T1S);
}
{
/* Ip[1]/Im[1], twiddle pair W[4], W[5]. */
E T21, T1Y, T22, T1T, T1X;
T21 = FMA(KP951056516, T20, T1Z);
T1Y = W[5];
T22 = T1Y * T1W;
T1T = W[4];
T1X = T1T * T1W;
Ip[WS(rs, 1)] = FNMS(T1Y, T21, T1X);
Im[WS(rs, 1)] = FMA(T1T, T21, T22);
}
}
}
}
}
/*
 * Twiddle instruction list for hc2cb_10: one TW_FULL entry (last field 10)
 * followed by a TW_NEXT terminator entry.  The kernel reads 18 W entries per
 * iteration (W[0..17]).
 * NOTE(review): presumably TW_FULL with 10 requests all 9 non-trivial
 * complex twiddles (18 reals); the tw_instr semantics are defined outside
 * this file -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 10, the codelet name string, the twiddle
 * instruction list above, the GENUS object, and a count table whose first
 * three entries equal the additions / multiplications / fused multiply-add
 * counts quoted in the generator comment for this variant (48, 18, 54).
 */
static const hc2c_desc desc = { 10, "hc2cb_10", twinstr, &GENUS, { 48, 18, 54, 0 } };
/*
 * Registration entry point: hands the hc2cb_10 kernel and its descriptor to
 * khc2c_register for planner p, with the HC2C_VIA_RDFT argument.
 */
void X(codelet_hc2cb_10) (planner *p) {
X(khc2c_register) (p, hc2cb_10, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cb_10 -include rdft/scalar/hc2cb.h */
/*
* This function contains 102 FP additions, 60 FP multiplications,
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
* 39 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_10 (variant compiled when FMA is not preferred): size-10 twiddle
 * codelet emitted by genfft (see the "Generated by" line above).
 *
 * Each loop iteration reads five values from each of Rp, Ip, Rm and Im
 * (element k lives at offset WS(rs, k)), combines them using the four
 * constants below, multiplies the results by twiddle pairs read directly
 * from W[0..17], and stores 20 results back into the same four arrays (in
 * place).  The only result stored without a twiddle multiplication is the
 * Rp[0]/Rm[0] pair.
 *
 * Rp, Ip  advanced by +ms after every iteration.
 * Rm, Im  advanced by -ms after every iteration.
 * W       twiddle table; offset by (mb - 1) * 18 before the first iteration
 *         and advanced by 18 entries per iteration.
 * rs      stride handed to WS() to locate element k.
 * mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 */
static void hc2cb_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically 1/4, sin(2*pi/5), sin(pi/5) and sqrt(5)/4. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
E T3, T18, TJ, T1i, TE, TF, T1B, T1A, T1f, T1t, Ti, Tl, Tt, TA, T1w;
E T1v, T1p, T1E, TM, TO;
/*
 * Input stage: the next three blocks load all 20 inputs (every load
 * precedes the first store further down) and form sums and differences.
 */
{
/* Inputs Rp[0], Rm[4], Ip[0], Im[4]. */
E T1, T2, TH, TI;
T1 = Rp[0];
T2 = Rm[WS(rs, 4)];
T3 = T1 + T2;
T18 = T1 - T2;
TH = Ip[0];
TI = Im[WS(rs, 4)];
TJ = TH - TI;
T1i = TH + TI;
}
{
/* Remaining Rp/Rm inputs: Rp[2], Rm[2], Rm[3], Rp[1], Rm[1], Rp[3], Rp[4], Rm[0]. */
E T6, T19, Tg, T1d, T9, T1a, Td, T1c;
{
E T4, T5, Te, Tf;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 2)];
T6 = T4 + T5;
T19 = T4 - T5;
Te = Rm[WS(rs, 3)];
Tf = Rp[WS(rs, 1)];
Tg = Te + Tf;
T1d = Te - Tf;
}
{
E T7, T8, Tb, Tc;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 3)];
T9 = T7 + T8;
T1a = T7 - T8;
Tb = Rp[WS(rs, 4)];
Tc = Rm[0];
Td = Tb + Tc;
T1c = Tb - Tc;
}
TE = T6 - T9;
TF = Td - Tg;
T1B = T1c - T1d;
T1A = T19 - T1a;
{
E T1b, T1e, Ta, Th;
T1b = T19 + T1a;
T1e = T1c + T1d;
T1f = T1b + T1e;
T1t = KP559016994 * (T1b - T1e);
Ta = T6 + T9;
Th = Td + Tg;
Ti = Ta + Th;
Tl = KP559016994 * (Ta - Th);
}
}
{
/* Remaining Ip/Im inputs: Ip[2], Im[2], Ip[1], Im[3], Ip[3], Im[1], Ip[4], Im[0]. */
E Tp, T1j, Tz, T1n, Ts, T1k, Tw, T1m;
{
E Tn, To, Tx, Ty;
Tn = Ip[WS(rs, 2)];
To = Im[WS(rs, 2)];
Tp = Tn - To;
T1j = Tn + To;
Tx = Ip[WS(rs, 1)];
Ty = Im[WS(rs, 3)];
Tz = Tx - Ty;
T1n = Tx + Ty;
}
{
E Tq, Tr, Tu, Tv;
Tq = Ip[WS(rs, 3)];
Tr = Im[WS(rs, 1)];
Ts = Tq - Tr;
T1k = Tq + Tr;
Tu = Ip[WS(rs, 4)];
Tv = Im[0];
Tw = Tu - Tv;
T1m = Tu + Tv;
}
Tt = Tp - Ts;
TA = Tw - Tz;
T1w = T1m + T1n;
T1v = T1j + T1k;
{
E T1l, T1o, TK, TL;
T1l = T1j - T1k;
T1o = T1m - T1n;
T1p = T1l + T1o;
T1E = KP559016994 * (T1l - T1o);
TK = Tp + Ts;
TL = Tw + Tz;
TM = TK + TL;
TO = KP559016994 * (TK - TL);
}
}
/* Output stage.  Rp[0]/Rm[0] are stored without a twiddle factor. */
Rp[0] = T3 + Ti;
Rm[0] = TJ + TM;
{
/* Ip[2]/Im[2], using the twiddle pair W[8], W[9]. */
E T1g, T1q, T17, T1h;
T1g = T18 + T1f;
T1q = T1i + T1p;
T17 = W[8];
T1h = W[9];
Ip[WS(rs, 2)] = FNMS(T1h, T1q, T17 * T1g);
Im[WS(rs, 2)] = FMA(T1h, T1g, T17 * T1q);
}
{
/* The four remaining Rp/Rm pairs (elements 1, 3, 4, 2). */
E TB, TG, T11, TX, TP, T10, Tm, TW, TN, Tk;
TB = FNMS(KP951056516, TA, KP587785252 * Tt);
TG = FNMS(KP951056516, TF, KP587785252 * TE);
T11 = FMA(KP951056516, TE, KP587785252 * TF);
TX = FMA(KP951056516, Tt, KP587785252 * TA);
TN = FNMS(KP250000000, TM, TJ);
TP = TN - TO;
T10 = TO + TN;
Tk = FNMS(KP250000000, Ti, T3);
Tm = Tk - Tl;
TW = Tl + Tk;
{
/* Rp[1]/Rm[1], twiddle pair W[2], W[3]. */
E TC, TQ, Tj, TD;
TC = Tm - TB;
TQ = TG + TP;
Tj = W[2];
TD = W[3];
Rp[WS(rs, 1)] = FNMS(TD, TQ, Tj * TC);
Rm[WS(rs, 1)] = FMA(TD, TC, Tj * TQ);
}
{
/* Rp[3]/Rm[3], twiddle pair W[10], W[11]. */
E T14, T16, T13, T15;
T14 = TW - TX;
T16 = T11 + T10;
T13 = W[10];
T15 = W[11];
Rp[WS(rs, 3)] = FNMS(T15, T16, T13 * T14);
Rm[WS(rs, 3)] = FMA(T15, T14, T13 * T16);
}
{
/* Rp[4]/Rm[4], twiddle pair W[14], W[15]. */
E TS, TU, TR, TT;
TS = Tm + TB;
TU = TP - TG;
TR = W[14];
TT = W[15];
Rp[WS(rs, 4)] = FNMS(TT, TU, TR * TS);
Rm[WS(rs, 4)] = FMA(TT, TS, TR * TU);
}
{
/* Rp[2]/Rm[2], twiddle pair W[6], W[7]. */
E TY, T12, TV, TZ;
TY = TW + TX;
T12 = T10 - T11;
TV = W[6];
TZ = W[7];
Rp[WS(rs, 2)] = FNMS(TZ, T12, TV * TY);
Rm[WS(rs, 2)] = FMA(TZ, TY, TV * T12);
}
}
{
/* The four remaining Ip/Im pairs (elements 3, 4, 1, 0). */
E T1x, T1C, T1Q, T1N, T1F, T1R, T1u, T1M, T1D, T1s;
T1x = FNMS(KP951056516, T1w, KP587785252 * T1v);
T1C = FNMS(KP951056516, T1B, KP587785252 * T1A);
T1Q = FMA(KP951056516, T1A, KP587785252 * T1B);
T1N = FMA(KP951056516, T1v, KP587785252 * T1w);
T1D = FNMS(KP250000000, T1p, T1i);
T1F = T1D - T1E;
T1R = T1E + T1D;
T1s = FNMS(KP250000000, T1f, T18);
T1u = T1s - T1t;
T1M = T1t + T1s;
{
/* Ip[3]/Im[3], twiddle pair W[12], W[13]. */
E T1y, T1G, T1r, T1z;
T1y = T1u - T1x;
T1G = T1C + T1F;
T1r = W[12];
T1z = W[13];
Ip[WS(rs, 3)] = FNMS(T1z, T1G, T1r * T1y);
Im[WS(rs, 3)] = FMA(T1r, T1G, T1z * T1y);
}
{
/* Ip[4]/Im[4], twiddle pair W[16], W[17]. */
E T1U, T1W, T1T, T1V;
T1U = T1M + T1N;
T1W = T1R - T1Q;
T1T = W[16];
T1V = W[17];
Ip[WS(rs, 4)] = FNMS(T1V, T1W, T1T * T1U);
Im[WS(rs, 4)] = FMA(T1T, T1W, T1V * T1U);
}
{
/* Ip[1]/Im[1], twiddle pair W[4], W[5]. */
E T1I, T1K, T1H, T1J;
T1I = T1u + T1x;
T1K = T1F - T1C;
T1H = W[4];
T1J = W[5];
Ip[WS(rs, 1)] = FNMS(T1J, T1K, T1H * T1I);
Im[WS(rs, 1)] = FMA(T1H, T1K, T1J * T1I);
}
{
/* Ip[0]/Im[0], twiddle pair W[0], W[1]. */
E T1O, T1S, T1L, T1P;
T1O = T1M - T1N;
T1S = T1Q + T1R;
T1L = W[0];
T1P = W[1];
Ip[0] = FNMS(T1P, T1S, T1L * T1O);
Im[0] = FMA(T1L, T1S, T1P * T1O);
}
}
}
}
}
/*
 * Twiddle-instruction table handed to the descriptor below.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT is defined outside this
 * file; presumably "full twiddle set for radix 10" followed by a
 * terminator entry -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the hc2cb_10 codelet: the value 10, the codelet name,
 * the twiddle table above, the GENUS object and a four-entry count array.
 * NOTE(review): the count array presumably holds {additions,
 * multiplications, fused multiply/adds, other} as printed in the
 * generator header of this variant -- confirm, the header is not adjacent.
 */
static const hc2c_desc desc = { 10, "hc2cb_10", twinstr, &GENUS, { 72, 30, 30, 0 } };
/* Hands hc2cb_10 and its descriptor to X(khc2c_register) for planner p,
 * tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_10) (planner *p) {
X(khc2c_register) (p, hc2cb_10, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,597 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cb_12 -include rdft/scalar/hc2cb.h */
/*
* This function contains 118 FP additions, 68 FP multiplications,
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_12, FMA-oriented variant (genfft gen_hc2c output for -n 12 -dif
 * -sign 1).  Machine-generated: do not hand-edit the arithmetic.
 *
 * Each loop iteration works in place on 24 values: elements 0..5 (indexed
 * through WS(rs, k)) of each of Rp, Ip, Rm and Im.  All 24 inputs are read
 * into locals before the first store of the iteration.
 *
 *   Rp, Ip   advance by +ms after every iteration
 *   Rm, Im   advance by -ms after every iteration
 *   W        twiddle table; 22 entries are consumed per iteration and the
 *            table is entered at offset (mb - 1) * 22
 *   rs       stride passed to WS() for element indexing
 *   mb, me   the loop runs for m = mb, ..., me - 1
 *
 * Outputs: Rp[0] and Rm[0] are stored without a twiddle factor.  Every
 * other output pair has the shape
 *      first  = wa * x - wb * y
 *      second = wb * x + wa * y
 * i.e. the real and imaginary parts of (wa + i*wb) * (x + i*y), where
 * (Rp[k], Rm[k]) uses wa = W[4k-2], wb = W[4k-1] for k = 1..5 and
 * (Ip[k], Im[k]) uses wa = W[4k],   wb = W[4k+1] for k = 0..5.
 */
static void hc2cb_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* KP866025403 is sqrt(3)/2 and KP500000000 is 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
E T18, T20, T1b, T21, T1s, T2a, T1p, T29, TI, TN, TO, Tb, To, T1f, T23;
E T1i, T24, T1z, T2d, T1w, T2c, Tt, Ty, Tz, Tm, TD;
/* First input group: Rp[0], Rp[4], Rp[2], Rm[5], Rm[3], Rm[1] together
 * with Ip[0], Ip[4], Ip[2], Im[5], Im[3], Im[1]. */
{
E T1, TE, T6, TM, T4, T1o, TH, T17, T9, T1r, TL, T1a;
T1 = Rp[0];
TE = Ip[0];
T6 = Rm[WS(rs, 5)];
TM = Im[WS(rs, 5)];
{
E T2, T3, TF, TG;
T2 = Rp[WS(rs, 4)];
T3 = Rm[WS(rs, 3)];
T4 = T2 + T3;
T1o = T2 - T3;
TF = Ip[WS(rs, 4)];
TG = Im[WS(rs, 3)];
TH = TF - TG;
T17 = TF + TG;
}
{
E T7, T8, TJ, TK;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 2)];
T9 = T7 + T8;
T1r = T7 - T8;
TJ = Ip[WS(rs, 2)];
TK = Im[WS(rs, 1)];
TL = TJ - TK;
T1a = TJ + TK;
}
{
E T16, T19, T1q, T1n, T5, Ta;
T16 = FNMS(KP500000000, T4, T1);
T18 = FNMS(KP866025403, T17, T16);
T20 = FMA(KP866025403, T17, T16);
T19 = FNMS(KP500000000, T9, T6);
T1b = FMA(KP866025403, T1a, T19);
T21 = FNMS(KP866025403, T1a, T19);
T1q = FMA(KP500000000, TL, TM);
T1s = FNMS(KP866025403, T1r, T1q);
T2a = FMA(KP866025403, T1r, T1q);
T1n = FNMS(KP500000000, TH, TE);
T1p = FMA(KP866025403, T1o, T1n);
T29 = FNMS(KP866025403, T1o, T1n);
TI = TE + TH;
TN = TL - TM;
TO = TI - TN;
T5 = T1 + T4;
Ta = T6 + T9;
Tb = T5 + Ta;
To = T5 - Ta;
}
}
/* Second input group: Rp[3], Rp[1], Rp[5], Rm[2], Rm[4], Rm[0] together
 * with Ip[3], Ip[1], Ip[5], Im[2], Im[4], Im[0]. */
{
E Tc, Tp, Th, Tx, Tf, T1v, Ts, T1e, Tk, T1y, Tw, T1h;
Tc = Rp[WS(rs, 3)];
Tp = Ip[WS(rs, 3)];
Th = Rm[WS(rs, 2)];
Tx = Im[WS(rs, 2)];
{
E Td, Te, Tq, Tr;
Td = Rm[WS(rs, 4)];
Te = Rm[0];
Tf = Td + Te;
T1v = Td - Te;
Tq = Im[WS(rs, 4)];
Tr = Im[0];
Ts = Tq + Tr;
T1e = Tq - Tr;
}
{
E Ti, Tj, Tu, Tv;
Ti = Rp[WS(rs, 1)];
Tj = Rp[WS(rs, 5)];
Tk = Ti + Tj;
T1y = Ti - Tj;
Tu = Ip[WS(rs, 1)];
Tv = Ip[WS(rs, 5)];
Tw = Tu + Tv;
T1h = Tv - Tu;
}
{
E T1d, T1g, T1x, T1u, Tg, Tl;
T1d = FNMS(KP500000000, Tf, Tc);
T1f = FMA(KP866025403, T1e, T1d);
T23 = FNMS(KP866025403, T1e, T1d);
T1g = FNMS(KP500000000, Tk, Th);
T1i = FMA(KP866025403, T1h, T1g);
T24 = FNMS(KP866025403, T1h, T1g);
T1x = FMA(KP500000000, Tw, Tx);
T1z = FNMS(KP866025403, T1y, T1x);
T2d = FMA(KP866025403, T1y, T1x);
T1u = FMA(KP500000000, Ts, Tp);
T1w = FMA(KP866025403, T1v, T1u);
T2c = FNMS(KP866025403, T1v, T1u);
Tt = Tp - Ts;
Ty = Tw - Tx;
Tz = Tt - Ty;
Tg = Tc + Tf;
Tl = Th + Tk;
Tm = Tg + Tl;
TD = Tg - Tl;
}
}
/* Untwiddled output: the sum of all twelve Rp/Rm inputs. */
Rp[0] = Tb + Tm;
/* Ip[4] / Im[4], twiddled by W[16], W[17]. */
{
E TA, TP, TB, TQ, Tn, TC;
TA = To - Tz;
TP = TD + TO;
Tn = W[16];
TB = Tn * TA;
TQ = Tn * TP;
TC = W[17];
Ip[WS(rs, 4)] = FNMS(TC, TP, TB);
Im[WS(rs, 4)] = FMA(TC, TA, TQ);
}
/* Ip[1] / Im[1], twiddled by W[4], W[5]. */
{
E TS, TV, TT, TW, TR, TU;
TS = To + Tz;
TV = TO - TD;
TR = W[4];
TT = TR * TS;
TW = TR * TV;
TU = W[5];
Ip[WS(rs, 1)] = FNMS(TU, TV, TT);
Im[WS(rs, 1)] = FMA(TU, TS, TW);
}
/* Rm[0] (untwiddled: sum of the Ip inputs minus sum of the Im inputs)
 * and Rp[3] / Rm[3], twiddled by W[10], W[11]. */
{
E T11, T12, T13, TX, TZ, T10, T14, TY;
T11 = TI + TN;
T12 = Tt + Ty;
T13 = T11 - T12;
TY = Tb - Tm;
TX = W[10];
TZ = TX * TY;
T10 = W[11];
T14 = T10 * TY;
Rm[0] = T11 + T12;
Rm[WS(rs, 3)] = FMA(TX, T13, T14);
Rp[WS(rs, 3)] = FNMS(T10, T13, TZ);
}
/* Rp[5] / Rm[5] (W[18], W[19]) and Rp[2] / Rm[2] (W[6], W[7]). */
{
E T1k, T1E, T1B, T1H;
{
E T1c, T1j, T1t, T1A;
T1c = T18 + T1b;
T1j = T1f + T1i;
T1k = T1c - T1j;
T1E = T1c + T1j;
T1t = T1p - T1s;
T1A = T1w - T1z;
T1B = T1t - T1A;
T1H = T1t + T1A;
}
{
E T15, T1l, T1m, T1C;
T15 = W[18];
T1l = T15 * T1k;
T1m = W[19];
T1C = T1m * T1k;
Rp[WS(rs, 5)] = FNMS(T1m, T1B, T1l);
Rm[WS(rs, 5)] = FMA(T15, T1B, T1C);
}
{
E T1D, T1F, T1G, T1I;
T1D = W[6];
T1F = T1D * T1E;
T1G = W[7];
T1I = T1G * T1E;
Rp[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
Rm[WS(rs, 2)] = FMA(T1D, T1H, T1I);
}
}
/* Rp[1] / Rm[1] (W[2], W[3]) and Rp[4] / Rm[4] (W[14], W[15]). */
{
E T26, T2i, T2f, T2l;
{
E T22, T25, T2b, T2e;
T22 = T20 + T21;
T25 = T23 + T24;
T26 = T22 - T25;
T2i = T22 + T25;
T2b = T29 - T2a;
T2e = T2c - T2d;
T2f = T2b - T2e;
T2l = T2b + T2e;
}
{
E T1Z, T27, T28, T2g;
T1Z = W[2];
T27 = T1Z * T26;
T28 = W[3];
T2g = T28 * T26;
Rp[WS(rs, 1)] = FNMS(T28, T2f, T27);
Rm[WS(rs, 1)] = FMA(T1Z, T2f, T2g);
}
{
E T2h, T2j, T2k, T2m;
T2h = W[14];
T2j = T2h * T2i;
T2k = W[15];
T2m = T2k * T2i;
Rp[WS(rs, 4)] = FNMS(T2k, T2l, T2j);
Rm[WS(rs, 4)] = FMA(T2h, T2l, T2m);
}
}
/* Ip[2] / Im[2] (W[8], W[9]) and Ip[5] / Im[5] (W[20], W[21]). */
{
E T2q, T2y, T2v, T2B;
{
E T2o, T2p, T2t, T2u;
T2o = T20 - T21;
T2p = T2c + T2d;
T2q = T2o - T2p;
T2y = T2o + T2p;
T2t = T29 + T2a;
T2u = T23 - T24;
T2v = T2t + T2u;
T2B = T2t - T2u;
}
{
E T2r, T2w, T2n, T2s;
T2n = W[8];
T2r = T2n * T2q;
T2w = T2n * T2v;
T2s = W[9];
Ip[WS(rs, 2)] = FNMS(T2s, T2v, T2r);
Im[WS(rs, 2)] = FMA(T2s, T2q, T2w);
}
{
E T2z, T2C, T2x, T2A;
T2x = W[20];
T2z = T2x * T2y;
T2C = T2x * T2B;
T2A = W[21];
Ip[WS(rs, 5)] = FNMS(T2A, T2B, T2z);
Im[WS(rs, 5)] = FMA(T2A, T2y, T2C);
}
}
/* Ip[0] / Im[0] (W[0], W[1]) and Ip[3] / Im[3] (W[12], W[13]). */
{
E T1M, T1U, T1R, T1X;
{
E T1K, T1L, T1P, T1Q;
T1K = T18 - T1b;
T1L = T1w + T1z;
T1M = T1K - T1L;
T1U = T1K + T1L;
T1P = T1p + T1s;
T1Q = T1f - T1i;
T1R = T1P + T1Q;
T1X = T1P - T1Q;
}
{
E T1N, T1S, T1J, T1O;
T1J = W[0];
T1N = T1J * T1M;
T1S = T1J * T1R;
T1O = W[1];
Ip[0] = FNMS(T1O, T1R, T1N);
Im[0] = FMA(T1O, T1M, T1S);
}
{
E T1V, T1Y, T1T, T1W;
T1T = W[12];
T1V = T1T * T1U;
T1Y = T1T * T1X;
T1W = W[13];
Ip[WS(rs, 3)] = FNMS(T1W, T1X, T1V);
Im[WS(rs, 3)] = FMA(T1W, T1U, T1Y);
}
}
}
}
}
/*
 * Twiddle-instruction table handed to the descriptor below.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT is defined outside this
 * file; presumably "full twiddle set for radix 12" followed by a
 * terminator entry -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA variant of hc2cb_12: the value 12, the codelet
 * name, the twiddle table above, the GENUS object and a count array whose
 * first three entries (72, 22, 46) equal the additions, multiplications
 * and fused multiply/adds reported in this variant's generator header.
 */
static const hc2c_desc desc = { 12, "hc2cb_12", twinstr, &GENUS, { 72, 22, 46, 0 } };
/* Hands hc2cb_12 and its descriptor to X(khc2c_register) for planner p,
 * tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_12) (planner *p) {
X(khc2c_register) (p, hc2cb_12, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cb_12 -include rdft/scalar/hc2cb.h */
/*
* This function contains 118 FP additions, 60 FP multiplications,
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
* 39 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_12, non-FMA variant (genfft gen_hc2c output for -n 12 -dif
 * -sign 1).  Machine-generated: do not hand-edit the arithmetic.
 *
 * Each loop iteration works in place on 24 values: elements 0..5 (indexed
 * through WS(rs, k)) of each of Rp, Ip, Rm and Im.  All 24 inputs are read
 * into locals before the first store of the iteration.
 *
 *   Rp, Ip   advance by +ms after every iteration
 *   Rm, Im   advance by -ms after every iteration
 *   W        twiddle table; 22 entries are consumed per iteration and the
 *            table is entered at offset (mb - 1) * 22
 *   rs       stride passed to WS() for element indexing
 *   mb, me   the loop runs for m = mb, ..., me - 1
 *
 * Outputs: Rp[0] and Rm[0] are stored without a twiddle factor.  Every
 * other output pair has the shape
 *      first  = wa * x - wb * y
 *      second = wb * x + wa * y
 * i.e. the real and imaginary parts of (wa + i*wb) * (x + i*y), where
 * (Rp[k], Rm[k]) uses wa = W[4k-2], wb = W[4k-1] for k = 1..5 and
 * (Ip[k], Im[k]) uses wa = W[4k],   wb = W[4k+1] for k = 0..5.
 */
static void hc2cb_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* KP500000000 is 1/2 and KP866025403 is sqrt(3)/2. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
E T5, TH, T12, T1M, T1i, T1U, Tl, Ty, T1c, T1Y, T1s, T1Q, Ta, TM, T15;
E T1N, T1l, T1V, Tg, Tt, T19, T1X, T1p, T1P;
/* Inputs Rp[0], Rp[4], Rm[3] and Ip[0], Ip[4], Im[3]. */
{
E T1, TD, T4, T1g, TG, T11, T10, T1h;
T1 = Rp[0];
TD = Ip[0];
{
E T2, T3, TE, TF;
T2 = Rp[WS(rs, 4)];
T3 = Rm[WS(rs, 3)];
T4 = T2 + T3;
T1g = KP866025403 * (T2 - T3);
TE = Ip[WS(rs, 4)];
TF = Im[WS(rs, 3)];
TG = TE - TF;
T11 = KP866025403 * (TE + TF);
}
T5 = T1 + T4;
TH = TD + TG;
T10 = FNMS(KP500000000, T4, T1);
T12 = T10 - T11;
T1M = T10 + T11;
T1h = FNMS(KP500000000, TG, TD);
T1i = T1g + T1h;
T1U = T1h - T1g;
}
/* Inputs Rm[2], Rp[1], Rp[5] and Im[2], Ip[1], Ip[5]. */
{
E Th, Tx, Tk, T1a, Tw, T1r, T1b, T1q;
Th = Rm[WS(rs, 2)];
Tx = Im[WS(rs, 2)];
{
E Ti, Tj, Tu, Tv;
Ti = Rp[WS(rs, 1)];
Tj = Rp[WS(rs, 5)];
Tk = Ti + Tj;
T1a = KP866025403 * (Ti - Tj);
Tu = Ip[WS(rs, 1)];
Tv = Ip[WS(rs, 5)];
Tw = Tu + Tv;
T1r = KP866025403 * (Tv - Tu);
}
Tl = Th + Tk;
Ty = Tw - Tx;
T1b = FMA(KP500000000, Tw, Tx);
T1c = T1a - T1b;
T1Y = T1a + T1b;
T1q = FNMS(KP500000000, Tk, Th);
T1s = T1q + T1r;
T1Q = T1q - T1r;
}
/* Inputs Rm[5], Rm[1], Rp[2] and Im[5], Ip[2], Im[1]. */
{
E T6, TL, T9, T1j, TK, T14, T13, T1k;
T6 = Rm[WS(rs, 5)];
TL = Im[WS(rs, 5)];
{
E T7, T8, TI, TJ;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 2)];
T9 = T7 + T8;
T1j = KP866025403 * (T7 - T8);
TI = Ip[WS(rs, 2)];
TJ = Im[WS(rs, 1)];
TK = TI - TJ;
T14 = KP866025403 * (TI + TJ);
}
Ta = T6 + T9;
TM = TK - TL;
T13 = FNMS(KP500000000, T9, T6);
T15 = T13 + T14;
T1N = T13 - T14;
T1k = FMA(KP500000000, TK, TL);
T1l = T1j - T1k;
T1V = T1j + T1k;
}
/* Inputs Rp[3], Rm[4], Rm[0] and Ip[3], Im[4], Im[0]. */
{
E Tc, Tp, Tf, T17, Ts, T1o, T18, T1n;
Tc = Rp[WS(rs, 3)];
Tp = Ip[WS(rs, 3)];
{
E Td, Te, Tq, Tr;
Td = Rm[WS(rs, 4)];
Te = Rm[0];
Tf = Td + Te;
T17 = KP866025403 * (Td - Te);
Tq = Im[WS(rs, 4)];
Tr = Im[0];
Ts = Tq + Tr;
T1o = KP866025403 * (Tq - Tr);
}
Tg = Tc + Tf;
Tt = Tp - Ts;
T18 = FMA(KP500000000, Ts, Tp);
T19 = T17 + T18;
T1X = T18 - T17;
T1n = FNMS(KP500000000, Tf, Tc);
T1p = T1n + T1o;
T1P = T1n - T1o;
}
/* Untwiddled outputs Rp[0] (sum of all twelve Rp/Rm inputs) and Rm[0]
 * (sum of the Ip inputs minus sum of the Im inputs), followed by
 * Rp[3] / Rm[3] twiddled by W[10], W[11]. */
{
E Tb, Tm, TU, TW, TX, TY, TT, TV;
Tb = T5 + Ta;
Tm = Tg + Tl;
TU = Tb - Tm;
TW = TH + TM;
TX = Tt + Ty;
TY = TW - TX;
Rp[0] = Tb + Tm;
Rm[0] = TW + TX;
TT = W[10];
TV = W[11];
Rp[WS(rs, 3)] = FNMS(TV, TY, TT * TU);
Rm[WS(rs, 3)] = FMA(TV, TU, TT * TY);
}
/* Ip[4] / Im[4] (W[16], W[17]) and Ip[1] / Im[1] (W[4], W[5]). */
{
E TA, TQ, TO, TS;
{
E To, Tz, TC, TN;
To = T5 - Ta;
Tz = Tt - Ty;
TA = To - Tz;
TQ = To + Tz;
TC = Tg - Tl;
TN = TH - TM;
TO = TC + TN;
TS = TN - TC;
}
{
E Tn, TB, TP, TR;
Tn = W[16];
TB = W[17];
Ip[WS(rs, 4)] = FNMS(TB, TO, Tn * TA);
Im[WS(rs, 4)] = FMA(Tn, TO, TB * TA);
TP = W[4];
TR = W[5];
Ip[WS(rs, 1)] = FNMS(TR, TS, TP * TQ);
Im[WS(rs, 1)] = FMA(TP, TS, TR * TQ);
}
}
/* Ip[2] / Im[2] (W[8], W[9]) and Ip[5] / Im[5] (W[20], W[21]). */
{
E T28, T2e, T2c, T2g;
{
E T26, T27, T2a, T2b;
T26 = T1M - T1N;
T27 = T1X + T1Y;
T28 = T26 - T27;
T2e = T26 + T27;
T2a = T1U + T1V;
T2b = T1P - T1Q;
T2c = T2a + T2b;
T2g = T2a - T2b;
}
{
E T25, T29, T2d, T2f;
T25 = W[8];
T29 = W[9];
Ip[WS(rs, 2)] = FNMS(T29, T2c, T25 * T28);
Im[WS(rs, 2)] = FMA(T25, T2c, T29 * T28);
T2d = W[20];
T2f = W[21];
Ip[WS(rs, 5)] = FNMS(T2f, T2g, T2d * T2e);
Im[WS(rs, 5)] = FMA(T2d, T2g, T2f * T2e);
}
}
/* Rp[1] / Rm[1] (W[2], W[3]) and Rp[4] / Rm[4] (W[14], W[15]). */
{
E T1S, T22, T20, T24;
{
E T1O, T1R, T1W, T1Z;
T1O = T1M + T1N;
T1R = T1P + T1Q;
T1S = T1O - T1R;
T22 = T1O + T1R;
T1W = T1U - T1V;
T1Z = T1X - T1Y;
T20 = T1W - T1Z;
T24 = T1W + T1Z;
}
{
E T1L, T1T, T21, T23;
T1L = W[2];
T1T = W[3];
Rp[WS(rs, 1)] = FNMS(T1T, T20, T1L * T1S);
Rm[WS(rs, 1)] = FMA(T1T, T1S, T1L * T20);
T21 = W[14];
T23 = W[15];
Rp[WS(rs, 4)] = FNMS(T23, T24, T21 * T22);
Rm[WS(rs, 4)] = FMA(T23, T22, T21 * T24);
}
}
/* Rp[5] / Rm[5] (W[18], W[19]) and Rp[2] / Rm[2] (W[6], W[7]). */
{
E T1C, T1I, T1G, T1K;
{
E T1A, T1B, T1E, T1F;
T1A = T12 + T15;
T1B = T1p + T1s;
T1C = T1A - T1B;
T1I = T1A + T1B;
T1E = T1i + T1l;
T1F = T19 + T1c;
T1G = T1E - T1F;
T1K = T1E + T1F;
}
{
E T1z, T1D, T1H, T1J;
T1z = W[18];
T1D = W[19];
Rp[WS(rs, 5)] = FNMS(T1D, T1G, T1z * T1C);
Rm[WS(rs, 5)] = FMA(T1D, T1C, T1z * T1G);
T1H = W[6];
T1J = W[7];
Rp[WS(rs, 2)] = FNMS(T1J, T1K, T1H * T1I);
Rm[WS(rs, 2)] = FMA(T1J, T1I, T1H * T1K);
}
}
/* Ip[0] / Im[0] (W[0], W[1]) and Ip[3] / Im[3] (W[12], W[13]). */
{
E T1e, T1w, T1u, T1y;
{
E T16, T1d, T1m, T1t;
T16 = T12 - T15;
T1d = T19 - T1c;
T1e = T16 - T1d;
T1w = T16 + T1d;
T1m = T1i - T1l;
T1t = T1p - T1s;
T1u = T1m + T1t;
T1y = T1m - T1t;
}
{
E TZ, T1f, T1v, T1x;
TZ = W[0];
T1f = W[1];
Ip[0] = FNMS(T1f, T1u, TZ * T1e);
Im[0] = FMA(TZ, T1u, T1f * T1e);
T1v = W[12];
T1x = W[13];
Ip[WS(rs, 3)] = FNMS(T1x, T1y, T1v * T1w);
Im[WS(rs, 3)] = FMA(T1v, T1y, T1x * T1w);
}
}
}
}
}
/*
 * Twiddle-instruction table handed to the descriptor below.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT is defined outside this
 * file; presumably "full twiddle set for radix 12" followed by a
 * terminator entry -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA variant of hc2cb_12: the value 12, the
 * codelet name, the twiddle table above, the GENUS object and a count
 * array whose first three entries (88, 30, 30) equal the additions,
 * multiplications and fused multiply/adds reported in this variant's
 * generator header.
 */
static const hc2c_desc desc = { 12, "hc2cb_12", twinstr, &GENUS, { 88, 30, 30, 0 } };
/* Hands hc2cb_12 and its descriptor to X(khc2c_register) for planner p,
 * tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_12) (planner *p) {
X(khc2c_register) (p, hc2cb_12, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,833 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cb_16 -include rdft/scalar/hc2cb.h */
/*
* This function contains 174 FP additions, 100 FP multiplications,
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
* 63 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_16, FMA-oriented variant (genfft gen_hc2c output for -n 16 -dif
 * -sign 1).  Machine-generated: do not hand-edit the arithmetic.
 *
 * Each loop iteration works in place on 32 values: elements 0..7 (indexed
 * through WS(rs, k)) of each of Rp, Ip, Rm and Im.  All 32 inputs are read
 * into locals before the first store of the iteration.
 *
 *   Rp, Ip   advance by +ms after every iteration
 *   Rm, Im   advance by -ms after every iteration
 *   W        twiddle table; 30 entries are consumed per iteration and the
 *            table is entered at offset (mb - 1) * 30
 *   rs       stride passed to WS() for element indexing
 *   mb, me   the loop runs for m = mb, ..., me - 1
 *
 * Outputs: Rp[0] and Rm[0] are stored without a twiddle factor.  Every
 * other output pair has the shape
 *      first  = wa * x - wb * y
 *      second = wb * x + wa * y
 * i.e. the real and imaginary parts of (wa + i*wb) * (x + i*y), where
 * (Rp[k], Rm[k]) uses wa = W[4k-2], wb = W[4k-1] for k = 1..7 and
 * (Ip[k], Im[k]) uses wa = W[4k],   wb = W[4k+1] for k = 0..7.
 */
static void hc2cb_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* KP923879532 is cos(pi/8), KP707106781 is sqrt(2)/2 and KP414213562 is
 * sqrt(2) - 1 (= tan(pi/8)). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
E TA, T1O, T21, T1h, T2P, T2S, T3b, T3p, T3q, T3D, T1k, T1P, Tf, T3y, T2A;
E T36, TL, T22, T3s, T3t, T3z, T2F, T2U, T2K, T2V, Tu, T3E, TX, T1n, T1T;
E T24, T1W, T25, T18, T1m;
/* First input group: Rp[0], Rp[4], Rp[2], Rp[6], Rm[7], Rm[3], Rm[5],
 * Rm[1] and the Ip/Im entries at the same index pairs. */
{
E T3, Tw, T1g, T2Q, T6, T1d, Tz, T2R, Ta, TB, TE, T2y, Td, TG, TJ;
E T2x;
{
E T1, T2, T1e, T1f;
T1 = Rp[0];
T2 = Rm[WS(rs, 7)];
T3 = T1 + T2;
Tw = T1 - T2;
T1e = Ip[0];
T1f = Im[WS(rs, 7)];
T1g = T1e + T1f;
T2Q = T1e - T1f;
}
{
E T4, T5, Tx, Ty;
T4 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 3)];
T6 = T4 + T5;
T1d = T4 - T5;
Tx = Ip[WS(rs, 4)];
Ty = Im[WS(rs, 3)];
Tz = Tx + Ty;
T2R = Tx - Ty;
}
{
E T8, T9, TC, TD;
T8 = Rp[WS(rs, 2)];
T9 = Rm[WS(rs, 5)];
Ta = T8 + T9;
TB = T8 - T9;
TC = Ip[WS(rs, 2)];
TD = Im[WS(rs, 5)];
TE = TC + TD;
T2y = TC - TD;
}
{
E Tb, Tc, TH, TI;
Tb = Rm[WS(rs, 1)];
Tc = Rp[WS(rs, 6)];
Td = Tb + Tc;
TG = Tb - Tc;
TH = Ip[WS(rs, 6)];
TI = Im[WS(rs, 1)];
TJ = TH + TI;
T2x = TH - TI;
}
TA = Tw - Tz;
T1O = Tw + Tz;
T21 = T1g - T1d;
T1h = T1d + T1g;
T2P = Ta - Td;
T2S = T2Q - T2R;
T3b = T2S - T2P;
{
E T1i, T1j, T7, Te;
T3p = T2Q + T2R;
T3q = T2y + T2x;
T3D = T3p - T3q;
T1i = TB + TE;
T1j = TG + TJ;
T1k = T1i - T1j;
T1P = T1i + T1j;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
T3y = T7 - Te;
{
E T2w, T2z, TF, TK;
T2w = T3 - T6;
T2z = T2x - T2y;
T2A = T2w + T2z;
T36 = T2w - T2z;
TF = TB - TE;
TK = TG - TJ;
TL = TF + TK;
T22 = TF - TK;
}
}
}
/* Second input group: Rp[1], Rp[5], Rp[7], Rp[3], Rm[6], Rm[2], Rm[0],
 * Rm[4] and the Ip/Im entries at the same index pairs. */
{
E Ti, T13, T11, T2C, Tl, TY, T16, T2D, Tp, TS, TQ, T2H, Ts, TN, TV;
E T2I, T2B, T2E;
{
E Tg, Th, TZ, T10;
Tg = Rp[WS(rs, 1)];
Th = Rm[WS(rs, 6)];
Ti = Tg + Th;
T13 = Tg - Th;
TZ = Ip[WS(rs, 1)];
T10 = Im[WS(rs, 6)];
T11 = TZ + T10;
T2C = TZ - T10;
}
{
E Tj, Tk, T14, T15;
Tj = Rp[WS(rs, 5)];
Tk = Rm[WS(rs, 2)];
Tl = Tj + Tk;
TY = Tj - Tk;
T14 = Ip[WS(rs, 5)];
T15 = Im[WS(rs, 2)];
T16 = T14 + T15;
T2D = T14 - T15;
}
{
E Tn, To, TO, TP;
Tn = Rm[0];
To = Rp[WS(rs, 7)];
Tp = Tn + To;
TS = Tn - To;
TO = Ip[WS(rs, 7)];
TP = Im[0];
TQ = TO + TP;
T2H = TO - TP;
}
{
E Tq, Tr, TT, TU;
Tq = Rp[WS(rs, 3)];
Tr = Rm[WS(rs, 4)];
Ts = Tq + Tr;
TN = Tq - Tr;
TT = Ip[WS(rs, 3)];
TU = Im[WS(rs, 4)];
TV = TT + TU;
T2I = TT - TU;
}
T3s = T2C + T2D;
T3t = T2H + T2I;
T3z = T3t - T3s;
T2B = Ti - Tl;
T2E = T2C - T2D;
T2F = T2B - T2E;
T2U = T2B + T2E;
{
E T2G, T2J, Tm, Tt;
T2G = Tp - Ts;
T2J = T2H - T2I;
T2K = T2G + T2J;
T2V = T2J - T2G;
Tm = Ti + Tl;
Tt = Tp + Ts;
Tu = Tm + Tt;
T3E = Tm - Tt;
}
{
E TR, TW, T1R, T1S;
TR = TN - TQ;
TW = TS - TV;
TX = FNMS(KP414213562, TW, TR);
T1n = FMA(KP414213562, TR, TW);
T1R = T11 - TY;
T1S = T13 + T16;
T1T = FNMS(KP414213562, T1S, T1R);
T24 = FMA(KP414213562, T1R, T1S);
}
{
E T1U, T1V, T12, T17;
T1U = TN + TQ;
T1V = TS + TV;
T1W = FNMS(KP414213562, T1V, T1U);
T25 = FMA(KP414213562, T1U, T1V);
T12 = TY + T11;
T17 = T13 - T16;
T18 = FMA(KP414213562, T17, T12);
T1m = FNMS(KP414213562, T12, T17);
}
}
/* Untwiddled output: the sum of all sixteen Rp/Rm inputs. */
Rp[0] = Tf + Tu;
/* Rm[0] (untwiddled: sum of the Ip inputs minus sum of the Im inputs)
 * and Rp[4] / Rm[4], twiddled by W[14], W[15]. */
{
E T3r, T3u, T3v, T3l, T3n, T3o, T3w, T3m;
T3r = T3p + T3q;
T3u = T3s + T3t;
T3v = T3r - T3u;
T3m = Tf - Tu;
T3l = W[14];
T3n = T3l * T3m;
T3o = W[15];
T3w = T3o * T3m;
Rm[0] = T3r + T3u;
Rm[WS(rs, 4)] = FMA(T3l, T3v, T3w);
Rp[WS(rs, 4)] = FNMS(T3o, T3v, T3n);
}
/* Rp[6] / Rm[6], twiddled by W[22], W[23]. */
{
E T3A, T3F, T3B, T3G, T3x, T3C;
T3A = T3y - T3z;
T3F = T3D - T3E;
T3x = W[22];
T3B = T3x * T3A;
T3G = T3x * T3F;
T3C = W[23];
Rp[WS(rs, 6)] = FNMS(T3C, T3F, T3B);
Rm[WS(rs, 6)] = FMA(T3C, T3A, T3G);
}
/* Rp[2] / Rm[2], twiddled by W[6], W[7]. */
{
E T3I, T3L, T3J, T3M, T3H, T3K;
T3I = T3y + T3z;
T3L = T3E + T3D;
T3H = W[6];
T3J = T3H * T3I;
T3M = T3H * T3L;
T3K = W[7];
Rp[WS(rs, 2)] = FNMS(T3K, T3L, T3J);
Rm[WS(rs, 2)] = FMA(T3K, T3I, T3M);
}
/* Rp[7] / Rm[7] (W[26], W[27]) and Rp[3] / Rm[3] (W[10], W[11]). */
{
E T38, T3g, T3d, T3j, T37, T3c;
T37 = T2V - T2U;
T38 = FNMS(KP707106781, T37, T36);
T3g = FMA(KP707106781, T37, T36);
T3c = T2F - T2K;
T3d = FNMS(KP707106781, T3c, T3b);
T3j = FMA(KP707106781, T3c, T3b);
{
E T39, T3e, T35, T3a;
T35 = W[26];
T39 = T35 * T38;
T3e = T35 * T3d;
T3a = W[27];
Rp[WS(rs, 7)] = FNMS(T3a, T3d, T39);
Rm[WS(rs, 7)] = FMA(T3a, T38, T3e);
}
{
E T3h, T3k, T3f, T3i;
T3f = W[10];
T3h = T3f * T3g;
T3k = T3f * T3j;
T3i = W[11];
Rp[WS(rs, 3)] = FNMS(T3i, T3j, T3h);
Rm[WS(rs, 3)] = FMA(T3i, T3g, T3k);
}
}
/* Rp[5] / Rm[5] (W[18], W[19]) and Rp[1] / Rm[1] (W[2], W[3]). */
{
E T2M, T30, T2X, T33, T2L, T2T, T2W;
T2L = T2F + T2K;
T2M = FNMS(KP707106781, T2L, T2A);
T30 = FMA(KP707106781, T2L, T2A);
T2T = T2P + T2S;
T2W = T2U + T2V;
T2X = FNMS(KP707106781, T2W, T2T);
T33 = FMA(KP707106781, T2W, T2T);
{
E T2v, T2N, T2O, T2Y;
T2v = W[18];
T2N = T2v * T2M;
T2O = W[19];
T2Y = T2O * T2M;
Rp[WS(rs, 5)] = FNMS(T2O, T2X, T2N);
Rm[WS(rs, 5)] = FMA(T2v, T2X, T2Y);
}
{
E T2Z, T31, T32, T34;
T2Z = W[2];
T31 = T2Z * T30;
T32 = W[3];
T34 = T32 * T30;
Rp[WS(rs, 1)] = FNMS(T32, T33, T31);
Rm[WS(rs, 1)] = FMA(T2Z, T33, T34);
}
}
/* Ip[5] / Im[5] (W[20], W[21]) and Ip[1] / Im[1] (W[4], W[5]). */
{
E T1Y, T2a, T27, T2d;
{
E T1Q, T1X, T23, T26;
T1Q = FNMS(KP707106781, T1P, T1O);
T1X = T1T + T1W;
T1Y = FMA(KP923879532, T1X, T1Q);
T2a = FNMS(KP923879532, T1X, T1Q);
T23 = FMA(KP707106781, T22, T21);
T26 = T24 - T25;
T27 = FNMS(KP923879532, T26, T23);
T2d = FMA(KP923879532, T26, T23);
}
{
E T1N, T1Z, T20, T28;
T1N = W[20];
T1Z = T1N * T1Y;
T20 = W[21];
T28 = T20 * T1Y;
Ip[WS(rs, 5)] = FNMS(T20, T27, T1Z);
Im[WS(rs, 5)] = FMA(T1N, T27, T28);
}
{
E T29, T2b, T2c, T2e;
T29 = W[4];
T2b = T29 * T2a;
T2c = W[5];
T2e = T2c * T2a;
Ip[WS(rs, 1)] = FNMS(T2c, T2d, T2b);
Im[WS(rs, 1)] = FMA(T29, T2d, T2e);
}
}
/* Ip[6] / Im[6] (W[24], W[25]) and Ip[2] / Im[2] (W[8], W[9]). */
{
E T1a, T1s, T1p, T1v;
{
E TM, T19, T1l, T1o;
TM = FNMS(KP707106781, TL, TA);
T19 = TX - T18;
T1a = FNMS(KP923879532, T19, TM);
T1s = FMA(KP923879532, T19, TM);
T1l = FNMS(KP707106781, T1k, T1h);
T1o = T1m - T1n;
T1p = FNMS(KP923879532, T1o, T1l);
T1v = FMA(KP923879532, T1o, T1l);
}
{
E Tv, T1b, T1c, T1q;
Tv = W[24];
T1b = Tv * T1a;
T1c = W[25];
T1q = T1c * T1a;
Ip[WS(rs, 6)] = FNMS(T1c, T1p, T1b);
Im[WS(rs, 6)] = FMA(Tv, T1p, T1q);
}
{
E T1r, T1t, T1u, T1w;
T1r = W[8];
T1t = T1r * T1s;
T1u = W[9];
T1w = T1u * T1s;
Ip[WS(rs, 2)] = FNMS(T1u, T1v, T1t);
Im[WS(rs, 2)] = FMA(T1r, T1v, T1w);
}
}
/* Ip[3] / Im[3] (W[12], W[13]) and Ip[7] / Im[7] (W[28], W[29]). */
{
E T2i, T2q, T2n, T2t;
{
E T2g, T2h, T2l, T2m;
T2g = FMA(KP707106781, T1P, T1O);
T2h = T24 + T25;
T2i = FNMS(KP923879532, T2h, T2g);
T2q = FMA(KP923879532, T2h, T2g);
T2l = FNMS(KP707106781, T22, T21);
T2m = T1W - T1T;
T2n = FMA(KP923879532, T2m, T2l);
T2t = FNMS(KP923879532, T2m, T2l);
}
{
E T2j, T2o, T2f, T2k;
T2f = W[12];
T2j = T2f * T2i;
T2o = T2f * T2n;
T2k = W[13];
Ip[WS(rs, 3)] = FNMS(T2k, T2n, T2j);
Im[WS(rs, 3)] = FMA(T2k, T2i, T2o);
}
{
E T2r, T2u, T2p, T2s;
T2p = W[28];
T2r = T2p * T2q;
T2u = T2p * T2t;
T2s = W[29];
Ip[WS(rs, 7)] = FNMS(T2s, T2t, T2r);
Im[WS(rs, 7)] = FMA(T2s, T2q, T2u);
}
}
/* Ip[4] / Im[4] (W[16], W[17]) and Ip[0] / Im[0] (W[0], W[1]). */
{
E T1A, T1I, T1F, T1L;
{
E T1y, T1z, T1D, T1E;
T1y = FMA(KP707106781, TL, TA);
T1z = T1m + T1n;
T1A = FNMS(KP923879532, T1z, T1y);
T1I = FMA(KP923879532, T1z, T1y);
T1D = FMA(KP707106781, T1k, T1h);
T1E = T18 + TX;
T1F = FNMS(KP923879532, T1E, T1D);
T1L = FMA(KP923879532, T1E, T1D);
}
{
E T1B, T1G, T1x, T1C;
T1x = W[16];
T1B = T1x * T1A;
T1G = T1x * T1F;
T1C = W[17];
Ip[WS(rs, 4)] = FNMS(T1C, T1F, T1B);
Im[WS(rs, 4)] = FMA(T1C, T1A, T1G);
}
{
E T1J, T1M, T1H, T1K;
T1H = W[0];
T1J = T1H * T1I;
T1M = T1H * T1L;
T1K = W[1];
Ip[0] = FNMS(T1K, T1L, T1J);
Im[0] = FMA(T1K, T1I, T1M);
}
}
}
}
}
/*
 * Twiddle-instruction table handed to the descriptor below.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT is defined outside this
 * file; presumably "full twiddle set for radix 16" followed by a
 * terminator entry -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA variant of hc2cb_16: the value 16, the codelet
 * name, the twiddle table above, the GENUS object and a count array whose
 * first three entries (104, 30, 70) equal the additions, multiplications
 * and fused multiply/adds reported in this variant's generator header.
 */
static const hc2c_desc desc = { 16, "hc2cb_16", twinstr, &GENUS, { 104, 30, 70, 0 } };
/* Hands hc2cb_16 and its descriptor to X(khc2c_register) for planner p,
 * tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_16) (planner *p) {
X(khc2c_register) (p, hc2cb_16, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cb_16 -include rdft/scalar/hc2cb.h */
/*
* This function contains 174 FP additions, 84 FP multiplications,
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
* 50 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_16, non-FMA variant (genfft gen_hc2c output for -n 16 -dif
 * -sign 1).  Machine-generated: do not hand-edit the arithmetic.
 *
 * Each loop iteration works in place on 32 values: elements 0..7 (indexed
 * through WS(rs, k)) of each of Rp, Ip, Rm and Im.  All 32 inputs are read
 * into locals before the first store of the iteration.
 *
 *   Rp, Ip   advance by +ms after every iteration
 *   Rm, Im   advance by -ms after every iteration
 *   W        twiddle table; 30 entries are consumed per iteration and the
 *            table is entered at offset (mb - 1) * 30
 *   rs       stride passed to WS() for element indexing
 *   mb, me   the loop runs for m = mb, ..., me - 1
 *
 * Outputs: Rp[0] and Rm[0] are stored without a twiddle factor.  Every
 * other output pair has the shape
 *      first  = wa * x - wb * y
 *      second = wb * x + wa * y
 * i.e. the real and imaginary parts of (wa + i*wb) * (x + i*y), where
 * (Rp[k], Rm[k]) uses wa = W[4k-2], wb = W[4k-1] for k = 1..7 and
 * (Ip[k], Im[k]) uses wa = W[4k],   wb = W[4k+1] for k = 0..7.
 */
static void hc2cb_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* KP382683432 is sin(pi/8), KP923879532 is cos(pi/8) and KP707106781 is
 * sqrt(2)/2. */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
E T7, T2K, T2W, Tw, T17, T1S, T2k, T1w, Te, TD, T1x, T10, T2n, T2L, T1Z;
E T2X, Tm, T1z, TN, T19, T2e, T2p, T2P, T2Z, Tt, T1A, TW, T1a, T27, T2q;
E T2S, T30;
/* Inputs Rp[0], Rm[7], Rp[4], Rm[3] and Ip[0], Im[7], Ip[4], Im[3]. */
{
E T3, T1Q, T13, T2j, T6, T2i, T16, T1R;
{
E T1, T2, T11, T12;
T1 = Rp[0];
T2 = Rm[WS(rs, 7)];
T3 = T1 + T2;
T1Q = T1 - T2;
T11 = Ip[0];
T12 = Im[WS(rs, 7)];
T13 = T11 - T12;
T2j = T11 + T12;
}
{
E T4, T5, T14, T15;
T4 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 3)];
T6 = T4 + T5;
T2i = T4 - T5;
T14 = Ip[WS(rs, 4)];
T15 = Im[WS(rs, 3)];
T16 = T14 - T15;
T1R = T14 + T15;
}
T7 = T3 + T6;
T2K = T1Q + T1R;
T2W = T2j - T2i;
Tw = T3 - T6;
T17 = T13 - T16;
T1S = T1Q - T1R;
T2k = T2i + T2j;
T1w = T13 + T16;
}
/* Inputs Rp[2], Rm[5], Rm[1], Rp[6] and Ip[2], Im[5], Ip[6], Im[1]. */
{
E Ta, T1T, TC, T1U, Td, T1W, Tz, T1X;
{
E T8, T9, TA, TB;
T8 = Rp[WS(rs, 2)];
T9 = Rm[WS(rs, 5)];
Ta = T8 + T9;
T1T = T8 - T9;
TA = Ip[WS(rs, 2)];
TB = Im[WS(rs, 5)];
TC = TA - TB;
T1U = TA + TB;
}
{
E Tb, Tc, Tx, Ty;
Tb = Rm[WS(rs, 1)];
Tc = Rp[WS(rs, 6)];
Td = Tb + Tc;
T1W = Tb - Tc;
Tx = Ip[WS(rs, 6)];
Ty = Im[WS(rs, 1)];
Tz = Tx - Ty;
T1X = Tx + Ty;
}
Te = Ta + Td;
TD = Tz - TC;
T1x = TC + Tz;
T10 = Ta - Td;
{
E T2l, T2m, T1V, T1Y;
T2l = T1T + T1U;
T2m = T1W + T1X;
T2n = KP707106781 * (T2l - T2m);
T2L = KP707106781 * (T2l + T2m);
T1V = T1T - T1U;
T1Y = T1W - T1X;
T1Z = KP707106781 * (T1V + T1Y);
T2X = KP707106781 * (T1V - T1Y);
}
}
/* Inputs Rp[1], Rm[6], Rp[5], Rm[2] and Ip[1], Im[6], Ip[5], Im[2]. */
{
E Ti, T2b, TI, T29, Tl, T28, TL, T2c, TF, TM;
{
E Tg, Th, TG, TH;
Tg = Rp[WS(rs, 1)];
Th = Rm[WS(rs, 6)];
Ti = Tg + Th;
T2b = Tg - Th;
TG = Ip[WS(rs, 1)];
TH = Im[WS(rs, 6)];
TI = TG - TH;
T29 = TG + TH;
}
{
E Tj, Tk, TJ, TK;
Tj = Rp[WS(rs, 5)];
Tk = Rm[WS(rs, 2)];
Tl = Tj + Tk;
T28 = Tj - Tk;
TJ = Ip[WS(rs, 5)];
TK = Im[WS(rs, 2)];
TL = TJ - TK;
T2c = TJ + TK;
}
Tm = Ti + Tl;
T1z = TI + TL;
TF = Ti - Tl;
TM = TI - TL;
TN = TF - TM;
T19 = TF + TM;
{
E T2a, T2d, T2N, T2O;
T2a = T28 + T29;
T2d = T2b - T2c;
T2e = FMA(KP923879532, T2a, KP382683432 * T2d);
T2p = FNMS(KP382683432, T2a, KP923879532 * T2d);
T2N = T2b + T2c;
T2O = T29 - T28;
T2P = FNMS(KP923879532, T2O, KP382683432 * T2N);
T2Z = FMA(KP382683432, T2O, KP923879532 * T2N);
}
}
/* Inputs Rm[0], Rp[7], Rp[3], Rm[4] and Ip[7], Im[0], Ip[3], Im[4]. */
{
E Tp, T24, TR, T22, Ts, T21, TU, T25, TO, TV;
{
E Tn, To, TP, TQ;
Tn = Rm[0];
To = Rp[WS(rs, 7)];
Tp = Tn + To;
T24 = Tn - To;
TP = Ip[WS(rs, 7)];
TQ = Im[0];
TR = TP - TQ;
T22 = TP + TQ;
}
{
E Tq, Tr, TS, TT;
Tq = Rp[WS(rs, 3)];
Tr = Rm[WS(rs, 4)];
Ts = Tq + Tr;
T21 = Tq - Tr;
TS = Ip[WS(rs, 3)];
TT = Im[WS(rs, 4)];
TU = TS - TT;
T25 = TS + TT;
}
Tt = Tp + Ts;
T1A = TR + TU;
TO = Tp - Ts;
TV = TR - TU;
TW = TO + TV;
T1a = TV - TO;
{
E T23, T26, T2Q, T2R;
T23 = T21 - T22;
T26 = T24 - T25;
T27 = FNMS(KP382683432, T26, KP923879532 * T23);
T2q = FMA(KP382683432, T23, KP923879532 * T26);
T2Q = T24 + T25;
T2R = T21 + T22;
T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
T30 = FMA(KP382683432, T2R, KP923879532 * T2Q);
}
}
/* Untwiddled outputs Rp[0] (sum of all sixteen Rp/Rm inputs) and Rm[0]
 * (sum of the Ip inputs minus sum of the Im inputs), followed by
 * Rp[4] / Rm[4] twiddled by W[14], W[15]. */
{
E Tf, Tu, T1u, T1y, T1B, T1C, T1t, T1v;
Tf = T7 + Te;
Tu = Tm + Tt;
T1u = Tf - Tu;
T1y = T1w + T1x;
T1B = T1z + T1A;
T1C = T1y - T1B;
Rp[0] = Tf + Tu;
Rm[0] = T1y + T1B;
T1t = W[14];
T1v = W[15];
Rp[WS(rs, 4)] = FNMS(T1v, T1C, T1t * T1u);
Rm[WS(rs, 4)] = FMA(T1v, T1u, T1t * T1C);
}
/* Ip[5] / Im[5] (W[20], W[21]) and Ip[1] / Im[1] (W[4], W[5]). */
{
E T2U, T34, T32, T36;
{
E T2M, T2T, T2Y, T31;
T2M = T2K - T2L;
T2T = T2P + T2S;
T2U = T2M - T2T;
T34 = T2M + T2T;
T2Y = T2W + T2X;
T31 = T2Z - T30;
T32 = T2Y - T31;
T36 = T2Y + T31;
}
{
E T2J, T2V, T33, T35;
T2J = W[20];
T2V = W[21];
Ip[WS(rs, 5)] = FNMS(T2V, T32, T2J * T2U);
Im[WS(rs, 5)] = FMA(T2V, T2U, T2J * T32);
T33 = W[4];
T35 = W[5];
Ip[WS(rs, 1)] = FNMS(T35, T36, T33 * T34);
Im[WS(rs, 1)] = FMA(T35, T34, T33 * T36);
}
}
/* Ip[3] / Im[3] (W[12], W[13]) and Ip[7] / Im[7] (W[28], W[29]). */
{
E T3a, T3g, T3e, T3i;
{
E T38, T39, T3c, T3d;
T38 = T2K + T2L;
T39 = T2Z + T30;
T3a = T38 - T39;
T3g = T38 + T39;
T3c = T2W - T2X;
T3d = T2P - T2S;
T3e = T3c + T3d;
T3i = T3c - T3d;
}
{
E T37, T3b, T3f, T3h;
T37 = W[12];
T3b = W[13];
Ip[WS(rs, 3)] = FNMS(T3b, T3e, T37 * T3a);
Im[WS(rs, 3)] = FMA(T37, T3e, T3b * T3a);
T3f = W[28];
T3h = W[29];
Ip[WS(rs, 7)] = FNMS(T3h, T3i, T3f * T3g);
Im[WS(rs, 7)] = FMA(T3f, T3i, T3h * T3g);
}
}
/* Rp[5] / Rm[5] (W[18], W[19]) and Rp[1] / Rm[1] (W[2], W[3]). */
{
E TY, T1e, T1c, T1g;
{
E TE, TX, T18, T1b;
TE = Tw + TD;
TX = KP707106781 * (TN + TW);
TY = TE - TX;
T1e = TE + TX;
T18 = T10 + T17;
T1b = KP707106781 * (T19 + T1a);
T1c = T18 - T1b;
T1g = T18 + T1b;
}
{
E Tv, TZ, T1d, T1f;
Tv = W[18];
TZ = W[19];
Rp[WS(rs, 5)] = FNMS(TZ, T1c, Tv * TY);
Rm[WS(rs, 5)] = FMA(TZ, TY, Tv * T1c);
T1d = W[2];
T1f = W[3];
Rp[WS(rs, 1)] = FNMS(T1f, T1g, T1d * T1e);
Rm[WS(rs, 1)] = FMA(T1f, T1e, T1d * T1g);
}
}
/* Rp[7] / Rm[7] (W[26], W[27]) and Rp[3] / Rm[3] (W[10], W[11]). */
{
E T1k, T1q, T1o, T1s;
{
E T1i, T1j, T1m, T1n;
T1i = Tw - TD;
T1j = KP707106781 * (T1a - T19);
T1k = T1i - T1j;
T1q = T1i + T1j;
T1m = T17 - T10;
T1n = KP707106781 * (TN - TW);
T1o = T1m - T1n;
T1s = T1m + T1n;
}
{
E T1h, T1l, T1p, T1r;
T1h = W[26];
T1l = W[27];
Rp[WS(rs, 7)] = FNMS(T1l, T1o, T1h * T1k);
Rm[WS(rs, 7)] = FMA(T1h, T1o, T1l * T1k);
T1p = W[10];
T1r = W[11];
Rp[WS(rs, 3)] = FNMS(T1r, T1s, T1p * T1q);
Rm[WS(rs, 3)] = FMA(T1p, T1s, T1r * T1q);
}
}
/* Ip[6] / Im[6] (W[24], W[25]) and Ip[2] / Im[2] (W[8], W[9]). */
{
E T2g, T2u, T2s, T2w;
{
E T20, T2f, T2o, T2r;
T20 = T1S - T1Z;
T2f = T27 - T2e;
T2g = T20 - T2f;
T2u = T20 + T2f;
T2o = T2k - T2n;
T2r = T2p - T2q;
T2s = T2o - T2r;
T2w = T2o + T2r;
}
{
E T1P, T2h, T2t, T2v;
T1P = W[24];
T2h = W[25];
Ip[WS(rs, 6)] = FNMS(T2h, T2s, T1P * T2g);
Im[WS(rs, 6)] = FMA(T2h, T2g, T1P * T2s);
T2t = W[8];
T2v = W[9];
Ip[WS(rs, 2)] = FNMS(T2v, T2w, T2t * T2u);
Im[WS(rs, 2)] = FMA(T2v, T2u, T2t * T2w);
}
}
/* Ip[4] / Im[4] (W[16], W[17]) and Ip[0] / Im[0] (W[0], W[1]). */
{
E T2A, T2G, T2E, T2I;
{
E T2y, T2z, T2C, T2D;
T2y = T1S + T1Z;
T2z = T2p + T2q;
T2A = T2y - T2z;
T2G = T2y + T2z;
T2C = T2k + T2n;
T2D = T2e + T27;
T2E = T2C - T2D;
T2I = T2C + T2D;
}
{
E T2x, T2B, T2F, T2H;
T2x = W[16];
T2B = W[17];
Ip[WS(rs, 4)] = FNMS(T2B, T2E, T2x * T2A);
Im[WS(rs, 4)] = FMA(T2x, T2E, T2B * T2A);
T2F = W[0];
T2H = W[1];
Ip[0] = FNMS(T2H, T2I, T2F * T2G);
Im[0] = FMA(T2F, T2I, T2H * T2G);
}
}
/* Rp[6] / Rm[6] (W[22], W[23]) and Rp[2] / Rm[2] (W[6], W[7]). */
{
E T1G, T1M, T1K, T1O;
{
E T1E, T1F, T1I, T1J;
T1E = T7 - Te;
T1F = T1A - T1z;
T1G = T1E - T1F;
T1M = T1E + T1F;
T1I = T1w - T1x;
T1J = Tm - Tt;
T1K = T1I - T1J;
T1O = T1J + T1I;
}
{
E T1D, T1H, T1L, T1N;
T1D = W[22];
T1H = W[23];
Rp[WS(rs, 6)] = FNMS(T1H, T1K, T1D * T1G);
Rm[WS(rs, 6)] = FMA(T1D, T1K, T1H * T1G);
T1L = W[6];
T1N = W[7];
Rp[WS(rs, 2)] = FNMS(T1N, T1O, T1L * T1M);
Rm[WS(rs, 2)] = FMA(T1L, T1O, T1N * T1M);
}
}
}
}
}
/*
 * Twiddle-instruction table handed to the descriptor below.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT is defined outside this
 * file; presumably "full twiddle set for radix 16" followed by a
 * terminator entry -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA variant of hc2cb_16: the value 16, the
 * codelet name, the twiddle table above, the GENUS object and a count
 * array whose first three entries (136, 46, 38) equal the additions,
 * multiplications and fused multiply/adds reported in this variant's
 * generator header.
 */
static const hc2c_desc desc = { 16, "hc2cb_16", twinstr, &GENUS, { 136, 46, 38, 0 } };
/* Hands hc2cb_16 and its descriptor to X(khc2c_register) for planner p,
 * tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_16) (planner *p) {
X(khc2c_register) (p, hc2cb_16, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cb_2 -include rdft/scalar/hc2cb.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 11 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_2, FMA-oriented variant (genfft gen_hc2c output for -n 2 -dif
 * -sign 1).
 *
 * For m = mb, ..., me - 1 the four values Rp[0], Rm[0], Ip[0], Im[0] are
 * replaced in place:
 *      Rp[0] <- Rp[0] + Rm[0]
 *      Rm[0] <- Ip[0] - Im[0]
 *      Ip[0] <- W[0] * d - W[1] * s
 *      Im[0] <- W[0] * s + W[1] * d
 * with d = Rp[0] - Rm[0] and s = Ip[0] + Im[0] taken from the old values;
 * the last two lines are the real and imaginary parts of
 * (W[0] + i*W[1]) * (d + i*s).
 *
 * Rp and Ip step by +ms, Rm and Im by -ms, and W by 2 per iteration; W is
 * entered at offset (mb - 1) * 2.  rs is only handed to
 * MAKE_VOLATILE_STRIDE.
 */
static void hc2cb_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W += (mb - 1) * 2; m < me;
          ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2,
          MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Read all four inputs before anything is overwritten. */
          const E rp0 = Rp[0];
          const E rm0 = Rm[0];
          const E ip0 = Ip[0];
          const E im0 = Im[0];
          const E r_diff = rp0 - rm0;
          const E i_sum = ip0 + im0;

          Rp[0] = rp0 + rm0;
          Rm[0] = ip0 - im0;
          {
               /* The products stay in named temporaries so that FNMS/FMA
                  receive them as their addend, as in the generated code. */
               const E w0 = W[0];
               const E w0_diff = w0 * r_diff;
               const E w1 = W[1];
               const E w1_diff = w1 * r_diff;

               Ip[0] = FNMS(w1, i_sum, w0_diff);
               Im[0] = FMA(w0, i_sum, w1_diff);
          }
     }
}
/*
 * Twiddle-instruction table handed to the descriptor below.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT is defined outside this
 * file; presumably "full twiddle set for radix 2" followed by a
 * terminator entry -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA variant of hc2cb_2: the value 2, the codelet
 * name, the twiddle table above, the GENUS object and a count array whose
 * first three entries (4, 2, 2) equal the additions, multiplications and
 * fused multiply/adds reported in this variant's generator header.
 */
static const hc2c_desc desc = { 2, "hc2cb_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
/* Hands hc2cb_2 and its descriptor to X(khc2c_register) for planner p,
 * tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_2) (planner *p) {
X(khc2c_register) (p, hc2cb_2, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cb_2 -include rdft/scalar/hc2cb.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 9 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_2, non-FMA variant (genfft gen_hc2c output for -n 2 -dif
 * -sign 1).
 *
 * For m = mb, ..., me - 1 the four values Rp[0], Rm[0], Ip[0], Im[0] are
 * replaced in place:
 *      Rp[0] <- Rp[0] + Rm[0]
 *      Rm[0] <- Ip[0] - Im[0]
 *      Ip[0] <- W[0] * d - W[1] * s
 *      Im[0] <- W[0] * s + W[1] * d
 * with d = Rp[0] - Rm[0] and s = Ip[0] + Im[0] taken from the old values;
 * the last two lines are the real and imaginary parts of
 * (W[0] + i*W[1]) * (d + i*s).
 *
 * Rp and Ip step by +ms, Rm and Im by -ms, and W by 2 per iteration; W is
 * entered at offset (mb - 1) * 2.  rs is only handed to
 * MAKE_VOLATILE_STRIDE.
 */
static void hc2cb_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W += (mb - 1) * 2; m < me;
          ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2,
          MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Read all four inputs before anything is overwritten. */
          const E rp0 = Rp[0];
          const E rm0 = Rm[0];
          const E ip0 = Ip[0];
          const E im0 = Im[0];
          const E r_diff = rp0 - rm0;
          const E i_sum = ip0 + im0;

          Rp[0] = rp0 + rm0;
          Rm[0] = ip0 - im0;
          {
               /* Twiddle pair for this iteration, read after the two
                  untwiddled stores as in the generated code. */
               const E w0 = W[0];
               const E w1 = W[1];

               Ip[0] = FNMS(w1, i_sum, w0 * r_diff);
               Im[0] = FMA(w1, r_diff, w0 * i_sum);
          }
     }
}
/*
 * Twiddle-instruction table handed to the descriptor below.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT is defined outside this
 * file; presumably "full twiddle set for radix 2" followed by a
 * terminator entry -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA variant of hc2cb_2: the value 2, the codelet
 * name, the twiddle table above, the GENUS object and a count array whose
 * first three entries (4, 2, 2) equal the additions, multiplications and
 * fused multiply/adds reported in this variant's generator header.
 */
static const hc2c_desc desc = { 2, "hc2cb_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
/* Hands hc2cb_2 and its descriptor to X(khc2c_register) for planner p,
 * tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_2) (planner *p) {
X(khc2c_register) (p, hc2cb_2, &desc, HC2C_VIA_RDFT);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cb_4 -include rdft/scalar/hc2cb.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 22 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_4, variant compiled when the FMA-preferring preprocessor branch is
 * taken (generator flags: -fma -compact -sign 1 -n 4 -dif).
 * Machine-generated; the file is marked DO NOT EDIT.
 *
 * For every m in [mb, me) the loop reads two elements (offsets 0 and
 * WS(rs, 1)) from each of Rp, Ip, Rm and Im, forms their pairwise sums and
 * differences, and writes eight results back in place:
 *   - Rp[0], Rm[0]                   : stored without a twiddle factor,
 *   - Ip[0], Im[0]                   : combined with W[0], W[1],
 *   - Ip[WS(rs, 1)], Im[WS(rs, 1)]   : combined with W[4], W[5],
 *   - Rp[WS(rs, 1)], Rm[WS(rs, 1)]   : combined with W[2], W[3].
 * Per iteration Rp and Ip advance by ms, Rm and Im move back by ms, and W
 * advances by six entries.
 */
static void hc2cb_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the six twiddle values used by the first iteration. */
     W += (mb - 1) * 6;
     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
          /* Gather every input before the first store. */
          E ip0 = Ip[0];
          E im1 = Im[WS(rs, 1)];
          E rp0 = Rp[0];
          E rm1 = Rm[WS(rs, 1)];
          E ip1 = Ip[WS(rs, 1)];
          E im0 = Im[0];
          E rp1 = Rp[WS(rs, 1)];
          E rm0 = Rm[0];

          /* First layer: sums and differences of the paired inputs. */
          E isum_a = ip0 + im1;
          E idif_a = ip0 - im1;
          E rsum_a = rp0 + rm1;
          E rdif_a = rp0 - rm1;
          E isum_b = ip1 + im0;
          E idif_b = ip1 - im0;
          E rsum_b = rp1 + rm0;
          E rdif_b = rp1 - rm0;

          /* Second layer: the values that get multiplied by twiddles. */
          E b_re = rsum_a - rsum_b;
          E c_im = isum_a - rdif_b;
          E c_re = rdif_a + isum_b;
          E a_re = rdif_a - isum_b;
          E a_im = rdif_b + isum_a;

          Rp[0] = rsum_a + rsum_b;
          Rm[0] = idif_a + idif_b;
          {
               E w0 = W[0];
               E w0_re = w0 * a_re;
               E w0_im = w0 * a_im;
               E w1 = W[1];

               Ip[0] = FNMS(w1, a_im, w0_re);
               Im[0] = FMA(w1, a_re, w0_im);
          }
          {
               E w4 = W[4];
               E w4_re = w4 * c_re;
               E w4_im = w4 * c_im;
               E w5 = W[5];

               Ip[WS(rs, 1)] = FNMS(w5, c_im, w4_re);
               Im[WS(rs, 1)] = FMA(w5, c_re, w4_im);
          }
          {
               E b_im = idif_a - idif_b;
               E w3 = W[3];
               E w3_re = w3 * b_re;
               E w2 = W[2];
               E w2_re = w2 * b_re;

               Rp[WS(rs, 1)] = FNMS(w3, b_im, w2_re);
               Rm[WS(rs, 1)] = FMA(w2, b_im, w3_re);
          }
     }
}
/* Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 4) followed by one TW_NEXT entry with arguments
 * (1, 0).  NOTE(review): the 4 matches the "-n 4" generator flag; TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: the value 4, the name "hc2cb_4", the twiddle table,
 * &GENUS, and the count tuple { 16, 6, 6, 0 }.  The first three counts equal
 * the additions / multiplications / fused multiply-adds quoted in the header
 * comment of this variant. */
static const hc2c_desc desc = { 4, "hc2cb_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
/* Registration entry point: passes hc2cb_4 and its descriptor to
 * X(khc2c_register) for planner p, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_4) (planner *p) {
X(khc2c_register) (p, hc2cb_4, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cb_4 -include rdft/scalar/hc2cb.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_4, variant compiled when the FMA-preferring branch is NOT taken
 * (generator flags: -compact -sign 1 -n 4 -dif).  Machine-generated; the
 * file is marked DO NOT EDIT.
 *
 * For every m in [mb, me) the loop reads two elements (offsets 0 and
 * WS(rs, 1)) from each of Rp, Ip, Rm and Im, forms their pairwise sums and
 * differences, and writes eight results back in place:
 *   - Rp[0], Rm[0]                   : stored without a twiddle factor,
 *   - Rp[WS(rs, 1)], Rm[WS(rs, 1)]   : combined with W[2], W[3],
 *   - Ip[0], Im[0]                   : combined with W[0], W[1],
 *   - Ip[WS(rs, 1)], Im[WS(rs, 1)]   : combined with W[4], W[5].
 * Per iteration Rp and Ip advance by ms, Rm and Im move back by ms, and W
 * advances by six entries.
 */
static void hc2cb_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the six twiddle values used by the first iteration. */
     W += (mb - 1) * 6;
     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
          /* Gather every input before the first store. */
          E rp0 = Rp[0];
          E rm1 = Rm[WS(rs, 1)];
          E ip0 = Ip[0];
          E im1 = Im[WS(rs, 1)];
          E rp1 = Rp[WS(rs, 1)];
          E rm0 = Rm[0];
          E ip1 = Ip[WS(rs, 1)];
          E im0 = Im[0];

          /* Sums and differences of the paired inputs. */
          E rsum_a = rp0 + rm1;
          E rdif_a = rp0 - rm1;
          E idif_a = ip0 - im1;
          E isum_a = ip0 + im1;
          E rsum_b = rp1 + rm0;
          E rdif_b = rp1 - rm0;
          E idif_b = ip1 - im0;
          E isum_b = ip1 + im0;

          Rp[0] = rsum_a + rsum_b;
          Rm[0] = idif_a + idif_b;
          {
               E b_re = rsum_a - rsum_b;
               E b_im = idif_a - idif_b;
               E w2 = W[2];
               E w3 = W[3];

               Rp[WS(rs, 1)] = FNMS(w3, b_im, w2 * b_re);
               Rm[WS(rs, 1)] = FMA(w3, b_re, w2 * b_im);
          }
          {
               E a_re = rdif_a - isum_b;
               E a_im = rdif_b + isum_a;
               E w0 = W[0];
               E w1 = W[1];

               Ip[0] = FNMS(w1, a_im, w0 * a_re);
               Im[0] = FMA(w0, a_im, w1 * a_re);
          }
          {
               E c_re = rdif_a + isum_b;
               E c_im = isum_a - rdif_b;
               E w4 = W[4];
               E w5 = W[5];

               Ip[WS(rs, 1)] = FNMS(w5, c_im, w4 * c_re);
               Im[WS(rs, 1)] = FMA(w4, c_im, w5 * c_re);
          }
     }
}
/* Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 4) followed by one TW_NEXT entry with arguments
 * (1, 0).  NOTE(review): the 4 matches the "-n 4" generator flag; TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: the value 4, the name "hc2cb_4", the twiddle table,
 * &GENUS, and the count tuple { 16, 6, 6, 0 }.  The first three counts equal
 * the additions / multiplications / fused multiply-adds quoted in the header
 * comment of this variant. */
static const hc2c_desc desc = { 4, "hc2cb_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
/* Registration entry point: passes hc2cb_4 and its descriptor to
 * X(khc2c_register) for planner p, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_4) (planner *p) {
X(khc2c_register) (p, hc2cb_4, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,292 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cb_6 -include rdft/scalar/hc2cb.h */
/*
* This function contains 46 FP additions, 32 FP multiplications,
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
* 31 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_6, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generator flags: -fma -compact
 * -sign 1 -n 6 -dif).  Machine-generated; the file is marked DO NOT EDIT.
 *
 * For every m in [mb, me) the loop reads three elements (offsets 0,
 * WS(rs, 1), WS(rs, 2)) from each of Rp, Ip, Rm and Im, combines them, and
 * writes twelve results back in place.  Rp[0] and Rm[0] are stored without a
 * twiddle factor; every other output pair is combined with one pair of
 * entries from W[0..9].
 * Per iteration Rp and Ip advance by ms, Rm and Im move back by ms, and W
 * advances by ten entries.
 */
static void hc2cb_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.8660254... equals sqrt(3)/2 to the printed precision; the second constant is 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* The init clause moves W to the ten twiddle values of iteration mb. */
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
E Td, Tn, TO, TJ, TN, Tk, Tr, T3, TC, Ts, TQ, Ta, Tm, TF, TG;
/* Load the six Ip / Im inputs and form their sums and differences. */
{
E Tb, Tc, Tj, TI, Tg, TH;
Tb = Ip[0];
Tc = Im[WS(rs, 2)];
Td = Tb - Tc;
{
E Th, Ti, Te, Tf;
Th = Ip[WS(rs, 1)];
Ti = Im[WS(rs, 1)];
Tj = Th - Ti;
TI = Th + Ti;
Te = Ip[WS(rs, 2)];
Tf = Im[0];
Tg = Te - Tf;
TH = Te + Tf;
}
Tn = Tj - Tg;
TO = TH - TI;
TJ = TH + TI;
TN = Tb + Tc;
Tk = Tg + Tj;
Tr = FNMS(KP500000000, Tk, Td);
}
/* Load the six Rp / Rm inputs and form their sums and differences. */
{
E T9, TE, T6, TD, T1, T2;
T1 = Rp[0];
T2 = Rm[WS(rs, 2)];
T3 = T1 + T2;
TC = T1 - T2;
{
E T7, T8, T4, T5;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 1)];
T9 = T7 + T8;
TE = T7 - T8;
T4 = Rp[WS(rs, 2)];
T5 = Rm[0];
T6 = T4 + T5;
TD = T4 - T5;
}
Ts = T6 - T9;
TQ = TD - TE;
Ta = T6 + T9;
Tm = FNMS(KP500000000, Ta, T3);
TF = TD + TE;
TG = FNMS(KP500000000, TF, TC);
}
/* All inputs have been read; these two outputs take no twiddle factor. */
Rp[0] = T3 + Ta;
Rm[0] = Td + Tk;
/* Rp / Rm at offset WS(rs, 1), combined with W[2] and W[3]. */
{
E To, Tt, Tp, Tu, Tl, Tq;
To = FNMS(KP866025403, Tn, Tm);
Tt = FNMS(KP866025403, Ts, Tr);
Tl = W[2];
Tp = Tl * To;
Tu = Tl * Tt;
Tq = W[3];
Rp[WS(rs, 1)] = FNMS(Tq, Tt, Tp);
Rm[WS(rs, 1)] = FMA(Tq, To, Tu);
}
/* Ip / Im at offset WS(rs, 1), combined with W[4] and W[5]. */
{
E T13, TZ, T11, T12, T14, T10;
T13 = TN + TO;
T10 = TC + TF;
TZ = W[4];
T11 = TZ * T10;
T12 = W[5];
T14 = T12 * T10;
Ip[WS(rs, 1)] = FNMS(T12, T13, T11);
Im[WS(rs, 1)] = FMA(TZ, T13, T14);
}
/* Rp / Rm at offset WS(rs, 2), combined with W[6] and W[7]. */
{
E Tw, Tz, Tx, TA, Tv, Ty;
Tw = FMA(KP866025403, Tn, Tm);
Tz = FMA(KP866025403, Ts, Tr);
Tv = W[6];
Tx = Tv * Tw;
TA = Tv * Tz;
Ty = W[7];
Rp[WS(rs, 2)] = FNMS(Ty, Tz, Tx);
Rm[WS(rs, 2)] = FMA(Ty, Tw, TA);
}
/* Ip / Im at offsets 0 and WS(rs, 2): W[0], W[1] for the former,
 * W[8], W[9] for the latter. */
{
E TR, TX, TT, TV, TW, TY, TB, TL, TM, TS, TP, TU, TK;
TP = FNMS(KP500000000, TO, TN);
TR = FMA(KP866025403, TQ, TP);
TX = FNMS(KP866025403, TQ, TP);
TU = FMA(KP866025403, TJ, TG);
TT = W[8];
TV = TT * TU;
TW = W[9];
TY = TW * TU;
TK = FNMS(KP866025403, TJ, TG);
TB = W[0];
TL = TB * TK;
TM = W[1];
TS = TM * TK;
Ip[0] = FNMS(TM, TR, TL);
Im[0] = FMA(TB, TR, TS);
Ip[WS(rs, 2)] = FNMS(TW, TX, TV);
Im[WS(rs, 2)] = FMA(TT, TX, TY);
}
}
}
}
/* Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 6) followed by one TW_NEXT entry with arguments
 * (1, 0).  NOTE(review): the 6 matches the "-n 6" generator flag; TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 6 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: the value 6, the name "hc2cb_6", the twiddle table,
 * &GENUS, and the count tuple { 24, 10, 22, 0 }.  The first three counts
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * header comment of this variant. */
static const hc2c_desc desc = { 6, "hc2cb_6", twinstr, &GENUS, { 24, 10, 22, 0 } };
/* Registration entry point: passes hc2cb_6 and its descriptor to
 * X(khc2c_register) for planner p, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_6) (planner *p) {
X(khc2c_register) (p, hc2cb_6, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cb_6 -include rdft/scalar/hc2cb.h */
/*
* This function contains 46 FP additions, 28 FP multiplications,
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
* 25 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_6, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (generator flags: -compact -sign 1
 * -n 6 -dif).  Machine-generated; the file is marked DO NOT EDIT.
 *
 * For every m in [mb, me) the loop reads three elements (offsets 0,
 * WS(rs, 1), WS(rs, 2)) from each of Rp, Ip, Rm and Im, combines them, and
 * writes twelve results back in place.  Rp[0] and Rm[0] are stored without a
 * twiddle factor; every other output pair is combined with one pair of
 * entries from W[0..9].
 * Per iteration Rp and Ip advance by ms, Rm and Im move back by ms, and W
 * advances by ten entries.
 */
static void hc2cb_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* The first constant is 1/2; 0.8660254... equals sqrt(3)/2 to the printed precision. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* The init clause moves W to the ten twiddle values of iteration mb. */
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
E T3, Ty, Td, TE, Ta, TO, Tr, TB, Tk, TL, Tn, TH;
/* Inputs at Rp[0] / Rm[WS(rs, 2)] and Ip[0] / Im[WS(rs, 2)]. */
{
E T1, T2, Tb, Tc;
T1 = Rp[0];
T2 = Rm[WS(rs, 2)];
T3 = T1 + T2;
Ty = T1 - T2;
Tb = Ip[0];
Tc = Im[WS(rs, 2)];
Td = Tb - Tc;
TE = Tb + Tc;
}
/* Remaining Rp / Rm inputs; two of the combinations are pre-scaled by
 * KP866025403 here. */
{
E T6, Tz, T9, TA;
{
E T4, T5, T7, T8;
T4 = Rp[WS(rs, 2)];
T5 = Rm[0];
T6 = T4 + T5;
Tz = T4 - T5;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 1)];
T9 = T7 + T8;
TA = T7 - T8;
}
Ta = T6 + T9;
TO = KP866025403 * (Tz - TA);
Tr = KP866025403 * (T6 - T9);
TB = Tz + TA;
}
/* Remaining Ip / Im inputs, treated the same way. */
{
E Tg, TG, Tj, TF;
{
E Te, Tf, Th, Ti;
Te = Ip[WS(rs, 2)];
Tf = Im[0];
Tg = Te - Tf;
TG = Te + Tf;
Th = Ip[WS(rs, 1)];
Ti = Im[WS(rs, 1)];
Tj = Th - Ti;
TF = Th + Ti;
}
Tk = Tg + Tj;
TL = KP866025403 * (TG + TF);
Tn = KP866025403 * (Tj - Tg);
TH = TF - TG;
}
/* All inputs have been read; these two outputs take no twiddle factor. */
Rp[0] = T3 + Ta;
Rm[0] = Td + Tk;
/* Ip / Im at offset WS(rs, 1), combined with W[4] and W[5]. */
{
E TC, TI, Tx, TD;
TC = Ty + TB;
TI = TE - TH;
Tx = W[4];
TD = W[5];
Ip[WS(rs, 1)] = FNMS(TD, TI, Tx * TC);
Im[WS(rs, 1)] = FMA(TD, TC, Tx * TI);
}
/* Rp / Rm at offsets WS(rs, 1) and WS(rs, 2): W[2], W[3] for the former,
 * W[6], W[7] for the latter. */
{
E To, Tu, Ts, Tw, Tm, Tq;
Tm = FNMS(KP500000000, Ta, T3);
To = Tm - Tn;
Tu = Tm + Tn;
Tq = FNMS(KP500000000, Tk, Td);
Ts = Tq - Tr;
Tw = Tr + Tq;
{
E Tl, Tp, Tt, Tv;
Tl = W[2];
Tp = W[3];
Rp[WS(rs, 1)] = FNMS(Tp, Ts, Tl * To);
Rm[WS(rs, 1)] = FMA(Tl, Ts, Tp * To);
Tt = W[6];
Tv = W[7];
Rp[WS(rs, 2)] = FNMS(Tv, Tw, Tt * Tu);
Rm[WS(rs, 2)] = FMA(Tt, Tw, Tv * Tu);
}
}
/* Ip / Im at offsets 0 and WS(rs, 2): W[0], W[1] for the former,
 * W[8], W[9] for the latter. */
{
E TM, TS, TQ, TU, TK, TP;
TK = FNMS(KP500000000, TB, Ty);
TM = TK - TL;
TS = TK + TL;
TP = FMA(KP500000000, TH, TE);
TQ = TO + TP;
TU = TP - TO;
{
E TJ, TN, TR, TT;
TJ = W[0];
TN = W[1];
Ip[0] = FNMS(TN, TQ, TJ * TM);
Im[0] = FMA(TN, TM, TJ * TQ);
TR = W[8];
TT = W[9];
Ip[WS(rs, 2)] = FNMS(TT, TU, TR * TS);
Im[WS(rs, 2)] = FMA(TT, TS, TR * TU);
}
}
}
}
}
/* Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 6) followed by one TW_NEXT entry with arguments
 * (1, 0).  NOTE(review): the 6 matches the "-n 6" generator flag; TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 6 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: the value 6, the name "hc2cb_6", the twiddle table,
 * &GENUS, and the count tuple { 32, 14, 14, 0 }.  The first three counts
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * header comment of this variant. */
static const hc2c_desc desc = { 6, "hc2cb_6", twinstr, &GENUS, { 32, 14, 14, 0 } };
/* Registration entry point: passes hc2cb_6 and its descriptor to
 * X(khc2c_register) for planner p, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_6) (planner *p) {
X(khc2c_register) (p, hc2cb_6, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,373 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:07 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include rdft/scalar/hc2cb.h */
/*
* This function contains 66 FP additions, 36 FP multiplications,
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
* 33 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_8, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generator flags: -fma -compact
 * -sign 1 -n 8 -dif).  Machine-generated; the file is marked DO NOT EDIT.
 *
 * For every m in [mb, me) the loop reads four elements (offsets 0 and
 * WS(rs, 1..3)) from each of Rp, Ip, Rm and Im, combines them, and writes
 * sixteen results back in place.  Rp[0] and Rm[0] are stored without a
 * twiddle factor; every other output pair is combined with one pair of
 * entries from W[0..13].
 * Per iteration Rp and Ip advance by ms, Rm and Im move back by ms, and W
 * advances by fourteen entries.
 */
static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.7071067... equals sqrt(2)/2 to the printed precision. */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* The init clause moves W to the fourteen twiddle values of iteration mb. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
E T7, T1i, T1n, Tk, TD, TV, T1b, TQ, Te, T1e, T1o, T1j, TE, TF, TR;
E Tv, TW;
/* First half of the inputs: Rp/Ip at offsets 0 and 2, Rm/Im at offsets 3 and 1. */
{
E T3, Tg, TC, T19, T6, Tz, Tj, T1a;
{
E T1, T2, TA, TB;
T1 = Rp[0];
T2 = Rm[WS(rs, 3)];
T3 = T1 + T2;
Tg = T1 - T2;
TA = Ip[0];
TB = Im[WS(rs, 3)];
TC = TA + TB;
T19 = TA - TB;
}
{
E T4, T5, Th, Ti;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 1)];
T6 = T4 + T5;
Tz = T4 - T5;
Th = Ip[WS(rs, 2)];
Ti = Im[WS(rs, 1)];
Tj = Th + Ti;
T1a = Th - Ti;
}
T7 = T3 + T6;
T1i = T3 - T6;
T1n = T19 - T1a;
Tk = Tg - Tj;
TD = Tz + TC;
TV = TC - Tz;
T1b = T19 + T1a;
TQ = Tg + Tj;
}
/* Second half of the inputs: Rp/Ip at offsets 1 and 3, Rm/Im at offsets 2 and 0. */
{
E Ta, Tl, To, T1c, Td, Tq, Tt, T1d, Tp, Tu;
{
E T8, T9, Tm, Tn;
T8 = Rp[WS(rs, 1)];
T9 = Rm[WS(rs, 2)];
Ta = T8 + T9;
Tl = T8 - T9;
Tm = Ip[WS(rs, 1)];
Tn = Im[WS(rs, 2)];
To = Tm + Tn;
T1c = Tm - Tn;
}
{
E Tb, Tc, Tr, Ts;
Tb = Rm[0];
Tc = Rp[WS(rs, 3)];
Td = Tb + Tc;
Tq = Tb - Tc;
Tr = Ip[WS(rs, 3)];
Ts = Im[0];
Tt = Tr + Ts;
T1d = Tr - Ts;
}
Te = Ta + Td;
T1e = T1c + T1d;
T1o = Ta - Td;
T1j = T1d - T1c;
TE = Tl + To;
TF = Tq + Tt;
TR = TE + TF;
Tp = Tl - To;
Tu = Tq - Tt;
Tv = Tp + Tu;
TW = Tp - Tu;
}
/* All inputs have been read; these two outputs take no twiddle factor. */
Rp[0] = T7 + Te;
Rm[0] = T1b + T1e;
/* Ip / Im at offset WS(rs, 1), combined with W[4] and W[5]. */
{
E TS, TX, TT, TY, TP, TU;
TS = FNMS(KP707106781, TR, TQ);
TX = FMA(KP707106781, TW, TV);
TP = W[4];
TT = TP * TS;
TY = TP * TX;
TU = W[5];
Ip[WS(rs, 1)] = FNMS(TU, TX, TT);
Im[WS(rs, 1)] = FMA(TU, TS, TY);
}
/* Rp / Rm at offset WS(rs, 1), combined with W[2] and W[3]. */
{
E T1s, T1v, T1t, T1w, T1r, T1u;
T1s = T1i + T1j;
T1v = T1o + T1n;
T1r = W[2];
T1t = T1r * T1s;
T1w = T1r * T1v;
T1u = W[3];
Rp[WS(rs, 1)] = FNMS(T1u, T1v, T1t);
Rm[WS(rs, 1)] = FMA(T1u, T1s, T1w);
}
/* Ip / Im at offset WS(rs, 3), combined with W[12] and W[13]. */
{
E T10, T13, T11, T14, TZ, T12;
T10 = FMA(KP707106781, TR, TQ);
T13 = FNMS(KP707106781, TW, TV);
TZ = W[12];
T11 = TZ * T10;
T14 = TZ * T13;
T12 = W[13];
Ip[WS(rs, 3)] = FNMS(T12, T13, T11);
Im[WS(rs, 3)] = FMA(T12, T10, T14);
}
/* Rp / Rm at offset WS(rs, 2), combined with W[6] and W[7]. */
{
E T1f, T15, T17, T18, T1g, T16;
T1f = T1b - T1e;
T16 = T7 - Te;
T15 = W[6];
T17 = T15 * T16;
T18 = W[7];
T1g = T18 * T16;
Rp[WS(rs, 2)] = FNMS(T18, T1f, T17);
Rm[WS(rs, 2)] = FMA(T15, T1f, T1g);
}
/* Rp / Rm at offset WS(rs, 3), combined with W[10] and W[11]. */
{
E T1k, T1p, T1l, T1q, T1h, T1m;
T1k = T1i - T1j;
T1p = T1n - T1o;
T1h = W[10];
T1l = T1h * T1k;
T1q = T1h * T1p;
T1m = W[11];
Rp[WS(rs, 3)] = FNMS(T1m, T1p, T1l);
Rm[WS(rs, 3)] = FMA(T1m, T1k, T1q);
}
/* Ip / Im at offsets WS(rs, 2) and 0: W[8], W[9] for the former,
 * W[0], W[1] for the latter. */
{
E TH, TN, TJ, TL, TM, TO, Tf, Tx, Ty, TI, TG, TK, Tw;
TG = TE - TF;
TH = FNMS(KP707106781, TG, TD);
TN = FMA(KP707106781, TG, TD);
TK = FMA(KP707106781, Tv, Tk);
TJ = W[0];
TL = TJ * TK;
TM = W[1];
TO = TM * TK;
Tw = FNMS(KP707106781, Tv, Tk);
Tf = W[8];
Tx = Tf * Tw;
Ty = W[9];
TI = Ty * Tw;
Ip[WS(rs, 2)] = FNMS(Ty, TH, Tx);
Im[WS(rs, 2)] = FMA(Tf, TH, TI);
Ip[0] = FNMS(TM, TN, TL);
Im[0] = FMA(TJ, TN, TO);
}
}
}
}
/* Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 8) followed by one TW_NEXT entry with arguments
 * (1, 0).  NOTE(review): the 8 matches the "-n 8" generator flag; TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: the value 8, the name "hc2cb_8", the twiddle table,
 * &GENUS, and the count tuple { 44, 14, 22, 0 }.  The first three counts
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * header comment of this variant. */
static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, { 44, 14, 22, 0 } };
/* Registration entry point: passes hc2cb_8 and its descriptor to
 * X(khc2c_register) for planner p, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_8) (planner *p) {
X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include rdft/scalar/hc2cb.h */
/*
* This function contains 66 FP additions, 32 FP multiplications,
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
* 30 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cb_8, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (generator flags: -compact -sign 1
 * -n 8 -dif).  Machine-generated; the file is marked DO NOT EDIT.
 *
 * For every m in [mb, me) the loop reads four elements (offsets 0 and
 * WS(rs, 1..3)) from each of Rp, Ip, Rm and Im, combines them, and writes
 * sixteen results back in place.  Rp[0] and Rm[0] are stored without a
 * twiddle factor; every other output pair is combined with one pair of
 * entries from W[0..13].
 * Per iteration Rp and Ip advance by ms, Rm and Im move back by ms, and W
 * advances by fourteen entries.
 */
static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.7071067... equals sqrt(2)/2 to the printed precision. */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* The init clause moves W to the fourteen twiddle values of iteration mb. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
E TD;
/* First half of the inputs: Rp/Ip at offsets 0 and 2, Rm/Im at offsets 3 and 1. */
{
E T3, TK, Tk, TX, T6, TW, Tn, TL;
{
E T1, T2, Ti, Tj;
T1 = Rp[0];
T2 = Rm[WS(rs, 3)];
T3 = T1 + T2;
TK = T1 - T2;
Ti = Ip[0];
Tj = Im[WS(rs, 3)];
Tk = Ti - Tj;
TX = Ti + Tj;
}
{
E T4, T5, Tl, Tm;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 1)];
T6 = T4 + T5;
TW = T4 - T5;
Tl = Ip[WS(rs, 2)];
Tm = Im[WS(rs, 1)];
Tn = Tl - Tm;
TL = Tl + Tm;
}
T7 = T3 + T6;
T18 = TK + TL;
T1c = TX - TW;
To = Tk + Tn;
Ty = T3 - T6;
TM = TK - TL;
TY = TW + TX;
TC = Tk - Tn;
}
/* Second half of the inputs: Rp/Ip at offsets 1 and 3, Rm/Im at offsets 2 and 0. */
{
E Ta, TN, Tr, TO, Td, TQ, Tu, TR;
{
E T8, T9, Tp, Tq;
T8 = Rp[WS(rs, 1)];
T9 = Rm[WS(rs, 2)];
Ta = T8 + T9;
TN = T8 - T9;
Tp = Ip[WS(rs, 1)];
Tq = Im[WS(rs, 2)];
Tr = Tp - Tq;
TO = Tp + Tq;
}
{
E Tb, Tc, Ts, Tt;
Tb = Rm[0];
Tc = Rp[WS(rs, 3)];
Td = Tb + Tc;
TQ = Tb - Tc;
Ts = Ip[WS(rs, 3)];
Tt = Im[0];
Tu = Ts - Tt;
TR = Ts + Tt;
}
Te = Ta + Td;
TZ = TN + TO;
T10 = TQ + TR;
Tv = Tr + Tu;
Tz = Tu - Tr;
TP = TN - TO;
TS = TQ - TR;
TD = Ta - Td;
}
/* All inputs have been read; these two outputs take no twiddle factor. */
Rp[0] = T7 + Te;
Rm[0] = To + Tv;
/* Rp / Rm at offset WS(rs, 2), combined with W[6] and W[7]. */
{
E Tg, Tw, Tf, Th;
Tg = T7 - Te;
Tw = To - Tv;
Tf = W[6];
Th = W[7];
Rp[WS(rs, 2)] = FNMS(Th, Tw, Tf * Tg);
Rm[WS(rs, 2)] = FMA(Th, Tg, Tf * Tw);
}
/* Rp / Rm at offset WS(rs, 1), combined with W[2] and W[3]. */
{
E TG, TI, TF, TH;
TG = Ty + Tz;
TI = TD + TC;
TF = W[2];
TH = W[3];
Rp[WS(rs, 1)] = FNMS(TH, TI, TF * TG);
Rm[WS(rs, 1)] = FMA(TF, TI, TH * TG);
}
/* Rp / Rm at offset WS(rs, 3), combined with W[10] and W[11]. */
{
E TA, TE, Tx, TB;
TA = Ty - Tz;
TE = TC - TD;
Tx = W[10];
TB = W[11];
Rp[WS(rs, 3)] = FNMS(TB, TE, Tx * TA);
Rm[WS(rs, 3)] = FMA(Tx, TE, TB * TA);
}
/* Ip / Im at offsets WS(rs, 1) and WS(rs, 3): W[4], W[5] for the former,
 * W[12], W[13] for the latter. */
{
E T1a, T1g, T1e, T1i, T19, T1d;
T19 = KP707106781 * (TZ + T10);
T1a = T18 - T19;
T1g = T18 + T19;
T1d = KP707106781 * (TP - TS);
T1e = T1c + T1d;
T1i = T1c - T1d;
{
E T17, T1b, T1f, T1h;
T17 = W[4];
T1b = W[5];
Ip[WS(rs, 1)] = FNMS(T1b, T1e, T17 * T1a);
Im[WS(rs, 1)] = FMA(T17, T1e, T1b * T1a);
T1f = W[12];
T1h = W[13];
Ip[WS(rs, 3)] = FNMS(T1h, T1i, T1f * T1g);
Im[WS(rs, 3)] = FMA(T1f, T1i, T1h * T1g);
}
}
/* Ip / Im at offsets WS(rs, 2) and 0: W[8], W[9] for the former,
 * W[0], W[1] for the latter. */
{
E TU, T14, T12, T16, TT, T11;
TT = KP707106781 * (TP + TS);
TU = TM - TT;
T14 = TM + TT;
T11 = KP707106781 * (TZ - T10);
T12 = TY - T11;
T16 = TY + T11;
{
E TJ, TV, T13, T15;
TJ = W[8];
TV = W[9];
Ip[WS(rs, 2)] = FNMS(TV, T12, TJ * TU);
Im[WS(rs, 2)] = FMA(TV, TU, TJ * T12);
T13 = W[0];
T15 = W[1];
Ip[0] = FNMS(T15, T16, T13 * T14);
Im[0] = FMA(T15, T14, T13 * T16);
}
}
}
}
}
/* Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 8) followed by one TW_NEXT entry with arguments
 * (1, 0).  NOTE(review): the 8 matches the "-n 8" generator flag; TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: the value 8, the name "hc2cb_8", the twiddle table,
 * &GENUS, and the count tuple { 52, 18, 14, 0 }.  The first three counts
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * header comment of this variant. */
static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, { 52, 18, 14, 0 } };
/* Registration entry point: passes hc2cb_8 and its descriptor to
 * X(khc2c_register) for planner p, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cb_8) (planner *p) {
X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,892 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:14 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft2_16 -include rdft/scalar/hc2cb.h */
/*
* This function contains 206 FP additions, 100 FP multiplications,
* (or, 136 additions, 30 multiplications, 70 fused multiply/add),
* 66 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft2_16, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generator: gen_hc2cdft, flags -fma
 * -compact -sign 1 -n 16 -dif).  Machine-generated; the file is marked
 * DO NOT EDIT.
 *
 * For every m in [mb, me) the loop reads eight elements (offsets 0 and
 * WS(rs, 1..7)) from each of Rp, Ip, Rm and Im, combines them, and writes
 * thirty-two results back in place.  Each output offset k is produced as a
 * group of four stores of the form
 *     Rp[k] = A - B;  Ip[k] = C + D;  Rm[k] = A + B;  Im[k] = D - C;
 * where the terms come from products with entries of W[0..29].
 * Per iteration Rp and Ip advance by ms, Rm and Im move back by ms, and W
 * advances by thirty entries.
 */
static void hc2cbdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* To the printed precision: 0.9238795... = cos(pi/8), 0.4142135... = sqrt(2) - 1,
 * 0.7071067... = sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* The init clause moves W to the thirty twiddle values of iteration mb. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
E Tf, T20, T32, T3Q, T3f, T3V, TN, T2a, T1m, T2f, T2G, T3G, T2T, T3L, T1F;
E T26, T2J, T2M, T2N, T2U, T2V, T3H, Tu, T25, T3i, T3R, T1a, T2g, T1y, T21;
E T39, T3W, T1p, T2b;
/* First half of the inputs: Rp/Ip at even offsets (0, 2, 4, 6) paired with
 * Rm/Im at odd offsets (7, 5, 3, 1). */
{
E T3, T1e, TA, T1C, T6, Tx, T1h, T1D, Td, T1A, TL, T1k, Ta, T1z, TG;
E T1j;
{
E T1, T2, T1f, T1g;
T1 = Rp[0];
T2 = Rm[WS(rs, 7)];
T3 = T1 + T2;
T1e = T1 - T2;
{
E Ty, Tz, T4, T5;
Ty = Ip[0];
Tz = Im[WS(rs, 7)];
TA = Ty + Tz;
T1C = Ty - Tz;
T4 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 3)];
T6 = T4 + T5;
Tx = T4 - T5;
}
T1f = Ip[WS(rs, 4)];
T1g = Im[WS(rs, 3)];
T1h = T1f + T1g;
T1D = T1f - T1g;
{
E Tb, Tc, TH, TI, TJ, TK;
Tb = Rm[WS(rs, 1)];
Tc = Rp[WS(rs, 6)];
TH = Tb - Tc;
TI = Im[WS(rs, 1)];
TJ = Ip[WS(rs, 6)];
TK = TI + TJ;
Td = Tb + Tc;
T1A = TJ - TI;
TL = TH + TK;
T1k = TH - TK;
}
{
E T8, T9, TC, TD, TE, TF;
T8 = Rp[WS(rs, 2)];
T9 = Rm[WS(rs, 5)];
TC = T8 - T9;
TD = Ip[WS(rs, 2)];
TE = Im[WS(rs, 5)];
TF = TD + TE;
Ta = T8 + T9;
T1z = TD - TE;
TG = TC + TF;
T1j = TC - TF;
}
}
{
E T7, Te, T30, T31;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
T20 = T7 - Te;
T30 = TA - Tx;
T31 = T1j - T1k;
T32 = FMA(KP707106781, T31, T30);
T3Q = FNMS(KP707106781, T31, T30);
}
{
E T3d, T3e, TB, TM;
T3d = T1e + T1h;
T3e = TG + TL;
T3f = FNMS(KP707106781, T3e, T3d);
T3V = FMA(KP707106781, T3e, T3d);
TB = Tx + TA;
TM = TG - TL;
TN = FMA(KP707106781, TM, TB);
T2a = FNMS(KP707106781, TM, TB);
}
{
E T1i, T1l, T2E, T2F;
T1i = T1e - T1h;
T1l = T1j + T1k;
T1m = FMA(KP707106781, T1l, T1i);
T2f = FNMS(KP707106781, T1l, T1i);
T2E = T3 - T6;
T2F = T1A - T1z;
T2G = T2E + T2F;
T3G = T2E - T2F;
}
{
E T2R, T2S, T1B, T1E;
T2R = Ta - Td;
T2S = T1C - T1D;
T2T = T2R + T2S;
T3L = T2S - T2R;
T1B = T1z + T1A;
T1E = T1C + T1D;
T1F = T1B + T1E;
T26 = T1E - T1B;
}
}
/* Second half of the inputs: Rp/Ip at odd offsets (1, 3, 5, 7) paired with
 * Rm/Im at even offsets (6, 4, 2, 0). */
{
E Ti, T1s, Tl, T1t, TS, TX, T34, T33, T2I, T2H, Tp, T1v, Ts, T1w, T13;
E T18, T37, T36, T2L, T2K;
{
E TT, TR, TO, TW;
{
E Tg, Th, TP, TQ;
Tg = Rp[WS(rs, 1)];
Th = Rm[WS(rs, 6)];
Ti = Tg + Th;
TT = Tg - Th;
TP = Ip[WS(rs, 1)];
TQ = Im[WS(rs, 6)];
TR = TP + TQ;
T1s = TP - TQ;
}
{
E Tj, Tk, TU, TV;
Tj = Rp[WS(rs, 5)];
Tk = Rm[WS(rs, 2)];
Tl = Tj + Tk;
TO = Tj - Tk;
TU = Ip[WS(rs, 5)];
TV = Im[WS(rs, 2)];
TW = TU + TV;
T1t = TU - TV;
}
TS = TO + TR;
TX = TT - TW;
T34 = TR - TO;
T33 = TT + TW;
T2I = T1s - T1t;
T2H = Ti - Tl;
}
{
E T14, T12, TZ, T17;
{
E Tn, To, T10, T11;
Tn = Rm[0];
To = Rp[WS(rs, 7)];
Tp = Tn + To;
T14 = Tn - To;
T10 = Im[0];
T11 = Ip[WS(rs, 7)];
T12 = T10 + T11;
T1v = T11 - T10;
}
{
E Tq, Tr, T15, T16;
Tq = Rp[WS(rs, 3)];
Tr = Rm[WS(rs, 4)];
Ts = Tq + Tr;
TZ = Tq - Tr;
T15 = Ip[WS(rs, 3)];
T16 = Im[WS(rs, 4)];
T17 = T15 + T16;
T1w = T15 - T16;
}
T13 = TZ - T12;
T18 = T14 - T17;
T37 = TZ + T12;
T36 = T14 + T17;
T2L = T1v - T1w;
T2K = Tp - Ts;
}
T2J = T2H - T2I;
T2M = T2K + T2L;
T2N = T2J + T2M;
T2U = T2H + T2I;
T2V = T2L - T2K;
T3H = T2V - T2U;
{
E Tm, Tt, T3g, T3h;
Tm = Ti + Tl;
Tt = Tp + Ts;
Tu = Tm + Tt;
T25 = Tm - Tt;
T3g = FNMS(KP414213562, T33, T34);
T3h = FNMS(KP414213562, T36, T37);
T3i = T3g + T3h;
T3R = T3h - T3g;
}
{
E TY, T19, T1u, T1x;
TY = FMA(KP414213562, TX, TS);
T19 = FNMS(KP414213562, T18, T13);
T1a = TY + T19;
T2g = T19 - TY;
T1u = T1s + T1t;
T1x = T1v + T1w;
T1y = T1u + T1x;
T21 = T1x - T1u;
}
{
E T35, T38, T1n, T1o;
T35 = FMA(KP414213562, T34, T33);
T38 = FMA(KP414213562, T37, T36);
T39 = T35 - T38;
T3W = T35 + T38;
T1n = FNMS(KP414213562, TS, TX);
T1o = FMA(KP414213562, T13, T18);
T1p = T1n + T1o;
T2b = T1n - T1o;
}
}
/* All inputs have been read.  Output group at offset 0: only W[0] and W[1]
 * are used; Tv and T1G enter the stores without a twiddle multiplication. */
{
E Tv, T1G, T1b, T1q, T1c, T1H, Tw, T1r, T1I, T1d;
Tv = Tf + Tu;
T1G = T1y + T1F;
T1b = FMA(KP923879532, T1a, TN);
T1q = FMA(KP923879532, T1p, T1m);
Tw = W[0];
T1c = Tw * T1b;
T1H = Tw * T1q;
T1d = W[1];
T1r = FMA(T1d, T1q, T1c);
T1I = FNMS(T1d, T1b, T1H);
Rp[0] = Tv - T1r;
Ip[0] = T1G + T1I;
Rm[0] = Tv + T1r;
Im[0] = T1I - T1G;
}
/* Output group at offset WS(rs, 4), using W[14] .. W[17]. */
{
E T1N, T1J, T1L, T1M, T1V, T1Q, T1T, T1R, T1X, T1K, T1P;
T1N = T1F - T1y;
T1K = Tf - Tu;
T1J = W[14];
T1L = T1J * T1K;
T1M = W[15];
T1V = T1M * T1K;
T1Q = FNMS(KP923879532, T1a, TN);
T1T = FNMS(KP923879532, T1p, T1m);
T1P = W[16];
T1R = T1P * T1Q;
T1X = T1P * T1T;
{
E T1O, T1W, T1U, T1Y, T1S;
T1O = FNMS(T1M, T1N, T1L);
T1W = FMA(T1J, T1N, T1V);
T1S = W[17];
T1U = FMA(T1S, T1T, T1R);
T1Y = FNMS(T1S, T1Q, T1X);
Rp[WS(rs, 4)] = T1O - T1U;
Ip[WS(rs, 4)] = T1W + T1Y;
Rm[WS(rs, 4)] = T1O + T1U;
Im[WS(rs, 4)] = T1Y - T1W;
}
}
/* Output group at offset WS(rs, 6), using W[22] .. W[25]. */
{
E T2r, T2n, T2p, T2q, T2z, T2u, T2x, T2v, T2B, T2o, T2t;
T2r = T26 - T25;
T2o = T20 - T21;
T2n = W[22];
T2p = T2n * T2o;
T2q = W[23];
T2z = T2q * T2o;
T2u = FNMS(KP923879532, T2b, T2a);
T2x = FNMS(KP923879532, T2g, T2f);
T2t = W[24];
T2v = T2t * T2u;
T2B = T2t * T2x;
{
E T2s, T2A, T2y, T2C, T2w;
T2s = FNMS(T2q, T2r, T2p);
T2A = FMA(T2n, T2r, T2z);
T2w = W[25];
T2y = FMA(T2w, T2x, T2v);
T2C = FNMS(T2w, T2u, T2B);
Rp[WS(rs, 6)] = T2s - T2y;
Ip[WS(rs, 6)] = T2A + T2C;
Rm[WS(rs, 6)] = T2s + T2y;
Im[WS(rs, 6)] = T2C - T2A;
}
}
/* Output group at offset WS(rs, 2), using W[6] .. W[9]. */
{
E T27, T1Z, T23, T24, T2j, T2c, T2h, T2d, T2l, T22, T29;
T27 = T25 + T26;
T22 = T20 + T21;
T1Z = W[6];
T23 = T1Z * T22;
T24 = W[7];
T2j = T24 * T22;
T2c = FMA(KP923879532, T2b, T2a);
T2h = FMA(KP923879532, T2g, T2f);
T29 = W[8];
T2d = T29 * T2c;
T2l = T29 * T2h;
{
E T28, T2k, T2i, T2m, T2e;
T28 = FNMS(T24, T27, T23);
T2k = FMA(T1Z, T27, T2j);
T2e = W[9];
T2i = FMA(T2e, T2h, T2d);
T2m = FNMS(T2e, T2c, T2l);
Rp[WS(rs, 2)] = T28 - T2i;
Ip[WS(rs, 2)] = T2k + T2m;
Rm[WS(rs, 2)] = T28 + T2i;
Im[WS(rs, 2)] = T2m - T2k;
}
}
/* Output groups at offsets WS(rs, 3) (W[10] .. W[13]) and WS(rs, 7)
 * (W[26] .. W[29]); the shared intermediates are computed first. */
{
E T3N, T47, T43, T45, T46, T4f, T3F, T3J, T3K, T3Z, T3S, T3X, T3T, T41, T4a;
E T4d, T4b, T4h;
{
E T3M, T44, T3I, T3P, T49;
T3M = T2J - T2M;
T3N = FMA(KP707106781, T3M, T3L);
T47 = FNMS(KP707106781, T3M, T3L);
T44 = FNMS(KP707106781, T3H, T3G);
T43 = W[26];
T45 = T43 * T44;
T46 = W[27];
T4f = T46 * T44;
T3I = FMA(KP707106781, T3H, T3G);
T3F = W[10];
T3J = T3F * T3I;
T3K = W[11];
T3Z = T3K * T3I;
T3S = FMA(KP923879532, T3R, T3Q);
T3X = FNMS(KP923879532, T3W, T3V);
T3P = W[12];
T3T = T3P * T3S;
T41 = T3P * T3X;
T4a = FNMS(KP923879532, T3R, T3Q);
T4d = FMA(KP923879532, T3W, T3V);
T49 = W[28];
T4b = T49 * T4a;
T4h = T49 * T4d;
}
{
E T3O, T40, T3Y, T42, T3U;
T3O = FNMS(T3K, T3N, T3J);
T40 = FMA(T3F, T3N, T3Z);
T3U = W[13];
T3Y = FMA(T3U, T3X, T3T);
T42 = FNMS(T3U, T3S, T41);
Rp[WS(rs, 3)] = T3O - T3Y;
Ip[WS(rs, 3)] = T40 + T42;
Rm[WS(rs, 3)] = T3O + T3Y;
Im[WS(rs, 3)] = T42 - T40;
}
{
E T48, T4g, T4e, T4i, T4c;
T48 = FNMS(T46, T47, T45);
T4g = FMA(T43, T47, T4f);
T4c = W[29];
T4e = FMA(T4c, T4d, T4b);
T4i = FNMS(T4c, T4a, T4h);
Rp[WS(rs, 7)] = T48 - T4e;
Ip[WS(rs, 7)] = T4g + T4i;
Rm[WS(rs, 7)] = T48 + T4e;
Im[WS(rs, 7)] = T4i - T4g;
}
}
/* Output groups at offsets WS(rs, 1) (W[2] .. W[5]) and WS(rs, 5)
 * (W[18] .. W[21]); the shared intermediates are computed first. */
{
E T2X, T3t, T3p, T3r, T3s, T3B, T2D, T2P, T2Q, T3l, T3a, T3j, T3b, T3n, T3w;
E T3z, T3x, T3D;
{
E T2W, T3q, T2O, T2Z, T3v;
T2W = T2U + T2V;
T2X = FMA(KP707106781, T2W, T2T);
T3t = FNMS(KP707106781, T2W, T2T);
T3q = FNMS(KP707106781, T2N, T2G);
T3p = W[18];
T3r = T3p * T3q;
T3s = W[19];
T3B = T3s * T3q;
T2O = FMA(KP707106781, T2N, T2G);
T2D = W[2];
T2P = T2D * T2O;
T2Q = W[3];
T3l = T2Q * T2O;
T3a = FMA(KP923879532, T39, T32);
T3j = FNMS(KP923879532, T3i, T3f);
T2Z = W[4];
T3b = T2Z * T3a;
T3n = T2Z * T3j;
T3w = FNMS(KP923879532, T39, T32);
T3z = FMA(KP923879532, T3i, T3f);
T3v = W[20];
T3x = T3v * T3w;
T3D = T3v * T3z;
}
{
E T2Y, T3m, T3k, T3o, T3c;
T2Y = FNMS(T2Q, T2X, T2P);
T3m = FMA(T2D, T2X, T3l);
T3c = W[5];
T3k = FMA(T3c, T3j, T3b);
T3o = FNMS(T3c, T3a, T3n);
Rp[WS(rs, 1)] = T2Y - T3k;
Ip[WS(rs, 1)] = T3m + T3o;
Rm[WS(rs, 1)] = T2Y + T3k;
Im[WS(rs, 1)] = T3o - T3m;
}
{
E T3u, T3C, T3A, T3E, T3y;
T3u = FNMS(T3s, T3t, T3r);
T3C = FMA(T3p, T3t, T3B);
T3y = W[21];
T3A = FMA(T3y, T3z, T3x);
T3E = FNMS(T3y, T3w, T3D);
Rp[WS(rs, 5)] = T3u - T3A;
Ip[WS(rs, 5)] = T3C + T3E;
Rm[WS(rs, 5)] = T3u + T3A;
Im[WS(rs, 5)] = T3E - T3C;
}
}
}
}
}
/* Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 16) followed by one TW_NEXT entry with arguments
 * (1, 0).  NOTE(review): the 16 matches the "-n 16" generator flag; TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: the value 16, the name "hc2cbdft2_16", the twiddle
 * table, &GENUS, and the count tuple { 136, 30, 70, 0 }.  The first three
 * counts equal the additions / multiplications / fused multiply-adds quoted
 * in the header comment of this variant. */
static const hc2c_desc desc = { 16, "hc2cbdft2_16", twinstr, &GENUS, { 136, 30, 70, 0 } };
/* Registration entry point: passes hc2cbdft2_16 and its descriptor to
 * X(khc2c_register) for planner p, tagged HC2C_VIA_DFT (the hc2cb_N codelets
 * in this directory use HC2C_VIA_RDFT instead). */
void X(codelet_hc2cbdft2_16) (planner *p) {
X(khc2c_register) (p, hc2cbdft2_16, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft2_16 -include rdft/scalar/hc2cb.h */
/*
* This function contains 206 FP additions, 84 FP multiplications,
* (or, 168 additions, 46 multiplications, 38 fused multiply/add),
* 60 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft2_16 -- non-FMA variant of the size-16 codelet emitted by
 * gen_hc2cdft (see the "Generated by" line above: -sign 1 -n 16 -dif).
 *
 * For every m in [mb, me) the kernel reads eight elements from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 7)), combines them with the
 * 30 twiddle values W[0..29] and stores the results back to the same
 * locations (the transform is in place; all loads of an iteration happen
 * before its first store).
 *
 *   Rp, Ip  read/written; advanced by +ms after each iteration
 *   Rm, Im  read/written; advanced by -ms after each iteration
 *   W       twiddle table; 30 entries per iteration, the entries for
 *           iteration m start at W + (m - 1) * 30
 *   rs      stride argument handed to WS() for element indexing
 *   mb, me  half-open iteration range
 *   ms      pointer step between consecutive iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file -- presumably a * b + c and c - a * b; confirm before relying
 * on the per-line remarks below.
 */
static void hc2cbdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: ~cos(pi/8), ~sin(pi/8) and ~sqrt(1/2). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Intermediates that survive from the load stages into the output stages. */
E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
E T1x, T1V;
/*
 * Load stage 1: elements Rp/Ip at offsets 0, 4, 2, 6 paired with Rm/Im at
 * offsets 7, 3, 5, 1; each pair is turned into a sum and a difference.
 */
{
E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
E T1z;
{
E T1, T2, Ty, Tz;
T1 = Rp[0];
T2 = Rm[WS(rs, 7)];
T3 = T1 + T2;
T1j = T1 - T2;
Ty = Ip[0];
Tz = Im[WS(rs, 7)];
TA = Ty + Tz;
T1B = Ty - Tz;
}
{
E T4, T5, T1k, T1l;
T4 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 3)];
T6 = T4 + T5;
Tx = T4 - T5;
T1k = Ip[WS(rs, 4)];
T1l = Im[WS(rs, 3)];
T1m = T1k + T1l;
T1C = T1k - T1l;
}
{
E T8, T9, TD, TE;
T8 = Rp[WS(rs, 2)];
T9 = Rm[WS(rs, 5)];
Ta = T8 + T9;
TC = T8 - T9;
TD = Ip[WS(rs, 2)];
TE = Im[WS(rs, 5)];
TF = TD + TE;
T1y = TD - TE;
}
{
E Tb, Tc, TI, TJ;
Tb = Rm[WS(rs, 1)];
Tc = Rp[WS(rs, 6)];
Td = Tb + Tc;
TH = Tb - Tc;
TI = Im[WS(rs, 1)];
TJ = Ip[WS(rs, 6)];
TK = TI + TJ;
T1z = TJ - TI;
}
/* Combine the stage-1 sums/differences; the 0.7071 factor scales the mixed terms. */
{
E T7, Te, TG, TL;
TB = Tx + TA;
T2L = TA - Tx;
T30 = T1j + T1m;
T1n = T1j - T1m;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
T1U = T7 - Te;
{
E T2F, T2G, T1A, T1D;
T2F = Ta - Td;
T2G = T1B - T1C;
T2H = T2F + T2G;
T3p = T2G - T2F;
T1A = T1y + T1z;
T1D = T1B + T1C;
T1E = T1A + T1D;
T1Z = T1D - T1A;
}
TG = TC + TF;
TL = TH + TK;
TM = KP707106781 * (TG - TL);
T31 = KP707106781 * (TG + TL);
{
E T2q, T2r, T1g, T1h;
T2q = T3 - T6;
T2r = T1z - T1y;
T2s = T2q + T2r;
T3k = T2q - T2r;
T1g = TC - TF;
T1h = TH - TK;
T1i = KP707106781 * (T1g + T1h);
T2M = KP707106781 * (T1g - T1h);
}
}
}
/*
 * Load stage 2: elements Rp/Ip at offsets 1, 5, 7, 3 paired with Rm/Im at
 * offsets 6, 2, 0, 4, again reduced to sums and differences.
 */
{
E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
E T1v;
{
E Tg, Th, TP, TQ;
Tg = Rp[WS(rs, 1)];
Th = Rm[WS(rs, 6)];
Ti = Tg + Th;
TT = Tg - Th;
TP = Ip[WS(rs, 1)];
TQ = Im[WS(rs, 6)];
TR = TP + TQ;
T1r = TP - TQ;
}
{
E Tj, Tk, TU, TV;
Tj = Rp[WS(rs, 5)];
Tk = Rm[WS(rs, 2)];
Tl = Tj + Tk;
TO = Tj - Tk;
TU = Ip[WS(rs, 5)];
TV = Im[WS(rs, 2)];
TW = TU + TV;
T1s = TU - TV;
}
{
E Tn, To, T10, T11;
Tn = Rm[0];
To = Rp[WS(rs, 7)];
Tp = Tn + To;
T14 = Tn - To;
T10 = Im[0];
T11 = Ip[WS(rs, 7)];
T12 = T10 + T11;
T1u = T11 - T10;
}
{
E Tq, Tr, T15, T16;
Tq = Rp[WS(rs, 3)];
Tr = Rm[WS(rs, 4)];
Ts = Tq + Tr;
TZ = Tq - Tr;
T15 = Ip[WS(rs, 3)];
T16 = Im[WS(rs, 4)];
T17 = T15 + T16;
T1v = T15 - T16;
}
/* Combine the stage-2 terms; the 0.9239 / 0.3827 pair weights the mixed terms. */
{
E Tm, Tt, T2O, T2P;
Tm = Ti + Tl;
Tt = Tp + Ts;
Tu = Tm + Tt;
T1Y = Tm - Tt;
T2O = TR - TO;
T2P = TT + TW;
T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
}
{
E T2R, T2S, TS, TX;
T2R = TZ + T12;
T2S = T14 + T17;
T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
TS = TO + TR;
TX = TT - TW;
TY = FMA(KP923879532, TS, KP382683432 * TX);
T1d = FNMS(KP382683432, TS, KP923879532 * TX);
}
{
E T13, T18, T2t, T2u;
T13 = TZ - T12;
T18 = T14 - T17;
T19 = FNMS(KP382683432, T18, KP923879532 * T13);
T1e = FMA(KP382683432, T13, KP923879532 * T18);
T2t = Ti - Tl;
T2u = T1r - T1s;
T2v = T2t - T2u;
T2C = T2t + T2u;
}
{
E T2w, T2x, T1t, T1w;
T2w = Tp - Ts;
T2x = T1u - T1v;
T2y = T2w + T2x;
T2D = T2x - T2w;
T1t = T1r + T1s;
T1w = T1u + T1v;
T1x = T1t + T1w;
T1V = T1w - T1t;
}
}
/* Outputs at offsets 0 and 4: twiddles W[0..1] and W[14..17]. */
{
E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
Tv = Tf + Tu;
T1F = T1x + T1E;
{
E TN, T1a, T1f, T1o;
TN = TB + TM;
T1a = TY + T19;
T1b = TN + T1a;
T1N = TN - T1a;
T1f = T1d + T1e;
T1o = T1i + T1n;
T1p = T1f + T1o;
T1P = T1o - T1f;
{
E T1I, T1K, T1H, T1J;
T1I = Tf - Tu;
T1K = T1E - T1x;
T1H = W[14];
T1J = W[15];
T1L = FNMS(T1J, T1K, T1H * T1I);
T1R = FMA(T1J, T1I, T1H * T1K);
}
}
{
E T1q, T1G, Tw, T1c;
Tw = W[0];
T1c = W[1];
T1q = FMA(Tw, T1b, T1c * T1p);
T1G = FNMS(T1c, T1b, Tw * T1p);
/* Offset 0: Tv and T1F enter without a twiddle factor. */
Rp[0] = Tv - T1q;
Ip[0] = T1F + T1G;
Rm[0] = Tv + T1q;
Im[0] = T1G - T1F;
}
{
E T1Q, T1S, T1M, T1O;
T1M = W[16];
T1O = W[17];
T1Q = FMA(T1M, T1N, T1O * T1P);
T1S = FNMS(T1O, T1N, T1M * T1P);
Rp[WS(rs, 4)] = T1L - T1Q;
Ip[WS(rs, 4)] = T1R + T1S;
Rm[WS(rs, 4)] = T1L + T1Q;
Im[WS(rs, 4)] = T1S - T1R;
}
}
/* Outputs at offsets 2 and 6: twiddles W[6..9] and W[22..25]. */
{
E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
{
E T23, T24, T27, T28;
T23 = TB - TM;
T24 = T1d - T1e;
T25 = T23 + T24;
T2j = T23 - T24;
T27 = T19 - TY;
T28 = T1n - T1i;
T29 = T27 + T28;
T2l = T28 - T27;
}
{
E T1W, T20, T1T, T1X;
T1W = T1U + T1V;
T20 = T1Y + T1Z;
T1T = W[6];
T1X = W[7];
T21 = FNMS(T1X, T20, T1T * T1W);
T2b = FMA(T1X, T1W, T1T * T20);
}
{
E T2e, T2g, T2d, T2f;
T2e = T1U - T1V;
T2g = T1Z - T1Y;
T2d = W[22];
T2f = W[23];
T2h = FNMS(T2f, T2g, T2d * T2e);
T2n = FMA(T2f, T2e, T2d * T2g);
}
{
E T2a, T2c, T22, T26;
T22 = W[8];
T26 = W[9];
T2a = FMA(T22, T25, T26 * T29);
T2c = FNMS(T26, T25, T22 * T29);
Rp[WS(rs, 2)] = T21 - T2a;
Ip[WS(rs, 2)] = T2b + T2c;
Rm[WS(rs, 2)] = T21 + T2a;
Im[WS(rs, 2)] = T2c - T2b;
}
{
E T2m, T2o, T2i, T2k;
T2i = W[24];
T2k = W[25];
T2m = FMA(T2i, T2j, T2k * T2l);
T2o = FNMS(T2k, T2j, T2i * T2l);
Rp[WS(rs, 6)] = T2h - T2m;
Ip[WS(rs, 6)] = T2n + T2o;
Rm[WS(rs, 6)] = T2h + T2m;
Im[WS(rs, 6)] = T2o - T2n;
}
}
/* Outputs at offsets 1 and 5: twiddles W[2..5] and W[18..21]. */
{
E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
T2z = KP707106781 * (T2v + T2y);
T2A = T2s + T2z;
T38 = T2s - T2z;
T2E = KP707106781 * (T2C + T2D);
T2I = T2E + T2H;
T3a = T2H - T2E;
{
E T2N, T2U, T2Z, T32;
T2N = T2L + T2M;
T2U = T2Q - T2T;
T2V = T2N + T2U;
T3d = T2N - T2U;
T2Z = T2X + T2Y;
T32 = T30 - T31;
T33 = T2Z + T32;
T3f = T32 - T2Z;
}
{
E T2J, T35, T34, T36;
{
E T2p, T2B, T2K, T2W;
T2p = W[2];
T2B = W[3];
T2J = FNMS(T2B, T2I, T2p * T2A);
T35 = FMA(T2B, T2A, T2p * T2I);
T2K = W[4];
T2W = W[5];
T34 = FMA(T2K, T2V, T2W * T33);
T36 = FNMS(T2W, T2V, T2K * T33);
}
Rp[WS(rs, 1)] = T2J - T34;
Ip[WS(rs, 1)] = T35 + T36;
Rm[WS(rs, 1)] = T2J + T34;
Im[WS(rs, 1)] = T36 - T35;
}
{
E T3b, T3h, T3g, T3i;
{
E T37, T39, T3c, T3e;
T37 = W[18];
T39 = W[19];
T3b = FNMS(T39, T3a, T37 * T38);
T3h = FMA(T39, T38, T37 * T3a);
T3c = W[20];
T3e = W[21];
T3g = FMA(T3c, T3d, T3e * T3f);
T3i = FNMS(T3e, T3d, T3c * T3f);
}
Rp[WS(rs, 5)] = T3b - T3g;
Ip[WS(rs, 5)] = T3h + T3i;
Rm[WS(rs, 5)] = T3b + T3g;
Im[WS(rs, 5)] = T3i - T3h;
}
}
/* Outputs at offsets 3 and 7: twiddles W[10..13] and W[26..29]. */
{
E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
T3l = KP707106781 * (T2D - T2C);
T3m = T3k + T3l;
T3E = T3k - T3l;
T3o = KP707106781 * (T2v - T2y);
T3q = T3o + T3p;
T3G = T3p - T3o;
{
E T3t, T3u, T3x, T3y;
T3t = T2L - T2M;
T3u = T2X - T2Y;
T3v = T3t + T3u;
T3J = T3t - T3u;
T3x = T31 + T30;
T3y = T2Q + T2T;
T3z = T3x - T3y;
T3L = T3y + T3x;
}
{
E T3r, T3B, T3A, T3C;
{
E T3j, T3n, T3s, T3w;
T3j = W[10];
T3n = W[11];
T3r = FNMS(T3n, T3q, T3j * T3m);
T3B = FMA(T3n, T3m, T3j * T3q);
T3s = W[12];
T3w = W[13];
T3A = FMA(T3s, T3v, T3w * T3z);
T3C = FNMS(T3w, T3v, T3s * T3z);
}
Rp[WS(rs, 3)] = T3r - T3A;
Ip[WS(rs, 3)] = T3B + T3C;
Rm[WS(rs, 3)] = T3r + T3A;
Im[WS(rs, 3)] = T3C - T3B;
}
{
E T3H, T3N, T3M, T3O;
{
E T3D, T3F, T3I, T3K;
T3D = W[26];
T3F = W[27];
T3H = FNMS(T3F, T3G, T3D * T3E);
T3N = FMA(T3F, T3E, T3D * T3G);
T3I = W[28];
T3K = W[29];
T3M = FMA(T3I, T3J, T3K * T3L);
T3O = FNMS(T3K, T3J, T3I * T3L);
}
Rp[WS(rs, 7)] = T3H - T3M;
Ip[WS(rs, 7)] = T3N + T3O;
Rm[WS(rs, 7)] = T3H + T3M;
Im[WS(rs, 7)] = T3O - T3N;
}
}
}
}
}
/*
 * Twiddle-instruction table for the size-16 codelet; it is referenced from
 * desc below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file --
 * presumably "full twiddle set for n = 16" followed by an end/advance
 * marker; confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 16, name string, the twiddle table above, a
 * pointer to GENUS and a four-entry count block.  The first three counts
 * (168, 46, 38) equal the additions / multiplications / fused multiply-add
 * figures quoted in the generator header of this variant.
 */
static const hc2c_desc desc = { 16, "hc2cbdft2_16", twinstr, &GENUS, { 168, 46, 38, 0 } };
/*
 * Planner hook: passes the hc2cbdft2_16 kernel and its descriptor to
 * X(khc2c_register) with the HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft2_16) (planner *p) {
X(khc2c_register) (p, hc2cbdft2_16, &desc, HC2C_VIA_DFT);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,218 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:14 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft2_4 -include rdft/scalar/hc2cb.h */
/*
* This function contains 30 FP additions, 12 FP multiplications,
* (or, 24 additions, 6 multiplications, 6 fused multiply/add),
* 23 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft2_4 -- FMA-oriented variant of the size-4 codelet produced by
 * gen_hc2cdft (-fma -sign 1 -n 4 -dif).
 *
 * Each pass over m in [mb, me) fetches two elements from each of Rp, Ip, Rm
 * and Im (offsets 0 and WS(rs, 1)), mixes them with the six twiddle values
 * W[0..5] and writes the results back in place.  Rp/Ip step forward by ms,
 * Rm/Im step backward by ms, and W advances by 6 per pass (the pass for m
 * uses the entries at W + (m - 1) * 6).
 *
 * Every product that the generator kept in its own temporary is still kept
 * in its own variable here, so rounding points are unchanged.
 */
static void hc2cbdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 6);
          m < me;
          m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
          /* Fetch all eight inputs before anything is overwritten. */
          E rp0 = Rp[0];
          E rm1 = Rm[WS(rs, 1)];
          E ip0 = Ip[0];
          E im1 = Im[WS(rs, 1)];
          E rp1 = Rp[WS(rs, 1)];
          E rm0 = Rm[0];
          E ip1 = Ip[WS(rs, 1)];
          E im0 = Im[0];

          /* Pairwise sums and differences (offset-0 pair "a", offset-1 pair "b"). */
          E r_sum_a = rp0 + rm1;
          E r_dif_a = rp0 - rm1;
          E i_sum_a = ip0 + im1;
          E i_dif_a = ip0 - im1;
          E r_sum_b = rp1 + rm0;
          E r_dif_b = rp1 - rm0;
          E i_sum_b = ip1 + im0;
          E i_dif_b = ip1 - im0;

          /* Operands of the three twiddle rotations (x_k, y_k). */
          E x0 = r_dif_b + i_sum_a;
          E y0 = r_dif_a - i_sum_b;
          E y2 = r_dif_a + i_sum_b;
          E x2 = i_sum_a - r_dif_b;
          E y1 = i_dif_a - i_dif_b;
          E x1 = r_sum_a - r_sum_b;

          /* Terms that reach the offset-0 outputs without a twiddle. */
          E plain_r = r_sum_a + r_sum_b;
          E plain_i = i_dif_a + i_dif_b;

          {
               /* Offset 0: rotate (x0, y0) by (W[0], W[1]). */
               E w0 = W[0];
               E w0_x0 = w0 * x0;
               E w0_y0 = w0 * y0;
               E w1 = W[1];
               E rot_a = FMA(w1, y0, w0_x0);
               E rot_b = FNMS(w1, x0, w0_y0);
               Rp[0] = plain_r - rot_a;
               Ip[0] = plain_i + rot_b;
               Rm[0] = plain_r + rot_a;
               Im[0] = rot_b - plain_i;
          }
          {
               /* Offset 1: (x1, y1) with W[2..3], (x2, y2) with W[4..5]. */
               E w2 = W[2];
               E w2_x1 = w2 * x1;
               E w3 = W[3];
               E w3_x1 = w3 * x1;
               E w4 = W[4];
               E w4_x2 = w4 * x2;
               E w4_y2 = w4 * y2;
               E lo_a = FNMS(w3, y1, w2_x1);
               E lo_b = FMA(w2, y1, w3_x1);
               E w5 = W[5];
               E hi_a = FMA(w5, y2, w4_x2);
               E hi_b = FNMS(w5, x2, w4_y2);
               Rp[WS(rs, 1)] = lo_a - hi_a;
               Ip[WS(rs, 1)] = lo_b + hi_b;
               Rm[WS(rs, 1)] = lo_a + hi_a;
               Im[WS(rs, 1)] = hi_b - lo_b;
          }
     }
}
/*
 * Twiddle-instruction table for the size-4 codelet; it is referenced from
 * desc below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file --
 * presumably "full twiddle set for n = 4" followed by an end/advance
 * marker; confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 4, name string, the twiddle table above, a
 * pointer to GENUS and a four-entry count block.  The first three counts
 * (24, 6, 6) equal the additions / multiplications / fused multiply-add
 * figures quoted in the generator header of this variant.
 */
static const hc2c_desc desc = { 4, "hc2cbdft2_4", twinstr, &GENUS, { 24, 6, 6, 0 } };
/*
 * Planner hook: passes the hc2cbdft2_4 kernel and its descriptor to
 * X(khc2c_register) with the HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft2_4) (planner *p) {
X(khc2c_register) (p, hc2cbdft2_4, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft2_4 -include rdft/scalar/hc2cb.h */
/*
* This function contains 30 FP additions, 12 FP multiplications,
* (or, 24 additions, 6 multiplications, 6 fused multiply/add),
* 19 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft2_4 -- non-FMA variant of the size-4 codelet produced by
 * gen_hc2cdft (-sign 1 -n 4 -dif).
 *
 * Each pass over m in [mb, me) fetches two elements from each of Rp, Ip, Rm
 * and Im (offsets 0 and WS(rs, 1)), mixes them with the six twiddle values
 * W[0..5] and writes the results back in place.  Rp/Ip step forward by ms,
 * Rm/Im step backward by ms, and W advances by 6 per pass (the pass for m
 * uses the entries at W + (m - 1) * 6).
 */
static void hc2cbdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 6);
          m < me;
          m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
          /* Fetch all eight inputs before anything is overwritten. */
          E rp0 = Rp[0];
          E rm1 = Rm[WS(rs, 1)];
          E ip0 = Ip[0];
          E im1 = Im[WS(rs, 1)];
          E rp1 = Rp[WS(rs, 1)];
          E rm0 = Rm[0];
          E ip1 = Ip[WS(rs, 1)];
          E im0 = Im[0];

          /* Pairwise sums and differences (offset-0 pair "a", offset-1 pair "b"). */
          E r_sum_a = rp0 + rm1;
          E r_dif_a = rp0 - rm1;
          E i_sum_a = ip0 + im1;
          E i_dif_a = ip0 - im1;
          E r_sum_b = rp1 + rm0;
          E r_dif_b = rp1 - rm0;
          E i_sum_b = ip1 + im0;
          E i_dif_b = ip1 - im0;

          /* Operands of the three twiddle rotations (x_k, y_k). */
          E x0 = r_dif_b + i_sum_a;
          E y0 = r_dif_a - i_sum_b;
          E y2 = r_dif_a + i_sum_b;
          E x2 = i_sum_a - r_dif_b;
          E y1 = i_dif_a - i_dif_b;
          E x1 = r_sum_a - r_sum_b;

          /* Terms that reach the offset-0 outputs without a twiddle. */
          E plain_r = r_sum_a + r_sum_b;
          E plain_i = i_dif_a + i_dif_b;

          {
               /* Offset 0: rotate (x0, y0) by (W[0], W[1]). */
               E w0 = W[0];
               E w1 = W[1];
               E rot_a = FMA(w0, x0, w1 * y0);
               E rot_b = FNMS(w1, x0, w0 * y0);
               Rp[0] = plain_r - rot_a;
               Ip[0] = plain_i + rot_b;
               Rm[0] = plain_r + rot_a;
               Im[0] = rot_b - plain_i;
          }
          {
               /* Offset 1: (x1, y1) with W[2..3], (x2, y2) with W[4..5]. */
               E w2 = W[2];
               E w3 = W[3];
               E lo_a = FNMS(w3, y1, w2 * x1);
               E lo_b = FMA(w3, x1, w2 * y1);
               E w4 = W[4];
               E w5 = W[5];
               E hi_a = FMA(w4, x2, w5 * y2);
               E hi_b = FNMS(w5, x2, w4 * y2);
               Rp[WS(rs, 1)] = lo_a - hi_a;
               Ip[WS(rs, 1)] = lo_b + hi_b;
               Rm[WS(rs, 1)] = lo_a + hi_a;
               Im[WS(rs, 1)] = hi_b - lo_b;
          }
     }
}
/*
 * Twiddle-instruction table for the size-4 codelet; it is referenced from
 * desc below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file --
 * presumably "full twiddle set for n = 4" followed by an end/advance
 * marker; confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 4, name string, the twiddle table above, a
 * pointer to GENUS and a four-entry count block.  The first three counts
 * (24, 6, 6) equal the additions / multiplications / fused multiply-add
 * figures quoted in the generator header of this variant.
 */
static const hc2c_desc desc = { 4, "hc2cbdft2_4", twinstr, &GENUS, { 24, 6, 6, 0 } };
/*
 * Planner hook: passes the hc2cbdft2_4 kernel and its descriptor to
 * X(khc2c_register) with the HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft2_4) (planner *p) {
X(khc2c_register) (p, hc2cbdft2_4, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,424 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:14 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include rdft/scalar/hc2cb.h */
/*
* This function contains 82 FP additions, 36 FP multiplications,
* (or, 60 additions, 14 multiplications, 22 fused multiply/add),
* 41 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft2_8 -- FMA-oriented variant of the size-8 codelet emitted by
 * gen_hc2cdft (see the "Generated by" line above: -fma -sign 1 -n 8 -dif).
 *
 * For every m in [mb, me) the kernel reads four elements from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 3)), combines them with the
 * 14 twiddle values W[0..13] and stores the results back to the same
 * locations (in place; all loads of an iteration precede its first store).
 *
 *   Rp, Ip  read/written; advanced by +ms after each iteration
 *   Rm, Im  read/written; advanced by -ms after each iteration
 *   W       twiddle table; 14 entries per iteration, the entries for
 *           iteration m start at W + (m - 1) * 14
 *   rs      stride argument handed to WS() for element indexing
 *   mb, me  half-open iteration range
 *   ms      pointer step between consecutive iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file -- presumably a * b + c and c - a * b; confirm.
 */
static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constant ~sqrt(1/2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediates that survive from the load stage into the output stages. */
E Tl, T1p, T1g, TM, T1k, TE, TP, T1f, T7, Te, TU, TH, T1l, Tw, T1q;
E T1c, T1y;
/*
 * Load stage: Rp/Ip at offsets 0, 2, 1, 3 paired with Rm/Im at offsets
 * 3, 1, 2, 0; each pair is reduced to a sum and a difference, which are
 * then combined further.
 */
{
E T3, TA, Tk, TN, T6, Th, TD, TO, Ta, Tm, Tp, TK, Td, Tr, Tu;
E TL, TF, TG;
{
E T1, T2, Ti, Tj;
T1 = Rp[0];
T2 = Rm[WS(rs, 3)];
T3 = T1 + T2;
TA = T1 - T2;
Ti = Ip[0];
Tj = Im[WS(rs, 3)];
Tk = Ti + Tj;
TN = Ti - Tj;
}
{
E T4, T5, TB, TC;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 1)];
T6 = T4 + T5;
Th = T4 - T5;
TB = Ip[WS(rs, 2)];
TC = Im[WS(rs, 1)];
TD = TB + TC;
TO = TB - TC;
}
{
E T8, T9, Tn, To;
T8 = Rp[WS(rs, 1)];
T9 = Rm[WS(rs, 2)];
Ta = T8 + T9;
Tm = T8 - T9;
Tn = Ip[WS(rs, 1)];
To = Im[WS(rs, 2)];
Tp = Tn + To;
TK = Tn - To;
}
{
E Tb, Tc, Ts, Tt;
Tb = Rm[0];
Tc = Rp[WS(rs, 3)];
Td = Tb + Tc;
Tr = Tb - Tc;
Ts = Im[0];
Tt = Ip[WS(rs, 3)];
Tu = Ts + Tt;
TL = Tt - Ts;
}
Tl = Th + Tk;
T1p = TA + TD;
T1g = TN - TO;
TM = TK + TL;
T1k = Tk - Th;
TE = TA - TD;
TP = TN + TO;
T1f = Ta - Td;
T7 = T3 + T6;
Te = Ta + Td;
TU = T7 - Te;
TF = Tm - Tp;
TG = Tr - Tu;
TH = TF + TG;
T1l = TF - TG;
{
E Tq, Tv, T1a, T1b;
Tq = Tm + Tp;
Tv = Tr + Tu;
Tw = Tq - Tv;
T1q = Tq + Tv;
T1a = T3 - T6;
T1b = TL - TK;
T1c = T1a + T1b;
T1y = T1a - T1b;
}
}
/* Outputs at offset 0: twiddles W[0..1]; Tf and TQ enter untwiddled. */
{
E Tf, TQ, Tx, TI, Ty, TR, Tg, TJ, TS, Tz;
Tf = T7 + Te;
TQ = TM + TP;
Tx = FMA(KP707106781, Tw, Tl);
TI = FMA(KP707106781, TH, TE);
Tg = W[0];
Ty = Tg * Tx;
TR = Tg * TI;
Tz = W[1];
TJ = FMA(Tz, TI, Ty);
TS = FNMS(Tz, Tx, TR);
Rp[0] = Tf - TJ;
Ip[0] = TQ + TS;
Rm[0] = Tf + TJ;
Im[0] = TS - TQ;
}
/* Outputs at offset 3: twiddles W[10..13]. */
{
E T1B, T1A, T1J, T1x, T1z, T1E, T1H, T1F, T1L, T1D;
T1B = T1g - T1f;
T1A = W[11];
T1J = T1A * T1y;
T1x = W[10];
T1z = T1x * T1y;
T1E = FNMS(KP707106781, T1l, T1k);
T1H = FMA(KP707106781, T1q, T1p);
T1D = W[12];
T1F = T1D * T1E;
T1L = T1D * T1H;
{
E T1C, T1K, T1I, T1M, T1G;
T1C = FNMS(T1A, T1B, T1z);
T1K = FMA(T1x, T1B, T1J);
T1G = W[13];
T1I = FMA(T1G, T1H, T1F);
T1M = FNMS(T1G, T1E, T1L);
Rp[WS(rs, 3)] = T1C - T1I;
Ip[WS(rs, 3)] = T1K + T1M;
Rm[WS(rs, 3)] = T1C + T1I;
Im[WS(rs, 3)] = T1M - T1K;
}
}
/* Outputs at offset 2: twiddles W[6..9]. */
{
E TX, TW, T15, TT, TV, T10, T13, T11, T17, TZ;
TX = TP - TM;
TW = W[7];
T15 = TW * TU;
TT = W[6];
TV = TT * TU;
T10 = FNMS(KP707106781, Tw, Tl);
T13 = FNMS(KP707106781, TH, TE);
TZ = W[8];
T11 = TZ * T10;
T17 = TZ * T13;
{
E TY, T16, T14, T18, T12;
TY = FNMS(TW, TX, TV);
T16 = FMA(TT, TX, T15);
T12 = W[9];
T14 = FMA(T12, T13, T11);
T18 = FNMS(T12, T10, T17);
Rp[WS(rs, 2)] = TY - T14;
Ip[WS(rs, 2)] = T16 + T18;
Rm[WS(rs, 2)] = TY + T14;
Im[WS(rs, 2)] = T18 - T16;
}
}
/* Outputs at offset 1: twiddles W[2..5]. */
{
E T1h, T1e, T1t, T19, T1d, T1m, T1r, T1n, T1v, T1j;
T1h = T1f + T1g;
T1e = W[3];
T1t = T1e * T1c;
T19 = W[2];
T1d = T19 * T1c;
T1m = FMA(KP707106781, T1l, T1k);
T1r = FNMS(KP707106781, T1q, T1p);
T1j = W[4];
T1n = T1j * T1m;
T1v = T1j * T1r;
{
E T1i, T1u, T1s, T1w, T1o;
T1i = FNMS(T1e, T1h, T1d);
T1u = FMA(T19, T1h, T1t);
T1o = W[5];
T1s = FMA(T1o, T1r, T1n);
T1w = FNMS(T1o, T1m, T1v);
Rp[WS(rs, 1)] = T1i - T1s;
Ip[WS(rs, 1)] = T1u + T1w;
Rm[WS(rs, 1)] = T1i + T1s;
Im[WS(rs, 1)] = T1w - T1u;
}
}
}
}
}
/*
 * Twiddle-instruction table for the size-8 codelet; it is referenced from
 * desc below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file --
 * presumably "full twiddle set for n = 8" followed by an end/advance
 * marker; confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 8, name string, the twiddle table above, a
 * pointer to GENUS and a four-entry count block.  The first three counts
 * (60, 14, 22) equal the additions / multiplications / fused multiply-add
 * figures quoted in the generator header of this variant.
 */
static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, { 60, 14, 22, 0 } };
/*
 * Planner hook: passes the hc2cbdft2_8 kernel and its descriptor to
 * X(khc2c_register) with the HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft2_8) (planner *p) {
X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include rdft/scalar/hc2cb.h */
/*
* This function contains 82 FP additions, 32 FP multiplications,
* (or, 68 additions, 18 multiplications, 14 fused multiply/add),
* 30 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft2_8 -- non-FMA variant of the size-8 codelet emitted by
 * gen_hc2cdft (see the "Generated by" line above: -sign 1 -n 8 -dif).
 *
 * For every m in [mb, me) the kernel reads four elements from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 3)), combines them with the
 * 14 twiddle values W[0..13] and stores the results back to the same
 * locations (in place; all loads of an iteration precede its first store).
 *
 *   Rp, Ip  read/written; advanced by +ms after each iteration
 *   Rm, Im  read/written; advanced by -ms after each iteration
 *   W       twiddle table; 14 entries per iteration, the entries for
 *           iteration m start at W + (m - 1) * 14
 *   rs      stride argument handed to WS() for element indexing
 *   mb, me  half-open iteration range
 *   ms      pointer step between consecutive iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file -- presumably a * b + c and c - a * b; confirm.
 */
static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constant ~sqrt(1/2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediates that survive from the load stages into the output stages. */
E T7, T1d, T1h, Tl, TG, T14, T19, TO, Te, TL, T18, T15, TB, T1e, Tw;
E T1i;
/* Load stage 1: Rp/Ip at offsets 0 and 2 paired with Rm/Im at offsets 3 and 1. */
{
E T3, TC, Tk, TM, T6, Th, TF, TN;
{
E T1, T2, Ti, Tj;
T1 = Rp[0];
T2 = Rm[WS(rs, 3)];
T3 = T1 + T2;
TC = T1 - T2;
Ti = Ip[0];
Tj = Im[WS(rs, 3)];
Tk = Ti + Tj;
TM = Ti - Tj;
}
{
E T4, T5, TD, TE;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 1)];
T6 = T4 + T5;
Th = T4 - T5;
TD = Ip[WS(rs, 2)];
TE = Im[WS(rs, 1)];
TF = TD + TE;
TN = TD - TE;
}
T7 = T3 + T6;
T1d = Tk - Th;
T1h = TC + TF;
Tl = Th + Tk;
TG = TC - TF;
T14 = T3 - T6;
T19 = TM - TN;
TO = TM + TN;
}
/* Load stage 2: Rp/Ip at offsets 1 and 3 paired with Rm/Im at offsets 2 and 0. */
{
E Ta, Tm, Tp, TJ, Td, Tr, Tu, TK;
{
E T8, T9, Tn, To;
T8 = Rp[WS(rs, 1)];
T9 = Rm[WS(rs, 2)];
Ta = T8 + T9;
Tm = T8 - T9;
Tn = Ip[WS(rs, 1)];
To = Im[WS(rs, 2)];
Tp = Tn + To;
TJ = Tn - To;
}
{
E Tb, Tc, Ts, Tt;
Tb = Rm[0];
Tc = Rp[WS(rs, 3)];
Td = Tb + Tc;
Tr = Tb - Tc;
Ts = Im[0];
Tt = Ip[WS(rs, 3)];
Tu = Ts + Tt;
TK = Tt - Ts;
}
Te = Ta + Td;
TL = TJ + TK;
T18 = Ta - Td;
T15 = TK - TJ;
/* Mixed terms scaled by the 0.7071 constant. */
{
E Tz, TA, Tq, Tv;
Tz = Tm - Tp;
TA = Tr - Tu;
TB = KP707106781 * (Tz + TA);
T1e = KP707106781 * (Tz - TA);
Tq = Tm + Tp;
Tv = Tr + Tu;
Tw = KP707106781 * (Tq - Tv);
T1i = KP707106781 * (Tq + Tv);
}
}
/* Outputs at offset 0: twiddles W[0..1]; Tf and TP enter untwiddled. */
{
E Tf, TP, TI, TQ;
Tf = T7 + Te;
TP = TL + TO;
{
E Tx, TH, Tg, Ty;
Tx = Tl + Tw;
TH = TB + TG;
Tg = W[0];
Ty = W[1];
TI = FMA(Tg, Tx, Ty * TH);
TQ = FNMS(Ty, Tx, Tg * TH);
}
Rp[0] = Tf - TI;
Ip[0] = TP + TQ;
Rm[0] = Tf + TI;
Im[0] = TQ - TP;
}
/* Outputs at offset 3: twiddles W[10..13]. */
{
E T1r, T1x, T1w, T1y;
{
E T1o, T1q, T1n, T1p;
T1o = T14 - T15;
T1q = T19 - T18;
T1n = W[10];
T1p = W[11];
T1r = FNMS(T1p, T1q, T1n * T1o);
T1x = FMA(T1p, T1o, T1n * T1q);
}
{
E T1t, T1v, T1s, T1u;
T1t = T1d - T1e;
T1v = T1i + T1h;
T1s = W[12];
T1u = W[13];
T1w = FMA(T1s, T1t, T1u * T1v);
T1y = FNMS(T1u, T1t, T1s * T1v);
}
Rp[WS(rs, 3)] = T1r - T1w;
Ip[WS(rs, 3)] = T1x + T1y;
Rm[WS(rs, 3)] = T1r + T1w;
Im[WS(rs, 3)] = T1y - T1x;
}
/* Outputs at offset 2: twiddles W[6..9]. */
{
E TV, T11, T10, T12;
{
E TS, TU, TR, TT;
TS = T7 - Te;
TU = TO - TL;
TR = W[6];
TT = W[7];
TV = FNMS(TT, TU, TR * TS);
T11 = FMA(TT, TS, TR * TU);
}
{
E TX, TZ, TW, TY;
TX = Tl - Tw;
TZ = TG - TB;
TW = W[8];
TY = W[9];
T10 = FMA(TW, TX, TY * TZ);
T12 = FNMS(TY, TX, TW * TZ);
}
Rp[WS(rs, 2)] = TV - T10;
Ip[WS(rs, 2)] = T11 + T12;
Rm[WS(rs, 2)] = TV + T10;
Im[WS(rs, 2)] = T12 - T11;
}
/* Outputs at offset 1: twiddles W[2..5]. */
{
E T1b, T1l, T1k, T1m;
{
E T16, T1a, T13, T17;
T16 = T14 + T15;
T1a = T18 + T19;
T13 = W[2];
T17 = W[3];
T1b = FNMS(T17, T1a, T13 * T16);
T1l = FMA(T17, T16, T13 * T1a);
}
{
E T1f, T1j, T1c, T1g;
T1f = T1d + T1e;
T1j = T1h - T1i;
T1c = W[4];
T1g = W[5];
T1k = FMA(T1c, T1f, T1g * T1j);
T1m = FNMS(T1g, T1f, T1c * T1j);
}
Rp[WS(rs, 1)] = T1b - T1k;
Ip[WS(rs, 1)] = T1l + T1m;
Rm[WS(rs, 1)] = T1b + T1k;
Im[WS(rs, 1)] = T1m - T1l;
}
}
}
}
/*
 * Twiddle-instruction table for the size-8 codelet; it is referenced from
 * desc below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file --
 * presumably "full twiddle set for n = 8" followed by an end/advance
 * marker; confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 8, name string, the twiddle table above, a
 * pointer to GENUS and a four-entry count block.  The first three counts
 * (68, 18, 14) equal the additions / multiplications / fused multiply-add
 * figures quoted in the generator header of this variant.
 */
static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, { 68, 18, 14, 0 } };
/*
 * Planner hook: passes the hc2cbdft2_8 kernel and its descriptor to
 * X(khc2c_register) with the HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft2_8) (planner *p) {
X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,545 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cbdft_10 -include rdft/scalar/hc2cb.h */
/*
* This function contains 122 FP additions, 72 FP multiplications,
* (or, 68 additions, 18 multiplications, 54 fused multiply/add),
* 91 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft_10 -- FMA-oriented variant of the size-10 codelet emitted by
 * gen_hc2cdft (see the "Generated by" line above: -fma -sign 1 -n 10 -dif).
 *
 * For every m in [mb, me) the kernel reads five elements from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 4)), combines them with the
 * 18 twiddle values W[0..17] and stores the results back to the same
 * locations (in place; all loads of an iteration precede its first store).
 *
 *   Rp, Ip  read/written; advanced by +ms after each iteration
 *   Rm, Im  read/written; advanced by -ms after each iteration
 *   W       twiddle table; 18 entries per iteration, the entries for
 *           iteration m start at W + (m - 1) * 18
 *   rs      stride argument handed to WS() for element indexing
 *   mb, me  half-open iteration range
 *   ms      pointer step between consecutive iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file -- presumably a * b + c and c - a * b; confirm.
 */
static void hc2cbdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: ~sin(2*pi/5), ~sqrt(5)/4, ~(sqrt(5)-1)/2 and 1/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Intermediates that survive from the load stages into the output stage. */
E T3, Tl, Tu, T14, Ti, T13, Ts, Tt, T1p, T23, TZ, T1z, TQ, T1g, TV;
E T1l, TT, TU, T1j, T1k, T1c, T1Y, TK, T1u;
/* Load stage 1: all Rp / Rm inputs, reduced to sums and differences. */
{
E Td, Tp, Tg, Tq, Th, Tr, T6, Tm, T9, Tn, Ta, To, T1, T2;
T1 = Rp[0];
T2 = Rm[WS(rs, 4)];
T3 = T1 + T2;
Tl = T1 - T2;
{
E Tb, Tc, Te, Tf;
Tb = Rp[WS(rs, 4)];
Tc = Rm[0];
Td = Tb + Tc;
Tp = Tb - Tc;
Te = Rm[WS(rs, 3)];
Tf = Rp[WS(rs, 1)];
Tg = Te + Tf;
Tq = Te - Tf;
}
Th = Td + Tg;
Tr = Tp + Tq;
{
E T4, T5, T7, T8;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 2)];
T6 = T4 + T5;
Tm = T4 - T5;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 3)];
T9 = T7 + T8;
Tn = T7 - T8;
}
Ta = T6 + T9;
To = Tm + Tn;
Tu = To - Tr;
T14 = Ta - Th;
Ti = Ta + Th;
T13 = FNMS(KP250000000, Ti, T3);
Ts = To + Tr;
Tt = FNMS(KP250000000, Ts, Tl);
{
E T1n, T1o, TX, TY;
T1n = Td - Tg;
T1o = T6 - T9;
T1p = FNMS(KP618033988, T1o, T1n);
T23 = FMA(KP618033988, T1n, T1o);
TX = Tm - Tn;
TY = Tp - Tq;
TZ = FMA(KP618033988, TY, TX);
T1z = FNMS(KP618033988, TX, TY);
}
}
/* Load stage 2: all Ip / Im inputs, reduced the same way. */
{
E TF, T16, TI, T17, TS, T1i, Ty, T19, TB, T1a, TR, T1h, TO, TP;
TO = Ip[0];
TP = Im[WS(rs, 4)];
TQ = TO + TP;
T1g = TO - TP;
{
E TD, TE, TG, TH;
TD = Ip[WS(rs, 4)];
TE = Im[0];
TF = TD + TE;
T16 = TD - TE;
TG = Im[WS(rs, 3)];
TH = Ip[WS(rs, 1)];
TI = TG + TH;
T17 = TH - TG;
}
TS = TF - TI;
T1i = T16 + T17;
{
E Tw, Tx, Tz, TA;
Tw = Ip[WS(rs, 2)];
Tx = Im[WS(rs, 2)];
Ty = Tw + Tx;
T19 = Tw - Tx;
Tz = Im[WS(rs, 1)];
TA = Ip[WS(rs, 3)];
TB = Tz + TA;
T1a = TA - Tz;
}
TR = Ty - TB;
T1h = T19 + T1a;
TV = TR - TS;
T1l = T1h - T1i;
TT = TR + TS;
TU = FNMS(KP250000000, TT, TQ);
T1j = T1h + T1i;
T1k = FNMS(KP250000000, T1j, T1g);
{
E T18, T1b, TC, TJ;
T18 = T16 - T17;
T1b = T19 - T1a;
T1c = FNMS(KP618033988, T1b, T18);
T1Y = FMA(KP618033988, T18, T1b);
TC = Ty + TB;
TJ = TF + TI;
TK = FMA(KP618033988, TJ, TC);
T1u = FNMS(KP618033988, TC, TJ);
}
}
/*
 * Output stage: finish the butterflies, form the products with W[0..17]
 * and store.  Tj and T2y reach the offset-0 outputs without a twiddle.
 */
{
E Tj, T2y, T2a, T1A, T2q, T10, T1Q, T24, T2k, T1q, T1K, T26, T28, T29, T2c;
E Tk, TM, TN, T2w, T1M, T1O, T1P, T1S, T1s, T1w, T1x, T1C, T2m, T2o, T2p;
E T2s, T12, T1e, T1f, T1E, T1G, T1I, T1J, T1U, T1W, T20, T21, T2e, T2g, T2i;
E T2j, T2u, T1y, TW, T22, T2l, T2r;
Tj = T3 + Ti;
T2y = T1g + T1j;
T2a = TQ + TT;
T1y = FNMS(KP559016994, TV, TU);
T1A = FMA(KP951056516, T1z, T1y);
T2q = FNMS(KP951056516, T1z, T1y);
TW = FMA(KP559016994, TV, TU);
T10 = FMA(KP951056516, TZ, TW);
T1Q = FNMS(KP951056516, TZ, TW);
T22 = FMA(KP559016994, T1l, T1k);
T24 = FNMS(KP951056516, T23, T22);
T2k = FMA(KP951056516, T23, T22);
/* First halves of the twiddle products (one multiply per W entry). */
{
E T1m, T1v, T2n, T1t;
T1m = FNMS(KP559016994, T1l, T1k);
T1q = FNMS(KP951056516, T1p, T1m);
T1K = FMA(KP951056516, T1p, T1m);
{
E T27, TL, T1N, Tv;
T27 = Tl + Ts;
T26 = W[9];
T28 = T26 * T27;
T29 = W[8];
T2c = T29 * T27;
Tv = FMA(KP559016994, Tu, Tt);
TL = FNMS(KP951056516, TK, Tv);
T1N = FMA(KP951056516, TK, Tv);
Tk = W[1];
TM = Tk * TL;
TN = W[0];
T2w = TN * TL;
T1M = W[17];
T1O = T1M * T1N;
T1P = W[16];
T1S = T1P * T1N;
}
T1t = FNMS(KP559016994, Tu, Tt);
T1v = FNMS(KP951056516, T1u, T1t);
T2n = FMA(KP951056516, T1u, T1t);
T1s = W[5];
T1w = T1s * T1v;
T1x = W[4];
T1C = T1x * T1v;
T2m = W[13];
T2o = T2m * T2n;
T2p = W[12];
T2s = T2p * T2n;
{
E T1d, T1H, T15, T1Z, T2h, T1X;
T15 = FNMS(KP559016994, T14, T13);
T1d = FMA(KP951056516, T1c, T15);
T1H = FNMS(KP951056516, T1c, T15);
T12 = W[2];
T1e = T12 * T1d;
T1f = W[3];
T1E = T1f * T1d;
T1G = W[14];
T1I = T1G * T1H;
T1J = W[15];
T1U = T1J * T1H;
T1X = FMA(KP559016994, T14, T13);
T1Z = FMA(KP951056516, T1Y, T1X);
T2h = FNMS(KP951056516, T1Y, T1X);
T1W = W[6];
T20 = T1W * T1Z;
T21 = W[7];
T2e = T21 * T1Z;
T2g = W[10];
T2i = T2g * T2h;
T2j = W[11];
T2u = T2j * T2h;
}
}
/* Stores for offsets 0, 1 and the Rp/Rm half of offset 4. */
{
E T11, T2x, T1r, T1B;
T11 = FMA(TN, T10, TM);
Rp[0] = Tj - T11;
Rm[0] = Tj + T11;
T2x = FNMS(Tk, T10, T2w);
Im[0] = T2x - T2y;
Ip[0] = T2x + T2y;
T1r = FNMS(T1f, T1q, T1e);
T1B = FMA(T1x, T1A, T1w);
Rp[WS(rs, 1)] = T1r - T1B;
Rm[WS(rs, 1)] = T1B + T1r;
{
E T1D, T1F, T1L, T1R;
T1D = FNMS(T1s, T1A, T1C);
T1F = FMA(T12, T1q, T1E);
Im[WS(rs, 1)] = T1D - T1F;
Ip[WS(rs, 1)] = T1D + T1F;
T1L = FNMS(T1J, T1K, T1I);
T1R = FMA(T1P, T1Q, T1O);
Rp[WS(rs, 4)] = T1L - T1R;
Rm[WS(rs, 4)] = T1R + T1L;
}
}
/* Ip/Im half of offset 4, then offset 3. */
{
E T1T, T1V, T2t, T2v;
T1T = FNMS(T1M, T1Q, T1S);
T1V = FMA(T1G, T1K, T1U);
Im[WS(rs, 4)] = T1T - T1V;
Ip[WS(rs, 4)] = T1T + T1V;
T2t = FNMS(T2m, T2q, T2s);
T2v = FMA(T2g, T2k, T2u);
Im[WS(rs, 3)] = T2t - T2v;
Ip[WS(rs, 3)] = T2t + T2v;
}
T2l = FNMS(T2j, T2k, T2i);
T2r = FMA(T2p, T2q, T2o);
Rp[WS(rs, 3)] = T2l - T2r;
Rm[WS(rs, 3)] = T2r + T2l;
/* Offset 2: twiddles W[6..9]. */
{
E T25, T2b, T2d, T2f;
T25 = FNMS(T21, T24, T20);
T2b = FMA(T29, T2a, T28);
Rp[WS(rs, 2)] = T25 - T2b;
Rm[WS(rs, 2)] = T2b + T25;
T2d = FNMS(T26, T2a, T2c);
T2f = FMA(T1W, T24, T2e);
Im[WS(rs, 2)] = T2d - T2f;
Ip[WS(rs, 2)] = T2d + T2f;
}
}
}
}
}
/*
 * Twiddle-instruction table for the size-10 codelet; it is referenced from
 * desc below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file --
 * presumably "full twiddle set for n = 10" followed by an end/advance
 * marker; confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 10, name string, the twiddle table above, a
 * pointer to GENUS and a four-entry count block.  The first three counts
 * (68, 18, 54) equal the additions / multiplications / fused multiply-add
 * figures quoted in the generator header of this variant.
 */
static const hc2c_desc desc = { 10, "hc2cbdft_10", twinstr, &GENUS, { 68, 18, 54, 0 } };
/*
 * Planner hook: passes the hc2cbdft_10 kernel and its descriptor to
 * X(khc2c_register) with the HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft_10) (planner *p) {
X(khc2c_register) (p, hc2cbdft_10, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cbdft_10 -include rdft/scalar/hc2cb.h */
/*
* This function contains 122 FP additions, 60 FP multiplications,
* (or, 92 additions, 30 multiplications, 30 fused multiply/add),
* 61 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft_10 -- non-FMA variant of the size-10 codelet emitted by
 * gen_hc2cdft (see the "Generated by" line above: -sign 1 -n 10 -dif).
 *
 * For every m in [mb, me) the kernel reads five elements from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 4)), combines them with the
 * 18 twiddle values W[0..17] and stores the results back to the same
 * locations (in place; all loads of an iteration precede its first store).
 *
 *   Rp, Ip  read/written; advanced by +ms after each iteration
 *   Rm, Im  read/written; advanced by -ms after each iteration
 *   W       twiddle table; 18 entries per iteration, the entries for
 *           iteration m start at W + (m - 1) * 18
 *   rs      stride argument handed to WS() for element indexing
 *   mb, me  half-open iteration range
 *   ms      pointer step between consecutive iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file -- presumably a * b + c and c - a * b; confirm.
 */
static void hc2cbdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: ~sin(2*pi/5), ~sin(pi/5), 1/4 and ~sqrt(5)/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
/* Intermediates that survive from the load stages into the output stage. */
E T3, TS, TR, T13, Ti, T12, TT, TU, T1g, T1T, Tr, T1s, TJ, T1h, TG;
E T1m, TK, TL, T1k, T1l, T1b, T1P, TY, T1w;
/* Load stage 1: all Rp / Rm inputs, reduced to sums and differences. */
{
E Td, To, Tg, Tp, Th, TQ, T6, Tl, T9, Tm, Ta, TP, T1, T2;
T1 = Rp[0];
T2 = Rm[WS(rs, 4)];
T3 = T1 + T2;
TS = T1 - T2;
{
E Tb, Tc, Te, Tf;
Tb = Rp[WS(rs, 4)];
Tc = Rm[0];
Td = Tb + Tc;
To = Tb - Tc;
Te = Rm[WS(rs, 3)];
Tf = Rp[WS(rs, 1)];
Tg = Te + Tf;
Tp = Te - Tf;
}
Th = Td + Tg;
TQ = To + Tp;
{
E T4, T5, T7, T8;
T4 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 2)];
T6 = T4 + T5;
Tl = T4 - T5;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 3)];
T9 = T7 + T8;
Tm = T7 - T8;
}
Ta = T6 + T9;
TP = Tl + Tm;
TR = KP559016994 * (TP - TQ);
T13 = KP559016994 * (Ta - Th);
Ti = Ta + Th;
T12 = FNMS(KP250000000, Ti, T3);
TT = TP + TQ;
TU = FNMS(KP250000000, TT, TS);
{
E T1e, T1f, Tn, Tq;
T1e = T6 - T9;
T1f = Td - Tg;
T1g = FNMS(KP951056516, T1f, KP587785252 * T1e);
T1T = FMA(KP951056516, T1e, KP587785252 * T1f);
Tn = Tl - Tm;
Tq = To - Tp;
Tr = FMA(KP951056516, Tn, KP587785252 * Tq);
T1s = FNMS(KP951056516, Tq, KP587785252 * Tn);
}
}
/* Load stage 2: all Ip / Im inputs, reduced the same way. */
{
E TB, T18, TE, T19, TF, T1j, Tu, T15, Tx, T16, Ty, T1i, TH, TI;
TH = Ip[0];
TI = Im[WS(rs, 4)];
TJ = TH + TI;
T1h = TH - TI;
{
E Tz, TA, TC, TD;
Tz = Ip[WS(rs, 4)];
TA = Im[0];
TB = Tz + TA;
T18 = Tz - TA;
TC = Im[WS(rs, 3)];
TD = Ip[WS(rs, 1)];
TE = TC + TD;
T19 = TD - TC;
}
TF = TB - TE;
T1j = T18 + T19;
{
E Ts, Tt, Tv, Tw;
Ts = Ip[WS(rs, 2)];
Tt = Im[WS(rs, 2)];
Tu = Ts + Tt;
T15 = Ts - Tt;
Tv = Im[WS(rs, 1)];
Tw = Ip[WS(rs, 3)];
Tx = Tv + Tw;
T16 = Tw - Tv;
}
Ty = Tu - Tx;
T1i = T15 + T16;
TG = KP559016994 * (Ty - TF);
T1m = KP559016994 * (T1i - T1j);
TK = Ty + TF;
TL = FNMS(KP250000000, TK, TJ);
T1k = T1i + T1j;
T1l = FNMS(KP250000000, T1k, T1h);
{
E T17, T1a, TW, TX;
T17 = T15 - T16;
T1a = T18 - T19;
T1b = FNMS(KP951056516, T1a, KP587785252 * T17);
T1P = FMA(KP951056516, T17, KP587785252 * T1a);
TW = Tu + Tx;
TX = TB + TE;
TY = FMA(KP951056516, TW, KP587785252 * TX);
T1w = FNMS(KP951056516, TX, KP587785252 * TW);
}
}
/*
 * Output stage: finish the butterflies, apply W[0..17] and store.
 * Tj and T2g reach the offset-0 outputs without a twiddle.
 */
{
E Tj, T2g, TN, T1H, T1U, T26, TZ, T1J, T1Q, T24, T1c, T1C, T1t, T29, T1o;
E T1E, T1x, T2b, T20, T21, TM, T1S, TV;
Tj = T3 + Ti;
T2g = T1h + T1k;
TM = TG + TL;
TN = Tr + TM;
T1H = TM - Tr;
T1S = T1m + T1l;
T1U = T1S - T1T;
T26 = T1T + T1S;
TV = TR + TU;
TZ = TV - TY;
T1J = TV + TY;
{
E T1O, T14, T1r, T1n, T1v;
T1O = T13 + T12;
T1Q = T1O + T1P;
T24 = T1O - T1P;
T14 = T12 - T13;
T1c = T14 - T1b;
T1C = T14 + T1b;
T1r = TL - TG;
T1t = T1r - T1s;
T29 = T1s + T1r;
T1n = T1l - T1m;
T1o = T1g + T1n;
T1E = T1n - T1g;
T1v = TU - TR;
T1x = T1v + T1w;
T2b = T1v - T1w;
/* Second operand pair for offset 2 (twiddles W[8..9]), stored further below. */
{
E T1X, T1Z, T1W, T1Y;
T1X = TS + TT;
T1Z = TJ + TK;
T1W = W[9];
T1Y = W[8];
T20 = FMA(T1W, T1X, T1Y * T1Z);
T21 = FNMS(T1W, T1Z, T1Y * T1X);
}
}
/* Offset 0: twiddles W[0..1]. */
{
E T10, T2f, Tk, TO;
Tk = W[0];
TO = W[1];
T10 = FMA(Tk, TN, TO * TZ);
T2f = FNMS(TO, TN, Tk * TZ);
Rp[0] = Tj - T10;
Ip[0] = T2f + T2g;
Rm[0] = Tj + T10;
Im[0] = T2f - T2g;
}
/* Offset 2: twiddles W[6..7] combined with T20 / T21 from above. */
{
E T1V, T22, T1N, T1R;
T1N = W[6];
T1R = W[7];
T1V = FNMS(T1R, T1U, T1N * T1Q);
T22 = FMA(T1R, T1Q, T1N * T1U);
Rp[WS(rs, 2)] = T1V - T20;
Ip[WS(rs, 2)] = T21 + T22;
Rm[WS(rs, 2)] = T20 + T1V;
Im[WS(rs, 2)] = T21 - T22;
}
/* Offset 1: twiddles W[2..5]. */
{
E T1p, T1A, T1y, T1z;
{
E T11, T1d, T1q, T1u;
T11 = W[2];
T1d = W[3];
T1p = FNMS(T1d, T1o, T11 * T1c);
T1A = FMA(T1d, T1c, T11 * T1o);
T1q = W[4];
T1u = W[5];
T1y = FMA(T1q, T1t, T1u * T1x);
T1z = FNMS(T1u, T1t, T1q * T1x);
}
Rp[WS(rs, 1)] = T1p - T1y;
Ip[WS(rs, 1)] = T1z + T1A;
Rm[WS(rs, 1)] = T1y + T1p;
Im[WS(rs, 1)] = T1z - T1A;
}
/* Offset 4: twiddles W[14..17]. */
{
E T1F, T1M, T1K, T1L;
{
E T1B, T1D, T1G, T1I;
T1B = W[14];
T1D = W[15];
T1F = FNMS(T1D, T1E, T1B * T1C);
T1M = FMA(T1D, T1C, T1B * T1E);
T1G = W[16];
T1I = W[17];
T1K = FMA(T1G, T1H, T1I * T1J);
T1L = FNMS(T1I, T1H, T1G * T1J);
}
Rp[WS(rs, 4)] = T1F - T1K;
Ip[WS(rs, 4)] = T1L + T1M;
Rm[WS(rs, 4)] = T1K + T1F;
Im[WS(rs, 4)] = T1L - T1M;
}
/* Offset 3: twiddles W[10..13]. */
{
E T27, T2e, T2c, T2d;
{
E T23, T25, T28, T2a;
T23 = W[10];
T25 = W[11];
T27 = FNMS(T25, T26, T23 * T24);
T2e = FMA(T25, T24, T23 * T26);
T28 = W[12];
T2a = W[13];
T2c = FMA(T28, T29, T2a * T2b);
T2d = FNMS(T2a, T29, T28 * T2b);
}
Rp[WS(rs, 3)] = T27 - T2c;
Ip[WS(rs, 3)] = T2d + T2e;
Rm[WS(rs, 3)] = T2c + T27;
Im[WS(rs, 3)] = T2d - T2e;
}
}
}
}
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 10) followed by one TW_NEXT entry.
 * NOTE(review): TW_NEXT presumably marks the end of the list -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to khc2c_register for hc2cbdft_10: the value 10, the
 * codelet name, the twiddle list above, the GENUS object, and a four-entry
 * block {92, 30, 30, 0}.
 * NOTE(review): the four-entry block presumably holds the operation counts
 * (adds, muls, fused multiply/adds, other) -- confirm against hc2c_desc.
 */
static const hc2c_desc desc = { 10, "hc2cbdft_10", twinstr, &GENUS, { 92, 30, 30, 0 } };
/*
 * Registration entry point: passes the hc2cbdft_10 kernel and its
 * descriptor to khc2c_register on planner p, with the HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft_10) (planner *p) {
X(khc2c_register) (p, hc2cbdft_10, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,643 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cbdft_12 -include rdft/scalar/hc2cb.h */
/*
* This function contains 142 FP additions, 68 FP multiplications,
* (or, 96 additions, 22 multiplications, 46 fused multiply/add),
* 55 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft_12 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop body reads six values from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 5)), combines them with
 * the 22 twiddle values W[0] .. W[21], and stores results back to those
 * same 24 locations.  Every load in the body precedes the first store.
 *
 * Between iterations Rp and Ip advance by +ms, Rm and Im by -ms, and W by
 * 22.  W is offset by (mb - 1) * 22 before the first iteration.
 */
static void hc2cbdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically sqrt(3)/2 and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
/* Temporaries that outlive the two load sections below. */
E Tv, TC, TD, T1L, T1M, T2y, Tb, T1Z, T1E, T2D, T1e, T1U, TY, T2o, T13;
E T18, T19, T1O, T1P, T2E, Tm, T1V, T1H, T2z, T1h, T20, TO, T2p;
/* Load section 1: Rp[0,2,4], Rm[1,3,5], Ip[0,2,4], Im[1,3,5]. */
{
E T1, T4, Tu, TS, Tp, Ts, Tt, TT, T6, T9, TB, TV, Tw, Tz, TA;
E TW;
{
E T2, T3, Tq, Tr;
T1 = Rp[0];
T2 = Rp[WS(rs, 4)];
T3 = Rm[WS(rs, 3)];
T4 = T2 + T3;
Tu = T2 - T3;
TS = FNMS(KP500000000, T4, T1);
Tp = Ip[0];
Tq = Ip[WS(rs, 4)];
Tr = Im[WS(rs, 3)];
Ts = Tq - Tr;
Tt = FNMS(KP500000000, Ts, Tp);
TT = Tr + Tq;
}
{
E T7, T8, Tx, Ty;
T6 = Rm[WS(rs, 5)];
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 2)];
T9 = T7 + T8;
TB = T7 - T8;
TV = FNMS(KP500000000, T9, T6);
Tw = Im[WS(rs, 5)];
Tx = Im[WS(rs, 1)];
Ty = Ip[WS(rs, 2)];
Tz = Tx - Ty;
TA = FNMS(KP500000000, Tz, Tw);
TW = Tx + Ty;
}
{
E T5, Ta, T1C, T1D;
Tv = FMA(KP866025403, Tu, Tt);
TC = FNMS(KP866025403, TB, TA);
TD = Tv + TC;
T1L = FNMS(KP866025403, Tu, Tt);
T1M = FMA(KP866025403, TB, TA);
T2y = T1L + T1M;
T5 = T1 + T4;
Ta = T6 + T9;
Tb = T5 + Ta;
T1Z = T5 - Ta;
T1C = FMA(KP866025403, TT, TS);
T1D = FNMS(KP866025403, TW, TV);
T1E = T1C + T1D;
T2D = T1C - T1D;
{
E T1c, T1d, TU, TX;
T1c = Tp + Ts;
T1d = Tw + Tz;
T1e = T1c - T1d;
T1U = T1c + T1d;
TU = FNMS(KP866025403, TT, TS);
TX = FMA(KP866025403, TW, TV);
TY = TU - TX;
T2o = TU + TX;
}
}
}
/* Load section 2: Rp[1,3,5], Rm[0,2,4], Ip[1,3,5], Im[0,2,4]. */
{
E Tc, Tf, TE, T12, TZ, T10, TH, T11, Th, Tk, TJ, T17, T14, T15, TM;
E T16;
{
E Td, Te, TF, TG;
Tc = Rp[WS(rs, 3)];
Td = Rm[WS(rs, 4)];
Te = Rm[0];
Tf = Td + Te;
TE = FNMS(KP500000000, Tf, Tc);
T12 = Td - Te;
TZ = Ip[WS(rs, 3)];
TF = Im[WS(rs, 4)];
TG = Im[0];
T10 = TF + TG;
TH = TF - TG;
T11 = FMA(KP500000000, T10, TZ);
}
{
E Ti, Tj, TK, TL;
Th = Rm[WS(rs, 2)];
Ti = Rp[WS(rs, 1)];
Tj = Rp[WS(rs, 5)];
Tk = Ti + Tj;
TJ = FNMS(KP500000000, Tk, Th);
T17 = Ti - Tj;
T14 = Im[WS(rs, 2)];
TK = Ip[WS(rs, 5)];
TL = Ip[WS(rs, 1)];
T15 = TK + TL;
TM = TK - TL;
T16 = FMA(KP500000000, T15, T14);
}
{
E Tg, Tl, T1F, T1G;
T13 = FMA(KP866025403, T12, T11);
T18 = FNMS(KP866025403, T17, T16);
T19 = T13 + T18;
T1O = FNMS(KP866025403, T12, T11);
T1P = FMA(KP866025403, T17, T16);
T2E = T1O + T1P;
Tg = Tc + Tf;
Tl = Th + Tk;
Tm = Tg + Tl;
T1V = Tg - Tl;
T1F = FNMS(KP866025403, TH, TE);
T1G = FNMS(KP866025403, TM, TJ);
T1H = T1F + T1G;
T2z = T1F - T1G;
{
E T1f, T1g, TI, TN;
T1f = TZ - T10;
T1g = T15 - T14;
T1h = T1f + T1g;
T20 = T1f - T1g;
TI = FMA(KP866025403, TH, TE);
TN = FMA(KP866025403, TM, TJ);
TO = TI - TN;
T2p = TI + TN;
}
}
}
/* Outputs at offset 0; uses twiddles W[0], W[1]. */
{
E Tn, T1i, TP, T1a, TQ, T1j, To, T1b, T1k, TR;
Tn = Tb + Tm;
T1i = T1e + T1h;
TP = TD + TO;
T1a = TY - T19;
To = W[0];
TQ = To * TP;
T1j = To * T1a;
TR = W[1];
T1b = FMA(TR, T1a, TQ);
T1k = FNMS(TR, TP, T1j);
Rp[0] = Tn - T1b;
Ip[0] = T1i + T1k;
Rm[0] = Tn + T1b;
Im[0] = T1k - T1i;
}
/* Outputs at offset WS(rs, 3); uses twiddles W[10] .. W[13]. */
{
E T1p, T1l, T1n, T1o, T1x, T1s, T1v, T1t, T1z, T1m, T1r;
T1p = T1e - T1h;
T1m = Tb - Tm;
T1l = W[10];
T1n = T1l * T1m;
T1o = W[11];
T1x = T1o * T1m;
T1s = TD - TO;
T1v = TY + T19;
T1r = W[12];
T1t = T1r * T1s;
T1z = T1r * T1v;
{
E T1q, T1y, T1w, T1A, T1u;
T1q = FNMS(T1o, T1p, T1n);
T1y = FMA(T1l, T1p, T1x);
T1u = W[13];
T1w = FMA(T1u, T1v, T1t);
T1A = FNMS(T1u, T1s, T1z);
Rp[WS(rs, 3)] = T1q - T1w;
Ip[WS(rs, 3)] = T1y + T1A;
Rm[WS(rs, 3)] = T1q + T1w;
Im[WS(rs, 3)] = T1A - T1y;
}
}
/*
 * Outputs at offsets WS(rs, 1) (twiddles W[2] .. W[5]) and
 * WS(rs, 4) (twiddles W[14] .. W[17]).
 */
{
E T1R, T2b, T27, T29, T2a, T2l, T1B, T1J, T1K, T25, T1W, T21, T1X, T23, T2e;
E T2h, T2f, T2j;
{
E T1N, T1Q, T28, T1I, T1T, T2d;
T1N = T1L - T1M;
T1Q = T1O - T1P;
T1R = T1N - T1Q;
T2b = T1N + T1Q;
T28 = T1E + T1H;
T27 = W[14];
T29 = T27 * T28;
T2a = W[15];
T2l = T2a * T28;
T1I = T1E - T1H;
T1B = W[2];
T1J = T1B * T1I;
T1K = W[3];
T25 = T1K * T1I;
T1W = T1U - T1V;
T21 = T1Z + T20;
T1T = W[4];
T1X = T1T * T1W;
T23 = T1T * T21;
T2e = T1V + T1U;
T2h = T1Z - T20;
T2d = W[16];
T2f = T2d * T2e;
T2j = T2d * T2h;
}
{
E T1S, T26, T22, T24, T1Y;
T1S = FNMS(T1K, T1R, T1J);
T26 = FMA(T1B, T1R, T25);
T1Y = W[5];
T22 = FMA(T1Y, T21, T1X);
T24 = FNMS(T1Y, T1W, T23);
Rp[WS(rs, 1)] = T1S - T22;
Ip[WS(rs, 1)] = T24 + T26;
Rm[WS(rs, 1)] = T22 + T1S;
Im[WS(rs, 1)] = T24 - T26;
}
{
E T2c, T2m, T2i, T2k, T2g;
T2c = FNMS(T2a, T2b, T29);
T2m = FMA(T27, T2b, T2l);
T2g = W[17];
T2i = FMA(T2g, T2h, T2f);
T2k = FNMS(T2g, T2e, T2j);
Rp[WS(rs, 4)] = T2c - T2i;
Ip[WS(rs, 4)] = T2k + T2m;
Rm[WS(rs, 4)] = T2i + T2c;
Im[WS(rs, 4)] = T2k - T2m;
}
}
/*
 * Outputs at offsets WS(rs, 2) (twiddles W[6] .. W[9]) and
 * WS(rs, 5) (twiddles W[18] .. W[21]).
 */
{
E T2v, T2P, T2L, T2N, T2O, T2X, T2n, T2r, T2s, T2H, T2A, T2F, T2B, T2J, T2S;
E T2V, T2T, T2Z;
{
E T2t, T2u, T2M, T2q, T2x, T2R;
T2t = Tv - TC;
T2u = T13 - T18;
T2v = T2t + T2u;
T2P = T2t - T2u;
T2M = T2o - T2p;
T2L = W[18];
T2N = T2L * T2M;
T2O = W[19];
T2X = T2O * T2M;
T2q = T2o + T2p;
T2n = W[6];
T2r = T2n * T2q;
T2s = W[7];
T2H = T2s * T2q;
T2A = T2y + T2z;
T2F = T2D - T2E;
T2x = W[8];
T2B = T2x * T2A;
T2J = T2x * T2F;
T2S = T2y - T2z;
T2V = T2D + T2E;
T2R = W[20];
T2T = T2R * T2S;
T2Z = T2R * T2V;
}
{
E T2w, T2I, T2G, T2K, T2C;
T2w = FNMS(T2s, T2v, T2r);
T2I = FMA(T2n, T2v, T2H);
T2C = W[9];
T2G = FMA(T2C, T2F, T2B);
T2K = FNMS(T2C, T2A, T2J);
Rp[WS(rs, 2)] = T2w - T2G;
Ip[WS(rs, 2)] = T2I + T2K;
Rm[WS(rs, 2)] = T2w + T2G;
Im[WS(rs, 2)] = T2K - T2I;
}
{
E T2Q, T2Y, T2W, T30, T2U;
T2Q = FNMS(T2O, T2P, T2N);
T2Y = FMA(T2L, T2P, T2X);
T2U = W[21];
T2W = FMA(T2U, T2V, T2T);
T30 = FNMS(T2U, T2S, T2Z);
Rp[WS(rs, 5)] = T2Q - T2W;
Ip[WS(rs, 5)] = T2Y + T30;
Rm[WS(rs, 5)] = T2Q + T2W;
Im[WS(rs, 5)] = T30 - T2Y;
}
}
}
}
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 12) followed by one TW_NEXT entry.
 * NOTE(review): TW_NEXT presumably marks the end of the list -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to khc2c_register for hc2cbdft_12 (FMA variant).  The
 * first three entries of {96, 22, 46, 0} equal the additions,
 * multiplications and fused multiply/add counts quoted in the generator
 * comment that precedes this variant.
 */
static const hc2c_desc desc = { 12, "hc2cbdft_12", twinstr, &GENUS, { 96, 22, 46, 0 } };
/*
 * Registration entry point (FMA variant): passes the hc2cbdft_12 kernel
 * and its descriptor to khc2c_register on planner p, with the
 * HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft_12) (planner *p) {
X(khc2c_register) (p, hc2cbdft_12, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cbdft_12 -include rdft/scalar/hc2cb.h */
/*
* This function contains 142 FP additions, 60 FP multiplications,
* (or, 112 additions, 30 multiplications, 30 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft_12 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop body reads six values from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 5)), combines them with
 * the 22 twiddle values W[0] .. W[21], and stores results back to those
 * same 24 locations.  Every load in the body precedes the first store.
 *
 * Between iterations Rp and Ip advance by +ms, Rm and Im by -ms, and W by
 * 22.  W is offset by (mb - 1) * 22 before the first iteration.
 */
static void hc2cbdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically 1/2 and sqrt(3)/2. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
/* Temporaries that outlive the two load sections below. */
E Tv, T1E, TC, T1F, TW, T1x, TT, T1w, T1d, T1N, Tb, T1R, TI, T1z, TN;
E T1A, T17, T1I, T12, T1H, T1g, T1S, Tm, T1O;
/* Load section 1: Rp[0,2,4], Rm[1,3,5], Ip[0,2,4], Im[1,3,5]. */
{
E T1, Tq, T6, TA, T4, Tp, Tt, TS, T9, Tw, Tz, TV;
T1 = Rp[0];
Tq = Ip[0];
T6 = Rm[WS(rs, 5)];
TA = Im[WS(rs, 5)];
{
E T2, T3, Tr, Ts;
T2 = Rp[WS(rs, 4)];
T3 = Rm[WS(rs, 3)];
T4 = T2 + T3;
Tp = KP866025403 * (T2 - T3);
Tr = Im[WS(rs, 3)];
Ts = Ip[WS(rs, 4)];
Tt = Tr - Ts;
TS = KP866025403 * (Tr + Ts);
}
{
E T7, T8, Tx, Ty;
T7 = Rm[WS(rs, 1)];
T8 = Rp[WS(rs, 2)];
T9 = T7 + T8;
Tw = KP866025403 * (T7 - T8);
Tx = Im[WS(rs, 1)];
Ty = Ip[WS(rs, 2)];
Tz = Tx - Ty;
TV = KP866025403 * (Tx + Ty);
}
{
E Tu, TB, TU, TR;
Tu = FMA(KP500000000, Tt, Tq);
Tv = Tp + Tu;
T1E = Tu - Tp;
TB = FMS(KP500000000, Tz, TA);
TC = Tw + TB;
T1F = TB - Tw;
TU = FNMS(KP500000000, T9, T6);
TW = TU + TV;
T1x = TU - TV;
TR = FNMS(KP500000000, T4, T1);
TT = TR - TS;
T1w = TR + TS;
{
E T1b, T1c, T5, Ta;
T1b = Tq - Tt;
T1c = Tz + TA;
T1d = T1b - T1c;
T1N = T1b + T1c;
T5 = T1 + T4;
Ta = T6 + T9;
Tb = T5 + Ta;
T1R = T5 - Ta;
}
}
}
/* Load section 2: Rp[1,3,5], Rm[0,2,4], Ip[1,3,5], Im[0,2,4]. */
{
E Tc, T10, Th, T15, Tf, TY, TH, TZ, Tk, T13, TM, T14;
Tc = Rp[WS(rs, 3)];
T10 = Ip[WS(rs, 3)];
Th = Rm[WS(rs, 2)];
T15 = Im[WS(rs, 2)];
{
E Td, Te, TF, TG;
Td = Rm[WS(rs, 4)];
Te = Rm[0];
Tf = Td + Te;
TY = KP866025403 * (Td - Te);
TF = Im[WS(rs, 4)];
TG = Im[0];
TH = KP866025403 * (TF - TG);
TZ = TF + TG;
}
{
E Ti, Tj, TK, TL;
Ti = Rp[WS(rs, 1)];
Tj = Rp[WS(rs, 5)];
Tk = Ti + Tj;
T13 = KP866025403 * (Ti - Tj);
TK = Ip[WS(rs, 5)];
TL = Ip[WS(rs, 1)];
TM = KP866025403 * (TK - TL);
T14 = TK + TL;
}
{
E TE, TJ, T16, T11;
TE = FNMS(KP500000000, Tf, Tc);
TI = TE + TH;
T1z = TE - TH;
TJ = FNMS(KP500000000, Tk, Th);
TN = TJ + TM;
T1A = TJ - TM;
T16 = FMA(KP500000000, T14, T15);
T17 = T13 - T16;
T1I = T13 + T16;
T11 = FMA(KP500000000, TZ, T10);
T12 = TY + T11;
T1H = T11 - TY;
{
E T1e, T1f, Tg, Tl;
T1e = T10 - TZ;
T1f = T14 - T15;
T1g = T1e + T1f;
T1S = T1e - T1f;
Tg = Tc + Tf;
Tl = Th + Tk;
Tm = Tg + Tl;
T1O = Tg - Tl;
}
}
}
/*
 * Outputs at offsets 0 (twiddles W[0], W[1]) and WS(rs, 3)
 * (twiddles W[10] .. W[13]).
 */
{
E Tn, T1h, TP, T1p, T19, T1r, T1n, T1t;
Tn = Tb + Tm;
T1h = T1d + T1g;
{
E TD, TO, TX, T18;
TD = Tv - TC;
TO = TI - TN;
TP = TD + TO;
T1p = TD - TO;
TX = TT - TW;
T18 = T12 - T17;
T19 = TX - T18;
T1r = TX + T18;
{
E T1k, T1m, T1j, T1l;
T1k = Tb - Tm;
T1m = T1d - T1g;
T1j = W[10];
T1l = W[11];
T1n = FNMS(T1l, T1m, T1j * T1k);
T1t = FMA(T1l, T1k, T1j * T1m);
}
}
{
E T1a, T1i, To, TQ;
To = W[0];
TQ = W[1];
T1a = FMA(To, TP, TQ * T19);
T1i = FNMS(TQ, TP, To * T19);
Rp[0] = Tn - T1a;
Ip[0] = T1h + T1i;
Rm[0] = Tn + T1a;
Im[0] = T1i - T1h;
}
{
E T1s, T1u, T1o, T1q;
T1o = W[12];
T1q = W[13];
T1s = FMA(T1o, T1p, T1q * T1r);
T1u = FNMS(T1q, T1p, T1o * T1r);
Rp[WS(rs, 3)] = T1n - T1s;
Ip[WS(rs, 3)] = T1t + T1u;
Rm[WS(rs, 3)] = T1n + T1s;
Im[WS(rs, 3)] = T1u - T1t;
}
}
/*
 * Outputs at offsets WS(rs, 1) (twiddles W[2] .. W[5]) and
 * WS(rs, 4) (twiddles W[14] .. W[17]).
 */
{
E T1C, T1Y, T1K, T20, T1U, T1V, T26, T27;
{
E T1y, T1B, T1G, T1J;
T1y = T1w + T1x;
T1B = T1z + T1A;
T1C = T1y - T1B;
T1Y = T1y + T1B;
T1G = T1E + T1F;
T1J = T1H - T1I;
T1K = T1G - T1J;
T20 = T1G + T1J;
}
{
E T1P, T1T, T1M, T1Q;
T1P = T1N - T1O;
T1T = T1R + T1S;
T1M = W[4];
T1Q = W[5];
T1U = FMA(T1M, T1P, T1Q * T1T);
T1V = FNMS(T1Q, T1P, T1M * T1T);
}
{
E T23, T25, T22, T24;
T23 = T1O + T1N;
T25 = T1R - T1S;
T22 = W[16];
T24 = W[17];
T26 = FMA(T22, T23, T24 * T25);
T27 = FNMS(T24, T23, T22 * T25);
}
{
E T1L, T1W, T1v, T1D;
T1v = W[2];
T1D = W[3];
T1L = FNMS(T1D, T1K, T1v * T1C);
T1W = FMA(T1D, T1C, T1v * T1K);
Rp[WS(rs, 1)] = T1L - T1U;
Ip[WS(rs, 1)] = T1V + T1W;
Rm[WS(rs, 1)] = T1U + T1L;
Im[WS(rs, 1)] = T1V - T1W;
}
{
E T21, T28, T1X, T1Z;
T1X = W[14];
T1Z = W[15];
T21 = FNMS(T1Z, T20, T1X * T1Y);
T28 = FMA(T1Z, T1Y, T1X * T20);
Rp[WS(rs, 4)] = T21 - T26;
Ip[WS(rs, 4)] = T27 + T28;
Rm[WS(rs, 4)] = T26 + T21;
Im[WS(rs, 4)] = T27 - T28;
}
}
/*
 * Outputs at offsets WS(rs, 2) (twiddles W[6] .. W[9]) and
 * WS(rs, 5) (twiddles W[18] .. W[21]).
 */
{
E T2c, T2u, T2p, T2B, T2g, T2w, T2l, T2z;
{
E T2a, T2b, T2n, T2o;
T2a = TT + TW;
T2b = TI + TN;
T2c = T2a + T2b;
T2u = T2a - T2b;
T2n = T1w - T1x;
T2o = T1H + T1I;
T2p = T2n - T2o;
T2B = T2n + T2o;
}
{
E T2e, T2f, T2j, T2k;
T2e = Tv + TC;
T2f = T12 + T17;
T2g = T2e + T2f;
T2w = T2e - T2f;
T2j = T1E - T1F;
T2k = T1z - T1A;
T2l = T2j + T2k;
T2z = T2j - T2k;
}
{
E T2h, T2r, T2q, T2s;
{
E T29, T2d, T2i, T2m;
T29 = W[6];
T2d = W[7];
T2h = FNMS(T2d, T2g, T29 * T2c);
T2r = FMA(T2d, T2c, T29 * T2g);
T2i = W[8];
T2m = W[9];
T2q = FMA(T2i, T2l, T2m * T2p);
T2s = FNMS(T2m, T2l, T2i * T2p);
}
Rp[WS(rs, 2)] = T2h - T2q;
Ip[WS(rs, 2)] = T2r + T2s;
Rm[WS(rs, 2)] = T2h + T2q;
Im[WS(rs, 2)] = T2s - T2r;
}
{
E T2x, T2D, T2C, T2E;
{
E T2t, T2v, T2y, T2A;
T2t = W[18];
T2v = W[19];
T2x = FNMS(T2v, T2w, T2t * T2u);
T2D = FMA(T2v, T2u, T2t * T2w);
T2y = W[20];
T2A = W[21];
T2C = FMA(T2y, T2z, T2A * T2B);
T2E = FNMS(T2A, T2z, T2y * T2B);
}
Rp[WS(rs, 5)] = T2x - T2C;
Ip[WS(rs, 5)] = T2D + T2E;
Rm[WS(rs, 5)] = T2x + T2C;
Im[WS(rs, 5)] = T2E - T2D;
}
}
}
}
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 12) followed by one TW_NEXT entry.
 * NOTE(review): TW_NEXT presumably marks the end of the list -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to khc2c_register for hc2cbdft_12 (non-FMA variant).
 * The first three entries of {112, 30, 30, 0} equal the additions,
 * multiplications and fused multiply/add counts quoted in the generator
 * comment that precedes this variant.
 */
static const hc2c_desc desc = { 12, "hc2cbdft_12", twinstr, &GENUS, { 112, 30, 30, 0 } };
/*
 * Registration entry point (non-FMA variant): passes the hc2cbdft_12
 * kernel and its descriptor to khc2c_register on planner p, with the
 * HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft_12) (planner *p) {
X(khc2c_register) (p, hc2cbdft_12, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,892 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include rdft/scalar/hc2cb.h */
/*
* This function contains 206 FP additions, 100 FP multiplications,
* (or, 136 additions, 30 multiplications, 70 fused multiply/add),
* 66 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft_16 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop body reads eight values from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 7)), combines them with
 * the 30 twiddle values W[0] .. W[29], and stores results back to those
 * same 32 locations.  Every load in the body precedes the first store.
 *
 * Between iterations Rp and Ip advance by +ms, Rm and Im by -ms, and W by
 * 30.  W is offset by (mb - 1) * 30 before the first iteration.
 */
static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically cos(pi/8), tan(pi/8) = sqrt(2) - 1, and sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Temporaries that outlive the two load sections below. */
E Tf, T20, T32, T3Q, T3f, T3V, TN, T2a, T1m, T2f, T2G, T3G, T2T, T3L, T1F;
E T26, T2J, T2M, T2N, T2U, T2V, T3H, Tu, T25, T3i, T3R, T1a, T2g, T1y, T21;
E T39, T3W, T1p, T2b;
/* Load section 1: Rp[0,2,4,6], Rm[1,3,5,7], Ip[0,2,4,6], Im[1,3,5,7]. */
{
E T3, T1e, TA, T1C, T6, Tx, T1h, T1D, Td, T1A, TL, T1k, Ta, T1z, TG;
E T1j;
{
E T1, T2, T1f, T1g;
T1 = Rp[0];
T2 = Rm[WS(rs, 7)];
T3 = T1 + T2;
T1e = T1 - T2;
{
E Ty, Tz, T4, T5;
Ty = Ip[0];
Tz = Im[WS(rs, 7)];
TA = Ty + Tz;
T1C = Ty - Tz;
T4 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 3)];
T6 = T4 + T5;
Tx = T4 - T5;
}
T1f = Ip[WS(rs, 4)];
T1g = Im[WS(rs, 3)];
T1h = T1f + T1g;
T1D = T1f - T1g;
{
E Tb, Tc, TH, TI, TJ, TK;
Tb = Rm[WS(rs, 1)];
Tc = Rp[WS(rs, 6)];
TH = Tb - Tc;
TI = Im[WS(rs, 1)];
TJ = Ip[WS(rs, 6)];
TK = TI + TJ;
Td = Tb + Tc;
T1A = TJ - TI;
TL = TH + TK;
T1k = TH - TK;
}
{
E T8, T9, TC, TD, TE, TF;
T8 = Rp[WS(rs, 2)];
T9 = Rm[WS(rs, 5)];
TC = T8 - T9;
TD = Ip[WS(rs, 2)];
TE = Im[WS(rs, 5)];
TF = TD + TE;
Ta = T8 + T9;
T1z = TD - TE;
TG = TC + TF;
T1j = TC - TF;
}
}
{
E T7, Te, T30, T31;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
T20 = T7 - Te;
T30 = TA - Tx;
T31 = T1j - T1k;
T32 = FMA(KP707106781, T31, T30);
T3Q = FNMS(KP707106781, T31, T30);
}
{
E T3d, T3e, TB, TM;
T3d = T1e + T1h;
T3e = TG + TL;
T3f = FNMS(KP707106781, T3e, T3d);
T3V = FMA(KP707106781, T3e, T3d);
TB = Tx + TA;
TM = TG - TL;
TN = FMA(KP707106781, TM, TB);
T2a = FNMS(KP707106781, TM, TB);
}
{
E T1i, T1l, T2E, T2F;
T1i = T1e - T1h;
T1l = T1j + T1k;
T1m = FMA(KP707106781, T1l, T1i);
T2f = FNMS(KP707106781, T1l, T1i);
T2E = T3 - T6;
T2F = T1A - T1z;
T2G = T2E + T2F;
T3G = T2E - T2F;
}
{
E T2R, T2S, T1B, T1E;
T2R = Ta - Td;
T2S = T1C - T1D;
T2T = T2R + T2S;
T3L = T2S - T2R;
T1B = T1z + T1A;
T1E = T1C + T1D;
T1F = T1B + T1E;
T26 = T1E - T1B;
}
}
/* Load section 2: Rp[1,3,5,7], Rm[0,2,4,6], Ip[1,3,5,7], Im[0,2,4,6]. */
{
E Ti, T1s, Tl, T1t, TS, TX, T34, T33, T2I, T2H, Tp, T1v, Ts, T1w, T13;
E T18, T37, T36, T2L, T2K;
{
E TT, TR, TO, TW;
{
E Tg, Th, TP, TQ;
Tg = Rp[WS(rs, 1)];
Th = Rm[WS(rs, 6)];
Ti = Tg + Th;
TT = Tg - Th;
TP = Ip[WS(rs, 1)];
TQ = Im[WS(rs, 6)];
TR = TP + TQ;
T1s = TP - TQ;
}
{
E Tj, Tk, TU, TV;
Tj = Rp[WS(rs, 5)];
Tk = Rm[WS(rs, 2)];
Tl = Tj + Tk;
TO = Tj - Tk;
TU = Ip[WS(rs, 5)];
TV = Im[WS(rs, 2)];
TW = TU + TV;
T1t = TU - TV;
}
TS = TO + TR;
TX = TT - TW;
T34 = TR - TO;
T33 = TT + TW;
T2I = T1s - T1t;
T2H = Ti - Tl;
}
{
E T14, T12, TZ, T17;
{
E Tn, To, T10, T11;
Tn = Rm[0];
To = Rp[WS(rs, 7)];
Tp = Tn + To;
T14 = Tn - To;
T10 = Im[0];
T11 = Ip[WS(rs, 7)];
T12 = T10 + T11;
T1v = T11 - T10;
}
{
E Tq, Tr, T15, T16;
Tq = Rp[WS(rs, 3)];
Tr = Rm[WS(rs, 4)];
Ts = Tq + Tr;
TZ = Tq - Tr;
T15 = Ip[WS(rs, 3)];
T16 = Im[WS(rs, 4)];
T17 = T15 + T16;
T1w = T15 - T16;
}
T13 = TZ - T12;
T18 = T14 - T17;
T37 = TZ + T12;
T36 = T14 + T17;
T2L = T1v - T1w;
T2K = Tp - Ts;
}
T2J = T2H - T2I;
T2M = T2K + T2L;
T2N = T2J + T2M;
T2U = T2H + T2I;
T2V = T2L - T2K;
T3H = T2V - T2U;
{
E Tm, Tt, T3g, T3h;
Tm = Ti + Tl;
Tt = Tp + Ts;
Tu = Tm + Tt;
T25 = Tm - Tt;
T3g = FNMS(KP414213562, T33, T34);
T3h = FNMS(KP414213562, T36, T37);
T3i = T3g + T3h;
T3R = T3h - T3g;
}
{
E TY, T19, T1u, T1x;
TY = FMA(KP414213562, TX, TS);
T19 = FNMS(KP414213562, T18, T13);
T1a = TY + T19;
T2g = T19 - TY;
T1u = T1s + T1t;
T1x = T1v + T1w;
T1y = T1u + T1x;
T21 = T1x - T1u;
}
{
E T35, T38, T1n, T1o;
T35 = FMA(KP414213562, T34, T33);
T38 = FMA(KP414213562, T37, T36);
T39 = T35 - T38;
T3W = T35 + T38;
T1n = FNMS(KP414213562, TS, TX);
T1o = FMA(KP414213562, T13, T18);
T1p = T1n + T1o;
T2b = T1n - T1o;
}
}
/* Outputs at offset 0; uses twiddles W[0], W[1]. */
{
E Tv, T1G, T1b, T1q, T1c, T1H, Tw, T1r, T1I, T1d;
Tv = Tf + Tu;
T1G = T1y + T1F;
T1b = FMA(KP923879532, T1a, TN);
T1q = FMA(KP923879532, T1p, T1m);
Tw = W[0];
T1c = Tw * T1b;
T1H = Tw * T1q;
T1d = W[1];
T1r = FMA(T1d, T1q, T1c);
T1I = FNMS(T1d, T1b, T1H);
Rp[0] = Tv - T1r;
Ip[0] = T1G + T1I;
Rm[0] = Tv + T1r;
Im[0] = T1I - T1G;
}
/* Outputs at offset WS(rs, 4); uses twiddles W[14] .. W[17]. */
{
E T1N, T1J, T1L, T1M, T1V, T1Q, T1T, T1R, T1X, T1K, T1P;
T1N = T1F - T1y;
T1K = Tf - Tu;
T1J = W[14];
T1L = T1J * T1K;
T1M = W[15];
T1V = T1M * T1K;
T1Q = FNMS(KP923879532, T1a, TN);
T1T = FNMS(KP923879532, T1p, T1m);
T1P = W[16];
T1R = T1P * T1Q;
T1X = T1P * T1T;
{
E T1O, T1W, T1U, T1Y, T1S;
T1O = FNMS(T1M, T1N, T1L);
T1W = FMA(T1J, T1N, T1V);
T1S = W[17];
T1U = FMA(T1S, T1T, T1R);
T1Y = FNMS(T1S, T1Q, T1X);
Rp[WS(rs, 4)] = T1O - T1U;
Ip[WS(rs, 4)] = T1W + T1Y;
Rm[WS(rs, 4)] = T1O + T1U;
Im[WS(rs, 4)] = T1Y - T1W;
}
}
/* Outputs at offset WS(rs, 6); uses twiddles W[22] .. W[25]. */
{
E T2r, T2n, T2p, T2q, T2z, T2u, T2x, T2v, T2B, T2o, T2t;
T2r = T26 - T25;
T2o = T20 - T21;
T2n = W[22];
T2p = T2n * T2o;
T2q = W[23];
T2z = T2q * T2o;
T2u = FNMS(KP923879532, T2b, T2a);
T2x = FNMS(KP923879532, T2g, T2f);
T2t = W[24];
T2v = T2t * T2u;
T2B = T2t * T2x;
{
E T2s, T2A, T2y, T2C, T2w;
T2s = FNMS(T2q, T2r, T2p);
T2A = FMA(T2n, T2r, T2z);
T2w = W[25];
T2y = FMA(T2w, T2x, T2v);
T2C = FNMS(T2w, T2u, T2B);
Rp[WS(rs, 6)] = T2s - T2y;
Ip[WS(rs, 6)] = T2A + T2C;
Rm[WS(rs, 6)] = T2s + T2y;
Im[WS(rs, 6)] = T2C - T2A;
}
}
/* Outputs at offset WS(rs, 2); uses twiddles W[6] .. W[9]. */
{
E T27, T1Z, T23, T24, T2j, T2c, T2h, T2d, T2l, T22, T29;
T27 = T25 + T26;
T22 = T20 + T21;
T1Z = W[6];
T23 = T1Z * T22;
T24 = W[7];
T2j = T24 * T22;
T2c = FMA(KP923879532, T2b, T2a);
T2h = FMA(KP923879532, T2g, T2f);
T29 = W[8];
T2d = T29 * T2c;
T2l = T29 * T2h;
{
E T28, T2k, T2i, T2m, T2e;
T28 = FNMS(T24, T27, T23);
T2k = FMA(T1Z, T27, T2j);
T2e = W[9];
T2i = FMA(T2e, T2h, T2d);
T2m = FNMS(T2e, T2c, T2l);
Rp[WS(rs, 2)] = T28 - T2i;
Ip[WS(rs, 2)] = T2k + T2m;
Rm[WS(rs, 2)] = T28 + T2i;
Im[WS(rs, 2)] = T2m - T2k;
}
}
/*
 * Outputs at offsets WS(rs, 3) (twiddles W[10] .. W[13]) and
 * WS(rs, 7) (twiddles W[26] .. W[29]).
 */
{
E T3N, T47, T43, T45, T46, T4f, T3F, T3J, T3K, T3Z, T3S, T3X, T3T, T41, T4a;
E T4d, T4b, T4h;
{
E T3M, T44, T3I, T3P, T49;
T3M = T2J - T2M;
T3N = FMA(KP707106781, T3M, T3L);
T47 = FNMS(KP707106781, T3M, T3L);
T44 = FNMS(KP707106781, T3H, T3G);
T43 = W[26];
T45 = T43 * T44;
T46 = W[27];
T4f = T46 * T44;
T3I = FMA(KP707106781, T3H, T3G);
T3F = W[10];
T3J = T3F * T3I;
T3K = W[11];
T3Z = T3K * T3I;
T3S = FMA(KP923879532, T3R, T3Q);
T3X = FNMS(KP923879532, T3W, T3V);
T3P = W[12];
T3T = T3P * T3S;
T41 = T3P * T3X;
T4a = FNMS(KP923879532, T3R, T3Q);
T4d = FMA(KP923879532, T3W, T3V);
T49 = W[28];
T4b = T49 * T4a;
T4h = T49 * T4d;
}
{
E T3O, T40, T3Y, T42, T3U;
T3O = FNMS(T3K, T3N, T3J);
T40 = FMA(T3F, T3N, T3Z);
T3U = W[13];
T3Y = FMA(T3U, T3X, T3T);
T42 = FNMS(T3U, T3S, T41);
Rp[WS(rs, 3)] = T3O - T3Y;
Ip[WS(rs, 3)] = T40 + T42;
Rm[WS(rs, 3)] = T3O + T3Y;
Im[WS(rs, 3)] = T42 - T40;
}
{
E T48, T4g, T4e, T4i, T4c;
T48 = FNMS(T46, T47, T45);
T4g = FMA(T43, T47, T4f);
T4c = W[29];
T4e = FMA(T4c, T4d, T4b);
T4i = FNMS(T4c, T4a, T4h);
Rp[WS(rs, 7)] = T48 - T4e;
Ip[WS(rs, 7)] = T4g + T4i;
Rm[WS(rs, 7)] = T48 + T4e;
Im[WS(rs, 7)] = T4i - T4g;
}
}
/*
 * Outputs at offsets WS(rs, 1) (twiddles W[2] .. W[5]) and
 * WS(rs, 5) (twiddles W[18] .. W[21]).
 */
{
E T2X, T3t, T3p, T3r, T3s, T3B, T2D, T2P, T2Q, T3l, T3a, T3j, T3b, T3n, T3w;
E T3z, T3x, T3D;
{
E T2W, T3q, T2O, T2Z, T3v;
T2W = T2U + T2V;
T2X = FMA(KP707106781, T2W, T2T);
T3t = FNMS(KP707106781, T2W, T2T);
T3q = FNMS(KP707106781, T2N, T2G);
T3p = W[18];
T3r = T3p * T3q;
T3s = W[19];
T3B = T3s * T3q;
T2O = FMA(KP707106781, T2N, T2G);
T2D = W[2];
T2P = T2D * T2O;
T2Q = W[3];
T3l = T2Q * T2O;
T3a = FMA(KP923879532, T39, T32);
T3j = FNMS(KP923879532, T3i, T3f);
T2Z = W[4];
T3b = T2Z * T3a;
T3n = T2Z * T3j;
T3w = FNMS(KP923879532, T39, T32);
T3z = FMA(KP923879532, T3i, T3f);
T3v = W[20];
T3x = T3v * T3w;
T3D = T3v * T3z;
}
{
E T2Y, T3m, T3k, T3o, T3c;
T2Y = FNMS(T2Q, T2X, T2P);
T3m = FMA(T2D, T2X, T3l);
T3c = W[5];
T3k = FMA(T3c, T3j, T3b);
T3o = FNMS(T3c, T3a, T3n);
Rp[WS(rs, 1)] = T2Y - T3k;
Ip[WS(rs, 1)] = T3m + T3o;
Rm[WS(rs, 1)] = T2Y + T3k;
Im[WS(rs, 1)] = T3o - T3m;
}
{
E T3u, T3C, T3A, T3E, T3y;
T3u = FNMS(T3s, T3t, T3r);
T3C = FMA(T3p, T3t, T3B);
T3y = W[21];
T3A = FMA(T3y, T3z, T3x);
T3E = FNMS(T3y, T3w, T3D);
Rp[WS(rs, 5)] = T3u - T3A;
Ip[WS(rs, 5)] = T3C + T3E;
Rm[WS(rs, 5)] = T3u + T3A;
Im[WS(rs, 5)] = T3E - T3C;
}
}
}
}
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 16) followed by one TW_NEXT entry.
 * NOTE(review): TW_NEXT presumably marks the end of the list -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to khc2c_register for hc2cbdft_16 (FMA variant).  The
 * first three entries of {136, 30, 70, 0} equal the additions,
 * multiplications and fused multiply/add counts quoted in the generator
 * comment that precedes this variant.
 */
static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, { 136, 30, 70, 0 } };
/*
 * Registration entry point (FMA variant): passes the hc2cbdft_16 kernel
 * and its descriptor to khc2c_register on planner p, with the
 * HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft_16) (planner *p) {
X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include rdft/scalar/hc2cb.h */
/*
* This function contains 206 FP additions, 84 FP multiplications,
* (or, 168 additions, 46 multiplications, 38 fused multiply/add),
* 60 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft_16 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop body reads eight values from each of
 * Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 7)), combines them with
 * the 30 twiddle values W[0] .. W[29], and stores results back to those
 * same 32 locations.  Every load in the body precedes the first store.
 *
 * Between iterations Rp and Ip advance by +ms, Rm and Im by -ms, and W by
 * 30.  W is offset by (mb - 1) * 30 before the first iteration.
 */
static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically cos(pi/8), sin(pi/8), and sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Temporaries that outlive the two load sections below. */
E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
E T1x, T1V;
/* Load section 1: Rp[0,2,4,6], Rm[1,3,5,7], Ip[0,2,4,6], Im[1,3,5,7]. */
{
E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
E T1z;
{
E T1, T2, Ty, Tz;
T1 = Rp[0];
T2 = Rm[WS(rs, 7)];
T3 = T1 + T2;
T1j = T1 - T2;
Ty = Ip[0];
Tz = Im[WS(rs, 7)];
TA = Ty + Tz;
T1B = Ty - Tz;
}
{
E T4, T5, T1k, T1l;
T4 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 3)];
T6 = T4 + T5;
Tx = T4 - T5;
T1k = Ip[WS(rs, 4)];
T1l = Im[WS(rs, 3)];
T1m = T1k + T1l;
T1C = T1k - T1l;
}
{
E T8, T9, TD, TE;
T8 = Rp[WS(rs, 2)];
T9 = Rm[WS(rs, 5)];
Ta = T8 + T9;
TC = T8 - T9;
TD = Ip[WS(rs, 2)];
TE = Im[WS(rs, 5)];
TF = TD + TE;
T1y = TD - TE;
}
{
E Tb, Tc, TI, TJ;
Tb = Rm[WS(rs, 1)];
Tc = Rp[WS(rs, 6)];
Td = Tb + Tc;
TH = Tb - Tc;
TI = Im[WS(rs, 1)];
TJ = Ip[WS(rs, 6)];
TK = TI + TJ;
T1z = TJ - TI;
}
{
E T7, Te, TG, TL;
TB = Tx + TA;
T2L = TA - Tx;
T30 = T1j + T1m;
T1n = T1j - T1m;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
T1U = T7 - Te;
{
E T2F, T2G, T1A, T1D;
T2F = Ta - Td;
T2G = T1B - T1C;
T2H = T2F + T2G;
T3p = T2G - T2F;
T1A = T1y + T1z;
T1D = T1B + T1C;
T1E = T1A + T1D;
T1Z = T1D - T1A;
}
TG = TC + TF;
TL = TH + TK;
TM = KP707106781 * (TG - TL);
T31 = KP707106781 * (TG + TL);
{
E T2q, T2r, T1g, T1h;
T2q = T3 - T6;
T2r = T1z - T1y;
T2s = T2q + T2r;
T3k = T2q - T2r;
T1g = TC - TF;
T1h = TH - TK;
T1i = KP707106781 * (T1g + T1h);
T2M = KP707106781 * (T1g - T1h);
}
}
}
/* Load section 2: Rp[1,3,5,7], Rm[0,2,4,6], Ip[1,3,5,7], Im[0,2,4,6]. */
{
E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
E T1v;
{
E Tg, Th, TP, TQ;
Tg = Rp[WS(rs, 1)];
Th = Rm[WS(rs, 6)];
Ti = Tg + Th;
TT = Tg - Th;
TP = Ip[WS(rs, 1)];
TQ = Im[WS(rs, 6)];
TR = TP + TQ;
T1r = TP - TQ;
}
{
E Tj, Tk, TU, TV;
Tj = Rp[WS(rs, 5)];
Tk = Rm[WS(rs, 2)];
Tl = Tj + Tk;
TO = Tj - Tk;
TU = Ip[WS(rs, 5)];
TV = Im[WS(rs, 2)];
TW = TU + TV;
T1s = TU - TV;
}
{
E Tn, To, T10, T11;
Tn = Rm[0];
To = Rp[WS(rs, 7)];
Tp = Tn + To;
T14 = Tn - To;
T10 = Im[0];
T11 = Ip[WS(rs, 7)];
T12 = T10 + T11;
T1u = T11 - T10;
}
{
E Tq, Tr, T15, T16;
Tq = Rp[WS(rs, 3)];
Tr = Rm[WS(rs, 4)];
Ts = Tq + Tr;
TZ = Tq - Tr;
T15 = Ip[WS(rs, 3)];
T16 = Im[WS(rs, 4)];
T17 = T15 + T16;
T1v = T15 - T16;
}
{
E Tm, Tt, T2O, T2P;
Tm = Ti + Tl;
Tt = Tp + Ts;
Tu = Tm + Tt;
T1Y = Tm - Tt;
T2O = TR - TO;
T2P = TT + TW;
T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
}
{
E T2R, T2S, TS, TX;
T2R = TZ + T12;
T2S = T14 + T17;
T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
TS = TO + TR;
TX = TT - TW;
TY = FMA(KP923879532, TS, KP382683432 * TX);
T1d = FNMS(KP382683432, TS, KP923879532 * TX);
}
{
E T13, T18, T2t, T2u;
T13 = TZ - T12;
T18 = T14 - T17;
T19 = FNMS(KP382683432, T18, KP923879532 * T13);
T1e = FMA(KP382683432, T13, KP923879532 * T18);
T2t = Ti - Tl;
T2u = T1r - T1s;
T2v = T2t - T2u;
T2C = T2t + T2u;
}
{
E T2w, T2x, T1t, T1w;
T2w = Tp - Ts;
T2x = T1u - T1v;
T2y = T2w + T2x;
T2D = T2x - T2w;
T1t = T1r + T1s;
T1w = T1u + T1v;
T1x = T1t + T1w;
T1V = T1w - T1t;
}
}
/*
 * Outputs at offsets 0 (twiddles W[0], W[1]) and WS(rs, 4)
 * (twiddles W[14] .. W[17]).
 */
{
E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
Tv = Tf + Tu;
T1F = T1x + T1E;
{
E TN, T1a, T1f, T1o;
TN = TB + TM;
T1a = TY + T19;
T1b = TN + T1a;
T1N = TN - T1a;
T1f = T1d + T1e;
T1o = T1i + T1n;
T1p = T1f + T1o;
T1P = T1o - T1f;
{
E T1I, T1K, T1H, T1J;
T1I = Tf - Tu;
T1K = T1E - T1x;
T1H = W[14];
T1J = W[15];
T1L = FNMS(T1J, T1K, T1H * T1I);
T1R = FMA(T1J, T1I, T1H * T1K);
}
}
{
E T1q, T1G, Tw, T1c;
Tw = W[0];
T1c = W[1];
T1q = FMA(Tw, T1b, T1c * T1p);
T1G = FNMS(T1c, T1b, Tw * T1p);
Rp[0] = Tv - T1q;
Ip[0] = T1F + T1G;
Rm[0] = Tv + T1q;
Im[0] = T1G - T1F;
}
{
E T1Q, T1S, T1M, T1O;
T1M = W[16];
T1O = W[17];
T1Q = FMA(T1M, T1N, T1O * T1P);
T1S = FNMS(T1O, T1N, T1M * T1P);
Rp[WS(rs, 4)] = T1L - T1Q;
Ip[WS(rs, 4)] = T1R + T1S;
Rm[WS(rs, 4)] = T1L + T1Q;
Im[WS(rs, 4)] = T1S - T1R;
}
}
/*
 * Outputs at offsets WS(rs, 2) (twiddles W[6] .. W[9]) and
 * WS(rs, 6) (twiddles W[22] .. W[25]).
 */
{
E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
{
E T23, T24, T27, T28;
T23 = TB - TM;
T24 = T1d - T1e;
T25 = T23 + T24;
T2j = T23 - T24;
T27 = T19 - TY;
T28 = T1n - T1i;
T29 = T27 + T28;
T2l = T28 - T27;
}
{
E T1W, T20, T1T, T1X;
T1W = T1U + T1V;
T20 = T1Y + T1Z;
T1T = W[6];
T1X = W[7];
T21 = FNMS(T1X, T20, T1T * T1W);
T2b = FMA(T1X, T1W, T1T * T20);
}
{
E T2e, T2g, T2d, T2f;
T2e = T1U - T1V;
T2g = T1Z - T1Y;
T2d = W[22];
T2f = W[23];
T2h = FNMS(T2f, T2g, T2d * T2e);
T2n = FMA(T2f, T2e, T2d * T2g);
}
{
E T2a, T2c, T22, T26;
T22 = W[8];
T26 = W[9];
T2a = FMA(T22, T25, T26 * T29);
T2c = FNMS(T26, T25, T22 * T29);
Rp[WS(rs, 2)] = T21 - T2a;
Ip[WS(rs, 2)] = T2b + T2c;
Rm[WS(rs, 2)] = T21 + T2a;
Im[WS(rs, 2)] = T2c - T2b;
}
{
E T2m, T2o, T2i, T2k;
T2i = W[24];
T2k = W[25];
T2m = FMA(T2i, T2j, T2k * T2l);
T2o = FNMS(T2k, T2j, T2i * T2l);
Rp[WS(rs, 6)] = T2h - T2m;
Ip[WS(rs, 6)] = T2n + T2o;
Rm[WS(rs, 6)] = T2h + T2m;
Im[WS(rs, 6)] = T2o - T2n;
}
}
/*
 * Outputs at offsets WS(rs, 1) (twiddles W[2] .. W[5]) and
 * WS(rs, 5) (twiddles W[18] .. W[21]).
 */
{
E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
T2z = KP707106781 * (T2v + T2y);
T2A = T2s + T2z;
T38 = T2s - T2z;
T2E = KP707106781 * (T2C + T2D);
T2I = T2E + T2H;
T3a = T2H - T2E;
{
E T2N, T2U, T2Z, T32;
T2N = T2L + T2M;
T2U = T2Q - T2T;
T2V = T2N + T2U;
T3d = T2N - T2U;
T2Z = T2X + T2Y;
T32 = T30 - T31;
T33 = T2Z + T32;
T3f = T32 - T2Z;
}
{
E T2J, T35, T34, T36;
{
E T2p, T2B, T2K, T2W;
T2p = W[2];
T2B = W[3];
T2J = FNMS(T2B, T2I, T2p * T2A);
T35 = FMA(T2B, T2A, T2p * T2I);
T2K = W[4];
T2W = W[5];
T34 = FMA(T2K, T2V, T2W * T33);
T36 = FNMS(T2W, T2V, T2K * T33);
}
Rp[WS(rs, 1)] = T2J - T34;
Ip[WS(rs, 1)] = T35 + T36;
Rm[WS(rs, 1)] = T2J + T34;
Im[WS(rs, 1)] = T36 - T35;
}
{
E T3b, T3h, T3g, T3i;
{
E T37, T39, T3c, T3e;
T37 = W[18];
T39 = W[19];
T3b = FNMS(T39, T3a, T37 * T38);
T3h = FMA(T39, T38, T37 * T3a);
T3c = W[20];
T3e = W[21];
T3g = FMA(T3c, T3d, T3e * T3f);
T3i = FNMS(T3e, T3d, T3c * T3f);
}
Rp[WS(rs, 5)] = T3b - T3g;
Ip[WS(rs, 5)] = T3h + T3i;
Rm[WS(rs, 5)] = T3b + T3g;
Im[WS(rs, 5)] = T3i - T3h;
}
}
/*
 * Outputs at offsets WS(rs, 3) (twiddles W[10] .. W[13]) and
 * WS(rs, 7) (twiddles W[26] .. W[29]).
 */
{
E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
T3l = KP707106781 * (T2D - T2C);
T3m = T3k + T3l;
T3E = T3k - T3l;
T3o = KP707106781 * (T2v - T2y);
T3q = T3o + T3p;
T3G = T3p - T3o;
{
E T3t, T3u, T3x, T3y;
T3t = T2L - T2M;
T3u = T2X - T2Y;
T3v = T3t + T3u;
T3J = T3t - T3u;
T3x = T31 + T30;
T3y = T2Q + T2T;
T3z = T3x - T3y;
T3L = T3y + T3x;
}
{
E T3r, T3B, T3A, T3C;
{
E T3j, T3n, T3s, T3w;
T3j = W[10];
T3n = W[11];
T3r = FNMS(T3n, T3q, T3j * T3m);
T3B = FMA(T3n, T3m, T3j * T3q);
T3s = W[12];
T3w = W[13];
T3A = FMA(T3s, T3v, T3w * T3z);
T3C = FNMS(T3w, T3v, T3s * T3z);
}
Rp[WS(rs, 3)] = T3r - T3A;
Ip[WS(rs, 3)] = T3B + T3C;
Rm[WS(rs, 3)] = T3r + T3A;
Im[WS(rs, 3)] = T3C - T3B;
}
{
E T3H, T3N, T3M, T3O;
{
E T3D, T3F, T3I, T3K;
T3D = W[26];
T3F = W[27];
T3H = FNMS(T3F, T3G, T3D * T3E);
T3N = FMA(T3F, T3E, T3D * T3G);
T3I = W[28];
T3K = W[29];
T3M = FMA(T3I, T3J, T3K * T3L);
T3O = FNMS(T3K, T3J, T3I * T3L);
}
Rp[WS(rs, 7)] = T3H - T3M;
Ip[WS(rs, 7)] = T3N + T3O;
Rm[WS(rs, 7)] = T3H + T3M;
Im[WS(rs, 7)] = T3O - T3N;
}
}
}
}
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 16) followed by one TW_NEXT entry.
 * NOTE(review): TW_NEXT presumably marks the end of the list -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to khc2c_register for hc2cbdft_16 (non-FMA variant).
 * The first three entries of {168, 46, 38, 0} equal the additions,
 * multiplications and fused multiply/add counts quoted in the generator
 * comment that precedes this variant.
 */
static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, { 168, 46, 38, 0 } };
/*
 * Registration entry point (non-FMA variant): passes the hc2cbdft_16
 * kernel and its descriptor to khc2c_register on planner p, with the
 * HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdft_16) (planner *p) {
X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,131 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:47:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cbdft_2 -include rdft/scalar/hc2cb.h */
/*
* This function contains 10 FP additions, 4 FP multiplications,
* (or, 8 additions, 2 multiplications, 2 fused multiply/add),
* 15 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft_2 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel loads one element from each of
 * Rp, Ip, Rm and Im, forms their pairwise sums and differences, combines
 * them with the twiddle pair W[0], W[1] through the project's FMA/FNMS
 * macros, and stores four results back to the same four locations.
 *
 * Per iteration Rp and Ip move forward by ms, Rm and Im move backward by
 * ms, and W moves forward by 2.  Before the first iteration W is offset
 * by (mb - 1) * 2.  `rs` is used only as an argument to
 * MAKE_VOLATILE_STRIDE.
 *
 * All loads happen before any store, and the stores are issued in the
 * order Ip, Im, Rp, Rm.
 */
static void hc2cbdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the twiddle pair used by the first iteration. */
     W += (mb - 1) * 2;

     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
          E ip0, im0, rp0, rm0;
          E i_diff, i_sum, r_diff, r_sum;
          E w0, w1, w0_rdiff, w1_rdiff;
          E u, v;

          /* Inputs. */
          ip0 = Ip[0];
          im0 = Im[0];
          rp0 = Rp[0];
          rm0 = Rm[0];
          w1 = W[1];
          w0 = W[0];

          /* Sums and differences of the paired inputs. */
          i_diff = ip0 - im0;
          i_sum = ip0 + im0;
          r_diff = rp0 - rm0;
          r_sum = rp0 + rm0;

          /* Plain products, kept as separate temporaries so they feed the
             FMA/FNMS macros exactly as in the generated code. */
          w1_rdiff = w1 * r_diff;
          w0_rdiff = w0 * r_diff;

          u = FNMS(w1, i_sum, w0_rdiff);
          v = FMA(w0, i_sum, w1_rdiff);

          /* Outputs. */
          Ip[0] = i_diff + u;
          Im[0] = u - i_diff;
          Rp[0] = r_sum - v;
          Rm[0] = r_sum + v;
     }
}
/*
 * Twiddle instruction table for hc2cbdft_2 (FMA build); referenced by
 * `desc` below.  It holds one TW_FULL entry followed by one TW_NEXT entry.
 * NOTE(review): the meaning of the numeric fields is fixed by the tw_instr
 * declaration in the project headers, which is not visible here -- confirm
 * there before changing any value.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor passed (by address) to khc2c_register in the
 * registration function that follows.  It bundles the leading value 2,
 * the name string "hc2cbdft_2", the twiddle table above, a pointer to
 * GENUS, and the four-value group { 8, 2, 2, 0 }.
 * The first three of those values coincide with the "8 additions,
 * 2 multiplications, 2 fused multiply/add" figures in the generator
 * comment ahead of the kernel.
 * NOTE(review): confirm the field order against the hc2c_desc declaration.
 */
static const hc2c_desc desc = { 2, "hc2cbdft_2", twinstr, &GENUS, { 8, 2, 2, 0 } };
/*
 * Registration entry point for the hc2cbdft_2 codelet (FMA build).
 * Forwards planner `p`, the kernel function hc2cbdft_2, the address of
 * its descriptor `desc`, and the constant HC2C_VIA_DFT to khc2c_register.
 * X() is a project macro applied to both external names (its definition
 * is not visible in this file).
 */
void X(codelet_hc2cbdft_2) (planner *p) {
X(khc2c_register) (p, hc2cbdft_2, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 2 -dif -name hc2cbdft_2 -include rdft/scalar/hc2cb.h */
/*
* This function contains 10 FP additions, 4 FP multiplications,
* (or, 8 additions, 2 multiplications, 2 fused multiply/add),
* 9 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hc2cb.h"
/*
 * hc2cbdft_2 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel loads one element from each of
 * Rp, Ip, Rm and Im, forms their pairwise sums and differences, combines
 * them with the twiddle pair W[0], W[1] through the project's FMA/FNMS
 * macros, and stores four results back to the same four locations.
 *
 * Per iteration Rp and Ip move forward by ms, Rm and Im move backward by
 * ms, and W moves forward by 2.  Before the first iteration W is offset
 * by (mb - 1) * 2.  `rs` is used only as an argument to
 * MAKE_VOLATILE_STRIDE.
 *
 * All loads happen before any store, and the stores are issued in the
 * order Ip, Rp, Im, Rm.
 */
static void hc2cbdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the twiddle pair used by the first iteration. */
     W += (mb - 1) * 2;

     for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
          E ip0, im0, rp0, rm0;
          E i_diff, i_sum, r_diff, r_sum;
          E w0, w1;
          E u, v;

          /* Inputs. */
          ip0 = Ip[0];
          im0 = Im[0];
          rp0 = Rp[0];
          rm0 = Rm[0];
          w0 = W[0];
          w1 = W[1];

          /* Sums and differences of the paired inputs. */
          i_diff = ip0 - im0;
          i_sum = ip0 + im0;
          r_diff = rp0 - rm0;
          r_sum = rp0 + rm0;

          /* The products are passed inline as the third macro argument,
             exactly as in the generated code. */
          u = FNMS(w1, i_sum, w0 * r_diff);
          v = FMA(w1, r_diff, w0 * i_sum);

          /* Outputs. */
          Ip[0] = i_diff + u;
          Rp[0] = r_sum - v;
          Im[0] = u - i_diff;
          Rm[0] = r_sum + v;
     }
}
/*
 * Twiddle instruction table for hc2cbdft_2 (non-FMA build); referenced by
 * `desc` below.  It holds one TW_FULL entry followed by one TW_NEXT entry.
 * NOTE(review): the meaning of the numeric fields is fixed by the tw_instr
 * declaration in the project headers, which is not visible here -- confirm
 * there before changing any value.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor passed (by address) to khc2c_register in the
 * registration function that follows.  It bundles the leading value 2,
 * the name string "hc2cbdft_2", the twiddle table above, a pointer to
 * GENUS, and the four-value group { 8, 2, 2, 0 }.
 * The first three of those values coincide with the "8 additions,
 * 2 multiplications, 2 fused multiply/add" figures in the generator
 * comment ahead of the kernel.
 * NOTE(review): confirm the field order against the hc2c_desc declaration.
 */
static const hc2c_desc desc = { 2, "hc2cbdft_2", twinstr, &GENUS, { 8, 2, 2, 0 } };
/*
 * Registration entry point for the hc2cbdft_2 codelet (non-FMA build).
 * Forwards planner `p`, the kernel function hc2cbdft_2, the address of
 * its descriptor `desc`, and the constant HC2C_VIA_DFT to khc2c_register.
 * X() is a project macro applied to both external names (its definition
 * is not visible in this file).
 */
void X(codelet_hc2cbdft_2) (planner *p) {
X(khc2c_register) (p, hc2cbdft_2, &desc, HC2C_VIA_DFT);
}
#endif

Some files were not shown because too many files have changed in this diff Show More