Updates
This commit is contained in:
24
fftw-3.3.10/rdft/simd/common/Makefile.am
Normal file
24
fftw-3.3.10/rdft/simd/common/Makefile.am
Normal file
@@ -0,0 +1,24 @@
|
||||
# include the list of codelets
|
||||
|
||||
include $(top_srcdir)/rdft/simd/codlist.mk
|
||||
|
||||
# Full codelet list for this directory; the codlist rule iterates over it
# to emit one prototype and one SOLVTAB entry per file.
ALL_CODELETS = $(SIMD_CODELETS)
|
||||
# Files that must exist before the ordinary targets run: the generated
# `all', `check', `install' and `distdir' rules all depend on them.
BUILT_SOURCES= $(SIMD_CODELETS) $(CODLIST)
|
||||
# Ship the generated sources and genus.c in the distribution
# (EXTRA_DIST is part of DISTFILES).
EXTRA_DIST = $(BUILT_SOURCES) genus.c
|
||||
# Line echoed into the generated codelet list right after the
# kernel/ifftw.h include; \# keeps make from treating it as a comment.
INCLUDE_SIMD_HEADER="\#include SIMD_HEADER"
|
||||
# Macro name wrapped around every codelet symbol in the generated list.
XRENAME=XSIMD
|
||||
# Name of the solver table that the generated list declares and defines.
SOLVTAB_NAME = XSIMD(solvtab_rdft)
|
||||
|
||||
# include special rules for regenerating codelets.
|
||||
include $(top_srcdir)/support/Makefile.codelets
|
||||
|
||||
# Codelet regeneration rules. They are only emitted when the automake
# conditional MAINTAINER_MODE is true, because they need the genfft
# generator programs.
if MAINTAINER_MODE
|
||||
# Generator options shared by both hc2c SIMD rules below.
# NOTE(review): $(FLAGS_COMMON) already carries "-variables 4" and this line
# adds "-variables 32"; presumably the later option wins -- confirm against
# the generator's option parsing.
FLAGS_HC2C=-simd $(FLAGS_COMMON) -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw
|
||||
|
||||
# hc2cfdftv_<n>.c: print the prelude, run the generator through $(TWOVERS)
# with -n <n> (the pattern stem) and -dit, then filter the combined output
# through $(ADD_DATE) and $(INDENT) into the target file.
hc2cfdftv_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT_C)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT_C) $(FLAGS_HC2C) -n $* -dit -name hc2cfdftv_$* -include "rdft/simd/hc2cfv.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# hc2cbdftv_<n>.c: same pipeline as above, but the generator is run with
# -dif -sign 1 and the output includes rdft/simd/hc2cbv.h.
hc2cbdftv_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT_C)
|
||||
($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT_C) $(FLAGS_HC2C) -n $* -dif -sign 1 -name hc2cbdftv_$* -include "rdft/simd/hc2cbv.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
endif # MAINTAINER_MODE
|
||||
601
fftw-3.3.10/rdft/simd/common/Makefile.in
Normal file
601
fftw-3.3.10/rdft/simd/common/Makefile.in
Normal file
@@ -0,0 +1,601 @@
|
||||
# Makefile.in generated by automake 1.16.3 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
|
||||
# include the list of codelets
|
||||
|
||||
# This file contains a standard list of RDFT SIMD codelets. It is
|
||||
# included by common/Makefile to generate the C files with the actual
|
||||
# codelets in them. It is included by {sse,sse2,...}/Makefile to
|
||||
# generate and compile stub files that include common/*.c
|
||||
|
||||
# You can customize FFTW for special needs, e.g. to handle certain
|
||||
# sizes more efficiently, by adding new codelets to the lists of those
|
||||
# included by default. If you change the list of codelets, any new
|
||||
# ones you added will be automatically generated when you run the
|
||||
# bootstrap script (see "Generating your own code" in the FFTW
|
||||
# manual).
|
||||
|
||||
# -*- makefile -*-
|
||||
# This file contains special make rules to generate codelets.
|
||||
# Most of this file requires GNU make.
|
||||
VPATH = @srcdir@
|
||||
# Shell fragment used as a condition by am__make_running_with_option.
# It fails when MAKELEVEL is empty; otherwise it succeeds if MAKE_HOST is
# non-empty, or if both MAKE_VERSION and CURDIR are non-empty, and fails in
# every remaining case.
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
subdir = rdft/simd/common
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
|
||||
$(top_srcdir)/m4/acx_pthread.m4 \
|
||||
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
|
||||
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
|
||||
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_version.m4 \
|
||||
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
|
||||
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
|
||||
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
SOURCES =
|
||||
DIST_SOURCES =
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in \
|
||||
$(top_srcdir)/rdft/simd/codlist.mk \
|
||||
$(top_srcdir)/support/Makefile.codelets
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
ACLOCAL = @ACLOCAL@
|
||||
ALLOCA = @ALLOCA@
|
||||
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AS = @AS@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AVX2_CFLAGS = @AVX2_CFLAGS@
|
||||
AVX512_CFLAGS = @AVX512_CFLAGS@
|
||||
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
|
||||
AVX_CFLAGS = @AVX_CFLAGS@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CHECK_PL_OPTS = @CHECK_PL_OPTS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
|
||||
C_MPI_FINT = @C_MPI_FINT@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
F77 = @F77@
|
||||
FFLAGS = @FFLAGS@
|
||||
FGREP = @FGREP@
|
||||
FLIBS = @FLIBS@
|
||||
GREP = @GREP@
|
||||
INDENT = @INDENT@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
KCVI_CFLAGS = @KCVI_CFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBQUADMATH = @LIBQUADMATH@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
|
||||
MAINT = @MAINT@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
MPICC = @MPICC@
|
||||
MPILIBS = @MPILIBS@
|
||||
MPIRUN = @MPIRUN@
|
||||
NEON_CFLAGS = @NEON_CFLAGS@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OCAMLBUILD = @OCAMLBUILD@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
POW_LIB = @POW_LIB@
|
||||
PRECISION = @PRECISION@
|
||||
PREC_SUFFIX = @PREC_SUFFIX@
|
||||
PTHREAD_CC = @PTHREAD_CC@
|
||||
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
|
||||
PTHREAD_LIBS = @PTHREAD_LIBS@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
|
||||
SHELL = @SHELL@
|
||||
SSE2_CFLAGS = @SSE2_CFLAGS@
|
||||
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
|
||||
STRIP = @STRIP@
|
||||
THREADLIBS = @THREADLIBS@
|
||||
VERSION = @VERSION@
|
||||
VSX_CFLAGS = @VSX_CFLAGS@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
ac_ct_F77 = @ac_ct_F77@
|
||||
acx_pthread_config = @acx_pthread_config@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
runstatedir = @runstatedir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
HC2CFDFTV = hc2cfdftv_2.c hc2cfdftv_4.c hc2cfdftv_6.c hc2cfdftv_8.c \
|
||||
hc2cfdftv_10.c hc2cfdftv_12.c hc2cfdftv_16.c hc2cfdftv_32.c \
|
||||
hc2cfdftv_20.c
|
||||
|
||||
HC2CBDFTV = hc2cbdftv_2.c hc2cbdftv_4.c hc2cbdftv_6.c hc2cbdftv_8.c \
|
||||
hc2cbdftv_10.c hc2cbdftv_12.c hc2cbdftv_16.c hc2cbdftv_32.c \
|
||||
hc2cbdftv_20.c
|
||||
|
||||
|
||||
###########################################################################
|
||||
SIMD_CODELETS = $(HC2CFDFTV) $(HC2CBDFTV)
|
||||
ALL_CODELETS = $(SIMD_CODELETS)
|
||||
BUILT_SOURCES = $(SIMD_CODELETS) $(CODLIST)
|
||||
EXTRA_DIST = $(BUILT_SOURCES) genus.c
|
||||
INCLUDE_SIMD_HEADER = "\#include SIMD_HEADER"
|
||||
XRENAME = XSIMD
|
||||
SOLVTAB_NAME = XSIMD(solvtab_rdft)
|
||||
CODLIST = codlist.c
|
||||
CODELET_NAME = codelet_
|
||||
|
||||
#INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
|
||||
@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
|
||||
@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
|
||||
@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
|
||||
@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
|
||||
@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
|
||||
@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE)
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
|
||||
@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
|
||||
@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
|
||||
|
||||
# include special rules for regenerating codelets.
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_HC2C = -simd $(FLAGS_COMMON) -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw
|
||||
# Make sure every file in BUILT_SOURCES is up to date first, then re-invoke
# make on all-am to do the actual work.
all: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) all-am
|
||||
|
||||
.SUFFIXES:
|
||||
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu rdft/simd/common/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu rdft/simd/common/Makefile
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
|
||||
esac;
|
||||
$(top_srcdir)/rdft/simd/codlist.mk $(top_srcdir)/support/Makefile.codelets $(am__empty):
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
tags TAGS:
|
||||
|
||||
ctags CTAGS:
|
||||
|
||||
cscope cscopelist:
|
||||
|
||||
|
||||
distdir: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) distdir-am
|
||||
|
||||
distdir-am: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) check-am
|
||||
all-am: Makefile
|
||||
installdirs:
|
||||
install: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) install-am
|
||||
install-exec: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) install-exec-am
|
||||
install-data: install-data-am
|
||||
uninstall: uninstall-am
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-am
|
||||
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
|
||||
clean: clean-am
|
||||
|
||||
clean-am: clean-generic clean-libtool mostlyclean-am
|
||||
|
||||
distclean: distclean-am
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-generic
|
||||
|
||||
dvi: dvi-am
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-am
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-am
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am:
|
||||
|
||||
install-dvi: install-dvi-am
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am:
|
||||
|
||||
install-html: install-html-am
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-am
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-am
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-am
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-am
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic \
|
||||
maintainer-clean-local
|
||||
|
||||
mostlyclean: mostlyclean-am
|
||||
|
||||
mostlyclean-am: mostlyclean-generic mostlyclean-libtool
|
||||
|
||||
pdf: pdf-am
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-am
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am:
|
||||
|
||||
.MAKE: all check install install-am install-exec install-strip
|
||||
|
||||
.PHONY: all all-am check check-am clean clean-generic clean-libtool \
|
||||
cscopelist-am ctags-am distclean distclean-generic \
|
||||
distclean-libtool distdir dvi dvi-am html html-am info info-am \
|
||||
install install-am install-data install-data-am install-dvi \
|
||||
install-dvi-am install-exec install-exec-am install-html \
|
||||
install-html-am install-info install-info-am install-man \
|
||||
install-pdf install-pdf-am install-ps install-ps-am \
|
||||
install-strip installcheck installcheck-am installdirs \
|
||||
maintainer-clean maintainer-clean-generic \
|
||||
maintainer-clean-local mostlyclean mostlyclean-generic \
|
||||
mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \
|
||||
uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# only delete codlist.c in maintainer-mode, since it is included in the dist
|
||||
# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
|
||||
maintainer-clean-local:
|
||||
rm -f $(CODLIST)
|
||||
|
||||
# rule to build codlist
|
||||
# Write $(CODLIST) as a C source file, in this order:
#   1. the kernel/ifftw.h include and the $(INCLUDE_SIMD_HEADER) line;
#   2. one "extern void $(XRENAME)($(CODELET_NAME)<name>)(planner *);"
#      prototype per entry of $(ALL_CODELETS);
#   3. the declaration and definition of $(SOLVTAB_NAME), holding one
#      SOLVTAB(...) entry per codelet and terminated by SOLVTAB_END.
# <name> is the basename of the entry with a trailing .c or .S removed.
# Each loop list ends with the literal NIL, which the inner test skips;
# presumably this keeps the for-list non-empty when $(ALL_CODELETS) is
# empty -- confirm.
# The target depends on Makefile, so it is rewritten when the Makefile changes.
@MAINTAINER_MODE_TRUE@$(CODLIST): Makefile
|
||||
@MAINTAINER_MODE_TRUE@ ( \
|
||||
@MAINTAINER_MODE_TRUE@ echo "#include \"kernel/ifftw.h\""; \
|
||||
@MAINTAINER_MODE_TRUE@ echo $(INCLUDE_SIMD_HEADER); \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
|
||||
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
|
||||
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);"; \
|
||||
@MAINTAINER_MODE_TRUE@ fi \
|
||||
@MAINTAINER_MODE_TRUE@ done; \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "extern const solvtab $(SOLVTAB_NAME);"; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "const solvtab $(SOLVTAB_NAME) = {"; \
|
||||
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
|
||||
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
|
||||
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
|
||||
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),"; \
|
||||
@MAINTAINER_MODE_TRUE@ fi \
|
||||
@MAINTAINER_MODE_TRUE@ done; \
|
||||
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB_END"; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "};"; \
|
||||
@MAINTAINER_MODE_TRUE@ ) >$@
|
||||
|
||||
# cancel the hideous builtin rules that cause an infinite loop
|
||||
@MAINTAINER_MODE_TRUE@%: %.o
|
||||
@MAINTAINER_MODE_TRUE@%: %.s
|
||||
@MAINTAINER_MODE_TRUE@%: %.c
|
||||
@MAINTAINER_MODE_TRUE@%: %.S
|
||||
|
||||
@MAINTAINER_MODE_TRUE@hc2cfdftv_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT_C)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT_C) $(FLAGS_HC2C) -n $* -dit -name hc2cfdftv_$* -include "rdft/simd/hc2cfv.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@hc2cbdftv_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT_C)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT_C) $(FLAGS_HC2C) -n $* -dif -sign 1 -name hc2cbdftv_$* -include "rdft/simd/hc2cbv.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
45
fftw-3.3.10/rdft/simd/common/codlist.c
Normal file
45
fftw-3.3.10/rdft/simd/common/codlist.c
Normal file
@@ -0,0 +1,45 @@
|
||||
#include "kernel/ifftw.h"
|
||||
#include SIMD_HEADER
|
||||
|
||||
extern void XSIMD(codelet_hc2cfdftv_2)(planner *);
|
||||
extern void XSIMD(codelet_hc2cfdftv_4)(planner *);
|
||||
extern void XSIMD(codelet_hc2cfdftv_6)(planner *);
|
||||
extern void XSIMD(codelet_hc2cfdftv_8)(planner *);
|
||||
extern void XSIMD(codelet_hc2cfdftv_10)(planner *);
|
||||
extern void XSIMD(codelet_hc2cfdftv_12)(planner *);
|
||||
extern void XSIMD(codelet_hc2cfdftv_16)(planner *);
|
||||
extern void XSIMD(codelet_hc2cfdftv_32)(planner *);
|
||||
extern void XSIMD(codelet_hc2cfdftv_20)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_2)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_4)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_6)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_8)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_10)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_12)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_16)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_32)(planner *);
|
||||
extern void XSIMD(codelet_hc2cbdftv_20)(planner *);
|
||||
|
||||
|
||||
extern const solvtab XSIMD(solvtab_rdft);
|
||||
const solvtab XSIMD(solvtab_rdft) = {
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_2)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_4)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_6)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_8)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_10)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_12)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_16)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_32)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cfdftv_20)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_2)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_4)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_6)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_8)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_10)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_12)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_16)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_32)),
|
||||
SOLVTAB(XSIMD(codelet_hc2cbdftv_20)),
|
||||
SOLVTAB_END
|
||||
};
|
||||
60
fftw-3.3.10/rdft/simd/common/genus.c
Normal file
60
fftw-3.3.10/rdft/simd/common/genus.c
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
#include SIMD_HEADER
|
||||
|
||||
/* Declare x as `extern const t' and then start its definition (`const t x');
   the user of the macro appends the `= { ... };' initializer. */
#define EXTERN_CONST(t, x) extern const t x; const t x
|
||||
|
||||
static int hc2cbv_okp(const R *Rp, const R *Ip, const R *Rm, const R *Im,
|
||||
INT rs, INT mb, INT me, INT ms,
|
||||
const planner *plnr)
|
||||
{
|
||||
return (1
|
||||
&& !NO_SIMDP(plnr)
|
||||
&& SIMD_STRIDE_OK(rs)
|
||||
&& SIMD_VSTRIDE_OK(ms)
|
||||
&& ((me - mb) % VL) == 0
|
||||
&& ((mb - 1) % VL) == 0 /* twiddle factors alignment */
|
||||
&& ALIGNED(Rp)
|
||||
&& ALIGNED(Rm)
|
||||
&& Ip == Rp + 1
|
||||
&& Im == Rm + 1);
|
||||
}
|
||||
|
||||
/* Exported genus object for the hc2cbv codelets, initialised with the
   hc2cbv_okp acceptance test, HC2R and VL.
   NOTE(review): the meaning of each field comes from hc2c_genus, which is
   declared outside this file -- confirm against its declaration. */
EXTERN_CONST(hc2c_genus, XSIMD(rdft_hc2cbv_genus)) = { hc2cbv_okp, HC2R, VL };
|
||||
|
||||
static int hc2cfv_okp(const R *Rp, const R *Ip, const R *Rm, const R *Im,
|
||||
INT rs, INT mb, INT me, INT ms,
|
||||
const planner *plnr)
|
||||
{
|
||||
return (1
|
||||
&& !NO_SIMDP(plnr)
|
||||
&& SIMD_STRIDE_OK(rs)
|
||||
&& SIMD_VSTRIDE_OK(ms)
|
||||
&& ((me - mb) % VL) == 0
|
||||
&& ((mb - 1) % VL) == 0 /* twiddle factors alignment */
|
||||
&& ALIGNED(Rp)
|
||||
&& ALIGNED(Rm)
|
||||
&& Ip == Rp + 1
|
||||
&& Im == Rm + 1);
|
||||
}
|
||||
|
||||
/* Exported genus object for the hc2cfv codelets, initialised with the
   hc2cfv_okp acceptance test, R2HC and VL.
   NOTE(review): the meaning of each field comes from hc2c_genus, which is
   declared outside this file -- confirm against its declaration. */
EXTERN_CONST(hc2c_genus, XSIMD(rdft_hc2cfv_genus)) = { hc2cfv_okp, R2HC, VL };
|
||||
295
fftw-3.3.10/rdft/simd/common/hc2cbdftv_10.c
Normal file
295
fftw-3.3.10/rdft/simd/common/hc2cbdftv_10.c
Normal file
@@ -0,0 +1,295 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dif -sign 1 -name hc2cbdftv_10 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 61 FP additions, 50 FP multiplications,
|
||||
* (or, 33 additions, 22 multiplications, 28 fused multiply/add),
|
||||
* 76 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_10 -- FMA variant (this branch is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Rp, Rm: read and overwritten in place. Each iteration loads elements
 *         0..4 (rs-strided via WS) of both arrays and stores back to the
 *         same ten locations.
 * Ip, Im: advanced by the loop header together with Rp/Rm but never
 *         dereferenced in this body.
 * W:      twiddle array; offset by (mb - 1) * ((TWVL / VL) * 18) on entry
 *         and advanced by TWVL * 18 per iteration.
 * rs:     stride handed to WS() when indexing Rp/Rm.
 * mb, me: the loop runs m from mb while m < me, in steps of VL.
 * ms:     Rp/Ip move forward and Rm/Im move backward by VL * ms per
 *         iteration; also passed to LD/ST.
 *
 * NOTE(review): LD/ST/LDW and the V* arithmetic macros are defined in the
 * SIMD headers, not in this file; their exact semantics are not shown here.
 */
static void hc2cbdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
V T4, Ts, Tl, TB, Tj, Tk, Tz, TA, TF, TV, Tp, TL, Te, Tw, Th;
|
||||
V Tx, Ti, Ty, T7, Tt, Ta, Tu, Tb, Tv, T2, T3, Tc, Td, Tf, Tg;
|
||||
V T5, T6, T8, T9, TD, TE, Tn, To;
|
||||
/* Load the ten inputs; each Rp/Rm pair is combined through the
   VFMACONJ / VFNMSCONJ / VFMSCONJ macros. */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
T4 = VFNMSCONJ(T3, T2);
|
||||
Ts = VFMACONJ(T3, T2);
|
||||
Tc = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
Td = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Te = VFNMSCONJ(Td, Tc);
|
||||
Tw = VFMACONJ(Td, Tc);
|
||||
Tf = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Th = VFMSCONJ(Tg, Tf);
|
||||
Tx = VFMACONJ(Tg, Tf);
|
||||
Ti = VADD(Te, Th);
|
||||
Ty = VADD(Tw, Tx);
|
||||
T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T7 = VFNMSCONJ(T6, T5);
|
||||
Tt = VFMACONJ(T6, T5);
|
||||
T8 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T9 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ta = VFMSCONJ(T9, T8);
|
||||
Tu = VFMACONJ(T9, T8);
|
||||
Tb = VADD(T7, Ta);
|
||||
Tv = VADD(Tt, Tu);
|
||||
Tl = VSUB(Tb, Ti);
|
||||
TB = VSUB(Tv, Ty);
|
||||
Tj = VADD(Tb, Ti);
|
||||
Tk = VFNMS(LDK(KP250000000), Tj, T4);
|
||||
Tz = VADD(Tv, Ty);
|
||||
TA = VFNMS(LDK(KP250000000), Tz, Ts);
|
||||
TD = VSUB(Tw, Tx);
|
||||
TE = VSUB(Tt, Tu);
|
||||
TF = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TE, TD));
|
||||
TV = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TD, TE));
|
||||
Tn = VSUB(Te, Th);
|
||||
To = VSUB(T7, Ta);
|
||||
Tp = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), To, Tn));
|
||||
TL = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, To));
|
||||
{
|
||||
V T17, TS, Tq, T10, TW, T12, TM, T16, TG, TO, TR, Tm, T1, TZ, TU;
|
||||
V TT, T11, TK, TJ, T15, TC, Tr, TN, TH, TP, T19, TI, T18, T14, TY;
|
||||
V TQ, T13, TX;
|
||||
/* Nine twiddles are loaded via LDW from W[TWVL * k], k = 0, 2, ..., 16,
   and applied with VZMUL / VZMULI. */
T17 = VADD(Ts, Tz);
|
||||
TR = LDW(&(W[TWVL * 8]));
|
||||
TS = VZMULI(TR, VADD(T4, Tj));
|
||||
Tm = VFNMS(LDK(KP559016994), Tl, Tk);
|
||||
T1 = LDW(&(W[TWVL * 4]));
|
||||
Tq = VZMULI(T1, VFMAI(Tp, Tm));
|
||||
TZ = LDW(&(W[TWVL * 12]));
|
||||
T10 = VZMULI(TZ, VFNMSI(Tp, Tm));
|
||||
TU = VFMA(LDK(KP559016994), TB, TA);
|
||||
TT = LDW(&(W[TWVL * 6]));
|
||||
TW = VZMUL(TT, VFNMSI(TV, TU));
|
||||
T11 = LDW(&(W[TWVL * 10]));
|
||||
T12 = VZMUL(T11, VFMAI(TV, TU));
|
||||
TK = VFMA(LDK(KP559016994), Tl, Tk);
|
||||
TJ = LDW(&(W[TWVL * 16]));
|
||||
TM = VZMULI(TJ, VFNMSI(TL, TK));
|
||||
T15 = LDW(&(W[0]));
|
||||
T16 = VZMULI(T15, VFMAI(TL, TK));
|
||||
TC = VFNMS(LDK(KP559016994), TB, TA);
|
||||
Tr = LDW(&(W[TWVL * 2]));
|
||||
TG = VZMUL(Tr, VFNMSI(TF, TC));
|
||||
TN = LDW(&(W[TWVL * 14]));
|
||||
TO = VZMUL(TN, VFMAI(TF, TC));
|
||||
/* Stores: each VADD of a twiddled pair goes to Rp, each VCONJ(VSUB(...))
   of the same pair goes to Rm. */
TH = VADD(Tq, TG);
|
||||
ST(&(Rp[WS(rs, 1)]), TH, ms, &(Rp[WS(rs, 1)]));
|
||||
TP = VADD(TM, TO);
|
||||
ST(&(Rp[WS(rs, 4)]), TP, ms, &(Rp[0]));
|
||||
T19 = VCONJ(VSUB(T17, T16));
|
||||
ST(&(Rm[0]), T19, -ms, &(Rm[0]));
|
||||
TI = VCONJ(VSUB(TG, Tq));
|
||||
ST(&(Rm[WS(rs, 1)]), TI, -ms, &(Rm[WS(rs, 1)]));
|
||||
T18 = VADD(T16, T17);
|
||||
ST(&(Rp[0]), T18, ms, &(Rp[0]));
|
||||
T14 = VCONJ(VSUB(T12, T10));
|
||||
ST(&(Rm[WS(rs, 3)]), T14, -ms, &(Rm[WS(rs, 1)]));
|
||||
TY = VCONJ(VSUB(TW, TS));
|
||||
ST(&(Rm[WS(rs, 2)]), TY, -ms, &(Rm[0]));
|
||||
TQ = VCONJ(VSUB(TO, TM));
|
||||
ST(&(Rm[WS(rs, 4)]), TQ, -ms, &(Rm[0]));
|
||||
T13 = VADD(T10, T12);
|
||||
ST(&(Rp[WS(rs, 3)]), T13, ms, &(Rp[WS(rs, 1)]));
|
||||
TX = VADD(TS, TW);
|
||||
ST(&(Rp[WS(rs, 2)]), TX, ms, &(Rp[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for the size-10 codelet (FMA branch): entries
   VTW(1, 1) .. VTW(1, 9), terminated by a { TW_NEXT, VL, 0 } entry.
   Referenced by the hc2c_desc below. */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor: size 10, name string, twiddle table, genus, and the
   operation counts { 33, 22, 28, 0 }, which match the "33 additions,
   22 multiplications, 28 fused multiply/add" figures in the generator
   comment for this branch. */
static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cbdftv_10"), twinstr, &GENUS, { 33, 22, 28, 0 } };
|
||||
|
||||
/* Registers hc2cbdftv_10 and its descriptor with planner p through
   X(khc2c_register), passing HC2C_VIA_DFT. */
void XSIMD(codelet_hc2cbdftv_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_10, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dif -sign 1 -name hc2cbdftv_10 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 61 FP additions, 30 FP multiplications,
|
||||
* (or, 55 additions, 24 multiplications, 6 fused multiply/add),
|
||||
* 81 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_10 -- non-FMA variant (the #else branch of the
 * ARCH_PREFERS_FMA / ISA_EXTENSION_PREFERS_FMA test).
 *
 * Rp, Rm: read and overwritten in place. Each iteration loads elements
 *         0..4 (rs-strided via WS) of both arrays and stores back to the
 *         same ten locations.
 * Ip, Im: advanced by the loop header together with Rp/Rm but never
 *         dereferenced in this body.
 * W:      twiddle array; offset by (mb - 1) * ((TWVL / VL) * 18) on entry
 *         and advanced by TWVL * 18 per iteration.
 * rs:     stride handed to WS() when indexing Rp/Rm.
 * mb, me: the loop runs m from mb while m < me, in steps of VL.
 * ms:     Rp/Ip move forward and Rm/Im move backward by VL * ms per
 *         iteration; also passed to LD/ST.
 *
 * NOTE(review): LD/ST/LDW and the V* arithmetic macros are defined in the
 * SIMD headers, not in this file; their exact semantics are not shown here.
 */
static void hc2cbdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
V T5, TE, Ts, Tt, TC, Tz, TH, TJ, To, Tq, T2, T4, T3, T9, Tx;
|
||||
V Tm, TB, Td, Ty, Ti, TA, T6, T8, T7, Tl, Tk, Tj, Tc, Tb, Ta;
|
||||
V Tf, Th, Tg, TF, TG, Te, Tn;
|
||||
/* Load the ten inputs; every value read from Rm is passed through VCONJ
   before being added to / subtracted from its Rp partner. */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
T4 = VCONJ(T3);
|
||||
T5 = VSUB(T2, T4);
|
||||
TE = VADD(T2, T4);
|
||||
T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T8 = VCONJ(T7);
|
||||
T9 = VSUB(T6, T8);
|
||||
Tx = VADD(T6, T8);
|
||||
Tl = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tk = VCONJ(Tj);
|
||||
Tm = VSUB(Tk, Tl);
|
||||
TB = VADD(Tk, Tl);
|
||||
Tc = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tb = VCONJ(Ta);
|
||||
Td = VSUB(Tb, Tc);
|
||||
Ty = VADD(Tb, Tc);
|
||||
Tf = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
Tg = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Th = VCONJ(Tg);
|
||||
Ti = VSUB(Tf, Th);
|
||||
TA = VADD(Tf, Th);
|
||||
Ts = VSUB(T9, Td);
|
||||
Tt = VSUB(Ti, Tm);
|
||||
TC = VSUB(TA, TB);
|
||||
Tz = VSUB(Tx, Ty);
|
||||
TF = VADD(Tx, Ty);
|
||||
TG = VADD(TA, TB);
|
||||
TH = VADD(TF, TG);
|
||||
TJ = VMUL(LDK(KP559016994), VSUB(TF, TG));
|
||||
Te = VADD(T9, Td);
|
||||
Tn = VADD(Ti, Tm);
|
||||
To = VADD(Te, Tn);
|
||||
Tq = VMUL(LDK(KP559016994), VSUB(Te, Tn));
|
||||
{
|
||||
V T1c, TX, Tv, T1b, TR, T15, TL, T17, TT, T11, TW, Tu, TQ, Tr, TP;
|
||||
V Tp, T1, T1a, TO, T14, TD, T10, TK, TZ, TI, Tw, T16, TS, TY, TM;
|
||||
V TU, T1e, TN, T1d, T19, T13, TV, T18, T12;
|
||||
/* Nine twiddles are loaded via LDW from W[TWVL * k], k = 0, 2, ..., 16,
   and applied with VZMUL / VZMULI. */
T1c = VADD(TE, TH);
|
||||
TW = LDW(&(W[TWVL * 8]));
|
||||
TX = VZMULI(TW, VADD(T5, To));
|
||||
Tu = VBYI(VFNMS(LDK(KP951056516), Tt, VMUL(LDK(KP587785252), Ts)));
|
||||
TQ = VBYI(VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tt)));
|
||||
Tp = VFNMS(LDK(KP250000000), To, T5);
|
||||
Tr = VSUB(Tp, Tq);
|
||||
TP = VADD(Tq, Tp);
|
||||
T1 = LDW(&(W[TWVL * 4]));
|
||||
Tv = VZMULI(T1, VSUB(Tr, Tu));
|
||||
T1a = LDW(&(W[0]));
|
||||
T1b = VZMULI(T1a, VADD(TQ, TP));
|
||||
TO = LDW(&(W[TWVL * 16]));
|
||||
TR = VZMULI(TO, VSUB(TP, TQ));
|
||||
T14 = LDW(&(W[TWVL * 12]));
|
||||
T15 = VZMULI(T14, VADD(Tu, Tr));
|
||||
TD = VBYI(VFNMS(LDK(KP951056516), TC, VMUL(LDK(KP587785252), Tz)));
|
||||
T10 = VBYI(VFMA(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), TC)));
|
||||
TI = VFNMS(LDK(KP250000000), TH, TE);
|
||||
TK = VSUB(TI, TJ);
|
||||
TZ = VADD(TJ, TI);
|
||||
Tw = LDW(&(W[TWVL * 2]));
|
||||
TL = VZMUL(Tw, VADD(TD, TK));
|
||||
T16 = LDW(&(W[TWVL * 10]));
|
||||
T17 = VZMUL(T16, VADD(T10, TZ));
|
||||
TS = LDW(&(W[TWVL * 14]));
|
||||
TT = VZMUL(TS, VSUB(TK, TD));
|
||||
TY = LDW(&(W[TWVL * 6]));
|
||||
T11 = VZMUL(TY, VSUB(TZ, T10));
|
||||
/* Stores: each VADD of a twiddled pair goes to Rp, each VCONJ(VSUB(...))
   of the same pair goes to Rm. */
TM = VADD(Tv, TL);
|
||||
ST(&(Rp[WS(rs, 1)]), TM, ms, &(Rp[WS(rs, 1)]));
|
||||
TU = VADD(TR, TT);
|
||||
ST(&(Rp[WS(rs, 4)]), TU, ms, &(Rp[0]));
|
||||
T1e = VCONJ(VSUB(T1c, T1b));
|
||||
ST(&(Rm[0]), T1e, -ms, &(Rm[0]));
|
||||
TN = VCONJ(VSUB(TL, Tv));
|
||||
ST(&(Rm[WS(rs, 1)]), TN, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1d = VADD(T1b, T1c);
|
||||
ST(&(Rp[0]), T1d, ms, &(Rp[0]));
|
||||
T19 = VCONJ(VSUB(T17, T15));
|
||||
ST(&(Rm[WS(rs, 3)]), T19, -ms, &(Rm[WS(rs, 1)]));
|
||||
T13 = VCONJ(VSUB(T11, TX));
|
||||
ST(&(Rm[WS(rs, 2)]), T13, -ms, &(Rm[0]));
|
||||
TV = VCONJ(VSUB(TT, TR));
|
||||
ST(&(Rm[WS(rs, 4)]), TV, -ms, &(Rm[0]));
|
||||
T18 = VADD(T15, T17);
|
||||
ST(&(Rp[WS(rs, 3)]), T18, ms, &(Rp[WS(rs, 1)]));
|
||||
T12 = VADD(TX, T11);
|
||||
ST(&(Rp[WS(rs, 2)]), T12, ms, &(Rp[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for the size-10 codelet (non-FMA branch):
   entries VTW(1, 1) .. VTW(1, 9), terminated by a { TW_NEXT, VL, 0 } entry.
   Referenced by the hc2c_desc below. */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor: size 10, name string, twiddle table, genus, and the
   operation counts { 55, 24, 6, 0 }, which match the "55 additions,
   24 multiplications, 6 fused multiply/add" figures in the generator
   comment for this branch. */
static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cbdftv_10"), twinstr, &GENUS, { 55, 24, 6, 0 } };
|
||||
|
||||
/* Registers the non-FMA hc2cbdftv_10 and its descriptor with planner p
   through X(khc2c_register), passing HC2C_VIA_DFT. */
void XSIMD(codelet_hc2cbdftv_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_10, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
327
fftw-3.3.10/rdft/simd/common/hc2cbdftv_12.c
Normal file
327
fftw-3.3.10/rdft/simd/common/hc2cbdftv_12.c
Normal file
@@ -0,0 +1,327 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dif -sign 1 -name hc2cbdftv_12 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 71 FP additions, 51 FP multiplications,
|
||||
* (or, 45 additions, 25 multiplications, 26 fused multiply/add),
|
||||
* 56 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_12 -- FMA variant (this branch is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Rp, Rm: read and overwritten in place. Each iteration loads elements
 *         0..5 (rs-strided via WS) of both arrays and stores back to the
 *         same twelve locations.
 * Ip, Im: advanced by the loop header together with Rp/Rm but never
 *         dereferenced in this body.
 * W:      twiddle array; offset by (mb - 1) * ((TWVL / VL) * 22) on entry
 *         and advanced by TWVL * 22 per iteration.
 * rs:     stride handed to WS() when indexing Rp/Rm.
 * mb, me: the loop runs m from mb while m < me, in steps of VL.
 * ms:     Rp/Ip move forward and Rm/Im move backward by VL * ms per
 *         iteration; also passed to LD/ST.
 *
 * NOTE(review): LD/ST/LDW and the V* arithmetic macros are defined in the
 * SIMD headers, not in this file; their exact semantics are not shown here.
 */
static void hc2cbdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
V Tk, Tw, Td, TA, T11, T1f, TF, TP, Tt, TB, TY, T1e;
|
||||
/* First scope: load the twelve inputs and reduce them to the twelve
   intermediates declared just above. */
{
|
||||
V T2, Tm, T7, T8, Tp, Tq, T5, Tu, Tg, Tr, Tj, Tn, Tb, Tv, T3;
|
||||
V T4, Te, Tf, Th, Ti, T9, Ta, T6, Tc, TZ, T10, TD, TE, To, Ts;
|
||||
V TW, TX;
|
||||
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
Tm = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T8 = VCONJ(T7);
|
||||
Tp = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
Tq = VCONJ(Tp);
|
||||
T3 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T4 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T5 = VFMACONJ(T4, T3);
|
||||
Tu = VFNMSCONJ(T4, T3);
|
||||
Te = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tf = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tg = VSUB(Te, Tf);
|
||||
Tr = VADD(Te, Tf);
|
||||
Th = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Ti = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
Tj = VSUB(Th, Ti);
|
||||
Tn = VADD(Ti, Th);
|
||||
T9 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tb = VFMACONJ(Ta, T9);
|
||||
Tv = VFMSCONJ(Ta, T9);
|
||||
Tk = VFMACONJ(Tj, Tg);
|
||||
Tw = VSUB(Tu, Tv);
|
||||
T6 = VFNMS(LDK(KP500000000), T5, T2);
|
||||
Tc = VFNMS(LDK(KP500000000), Tb, T8);
|
||||
Td = VSUB(T6, Tc);
|
||||
TA = VADD(T6, Tc);
|
||||
TZ = VFMACONJ(Tn, Tm);
|
||||
T10 = VFMACONJ(Tp, Tr);
|
||||
T11 = VSUB(TZ, T10);
|
||||
T1f = VADD(TZ, T10);
|
||||
TD = VFNMSCONJ(Tj, Tg);
|
||||
TE = VADD(Tu, Tv);
|
||||
TF = VMUL(LDK(KP866025403), VSUB(TD, TE));
|
||||
TP = VMUL(LDK(KP866025403), VADD(TE, TD));
|
||||
To = VFNMS(LDK(KP500000000), VCONJ(Tn), Tm);
|
||||
Ts = VFNMS(LDK(KP500000000), Tr, Tq);
|
||||
Tt = VSUB(To, Ts);
|
||||
TB = VADD(To, Ts);
|
||||
TW = VADD(T2, T5);
|
||||
TX = VFMACONJ(T7, Tb);
|
||||
TY = VSUB(TW, TX);
|
||||
T1e = VADD(TW, TX);
|
||||
}
|
||||
/* Second scope: eleven twiddles are loaded via LDW from W[TWVL * k],
   k = 0, 2, ..., 20, and applied with VZMUL / VZMULI. Each VADD of a
   twiddled pair is stored to Rp, each VCONJ(VSUB(...)) of the same pair
   to Rm. */
{
|
||||
V T1l, T12, TG, TU, Ty, T1k, TV, TC, Tz, TT, Tl, Tx, T1, T1j, TH;
|
||||
V TI, T1n, T1m, T14, T13, T18, T1g, TQ, T16, TM, T1c, T17, T1d, TO, TN;
|
||||
V T15, TK, TL, TJ, T1b, TR, TS, T1i, T1h, T1a, T19;
|
||||
T1l = VADD(T1e, T1f);
|
||||
TV = LDW(&(W[TWVL * 4]));
|
||||
T12 = VZMULI(TV, VFNMSI(T11, TY));
|
||||
TC = VSUB(TA, TB);
|
||||
Tz = LDW(&(W[TWVL * 18]));
|
||||
TG = VZMUL(Tz, VFNMSI(TF, TC));
|
||||
TT = LDW(&(W[TWVL * 2]));
|
||||
TU = VZMUL(TT, VFMAI(TF, TC));
|
||||
Tl = VFMA(LDK(KP866025403), Tk, Td);
|
||||
Tx = VFMA(LDK(KP866025403), Tw, Tt);
|
||||
T1 = LDW(&(W[TWVL * 20]));
|
||||
Ty = VZMULI(T1, VFNMSI(Tx, Tl));
|
||||
T1j = LDW(&(W[0]));
|
||||
T1k = VZMULI(T1j, VFMAI(Tx, Tl));
|
||||
TH = VADD(Ty, TG);
|
||||
ST(&(Rp[WS(rs, 5)]), TH, ms, &(Rp[WS(rs, 1)]));
|
||||
TI = VCONJ(VSUB(TG, Ty));
|
||||
ST(&(Rm[WS(rs, 5)]), TI, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1n = VCONJ(VSUB(T1l, T1k));
|
||||
ST(&(Rm[0]), T1n, -ms, &(Rm[0]));
|
||||
T1m = VADD(T1k, T1l);
|
||||
ST(&(Rp[0]), T1m, ms, &(Rp[0]));
|
||||
T14 = VADD(TU, T12);
|
||||
ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
|
||||
T13 = VCONJ(VSUB(TU, T12));
|
||||
ST(&(Rm[WS(rs, 1)]), T13, -ms, &(Rm[WS(rs, 1)]));
|
||||
T17 = LDW(&(W[TWVL * 16]));
|
||||
T18 = VZMULI(T17, VFMAI(T11, TY));
|
||||
T1d = LDW(&(W[TWVL * 10]));
|
||||
T1g = VZMUL(T1d, VSUB(T1e, T1f));
|
||||
TO = VADD(TA, TB);
|
||||
TN = LDW(&(W[TWVL * 6]));
|
||||
TQ = VZMUL(TN, VFMAI(TP, TO));
|
||||
T15 = LDW(&(W[TWVL * 14]));
|
||||
T16 = VZMUL(T15, VFNMSI(TP, TO));
|
||||
TK = VFNMS(LDK(KP866025403), Tk, Td);
|
||||
TL = VFNMS(LDK(KP866025403), Tw, Tt);
|
||||
TJ = LDW(&(W[TWVL * 8]));
|
||||
TM = VZMULI(TJ, VFMAI(TL, TK));
|
||||
T1b = LDW(&(W[TWVL * 12]));
|
||||
T1c = VZMULI(T1b, VFNMSI(TL, TK));
|
||||
TR = VADD(TM, TQ);
|
||||
ST(&(Rp[WS(rs, 2)]), TR, ms, &(Rp[0]));
|
||||
TS = VCONJ(VSUB(TQ, TM));
|
||||
ST(&(Rm[WS(rs, 2)]), TS, -ms, &(Rm[0]));
|
||||
T1i = VCONJ(VSUB(T1g, T1c));
|
||||
ST(&(Rm[WS(rs, 3)]), T1i, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1h = VADD(T1c, T1g);
|
||||
ST(&(Rp[WS(rs, 3)]), T1h, ms, &(Rp[WS(rs, 1)]));
|
||||
T1a = VADD(T16, T18);
|
||||
ST(&(Rp[WS(rs, 4)]), T1a, ms, &(Rp[0]));
|
||||
T19 = VCONJ(VSUB(T16, T18));
|
||||
ST(&(Rm[WS(rs, 4)]), T19, -ms, &(Rm[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for the size-12 codelet (FMA branch): entries
   VTW(1, 1) .. VTW(1, 11), terminated by a { TW_NEXT, VL, 0 } entry.
   Referenced by the hc2c_desc below. */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor: size 12, name string, twiddle table, genus, and the
   operation counts { 45, 25, 26, 0 }, which match the "45 additions,
   25 multiplications, 26 fused multiply/add" figures in the generator
   comment for this branch. */
static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cbdftv_12"), twinstr, &GENUS, { 45, 25, 26, 0 } };
|
||||
|
||||
/* Registers hc2cbdftv_12 and its descriptor with planner p through
   X(khc2c_register), passing HC2C_VIA_DFT. */
void XSIMD(codelet_hc2cbdftv_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_12, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dif -sign 1 -name hc2cbdftv_12 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 71 FP additions, 30 FP multiplications,
|
||||
* (or, 67 additions, 26 multiplications, 4 fused multiply/add),
|
||||
* 90 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_12 -- non-FMA variant (the #else branch of the
 * ARCH_PREFERS_FMA / ISA_EXTENSION_PREFERS_FMA test).
 *
 * Rp, Rm: read and overwritten in place. Each iteration loads elements
 *         0..5 (rs-strided via WS) of both arrays and stores back to the
 *         same twelve locations.
 * Ip, Im: advanced by the loop header together with Rp/Rm but never
 *         dereferenced in this body.
 * W:      twiddle array; offset by (mb - 1) * ((TWVL / VL) * 22) on entry
 *         and advanced by TWVL * 22 per iteration.
 * rs:     stride handed to WS() when indexing Rp/Rm.
 * mb, me: the loop runs m from mb while m < me, in steps of VL.
 * ms:     Rp/Ip move forward and Rm/Im move backward by VL * ms per
 *         iteration; also passed to LD/ST.
 *
 * NOTE(review): LD/ST/LDW and the V* arithmetic macros are defined in the
 * SIMD headers, not in this file; their exact semantics are not shown here.
 */
static void hc2cbdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
V TY, TZ, Tf, TC, Tq, TG, Tm, TF, Ty, TD, T13, T1h, T2, T9, T3;
|
||||
V T5, T6, Tc, Tb, Td, T8, T4, Ta, T7, Te, To, Tp, Tr, Tv, Ti;
|
||||
V Ts, Tl, Tw, Tu, Tg, Th, Tj, Tk, Tt, Tx, T11, T12;
|
||||
/* Load the twelve inputs; values read from Rm pass through VCONJ (directly
   or after a VADD/VSUB) before being combined with the Rp values. */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T8 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T9 = VCONJ(T8);
|
||||
T3 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T4 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T5 = VCONJ(T4);
|
||||
T6 = VADD(T3, T5);
|
||||
Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tb = VCONJ(Ta);
|
||||
Td = VADD(Tb, Tc);
|
||||
TY = VADD(T2, T6);
|
||||
TZ = VADD(T9, Td);
|
||||
T7 = VFNMS(LDK(KP500000000), T6, T2);
|
||||
Te = VFNMS(LDK(KP500000000), Td, T9);
|
||||
Tf = VSUB(T7, Te);
|
||||
TC = VADD(T7, Te);
|
||||
To = VSUB(T3, T5);
|
||||
Tp = VSUB(Tb, Tc);
|
||||
Tq = VMUL(LDK(KP866025403), VSUB(To, Tp));
|
||||
TG = VADD(To, Tp);
|
||||
Tr = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tu = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
Tv = VCONJ(Tu);
|
||||
Tg = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
Th = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Ti = VCONJ(VSUB(Tg, Th));
|
||||
Ts = VCONJ(VADD(Tg, Th));
|
||||
Tj = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tk = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tl = VSUB(Tj, Tk);
|
||||
Tw = VADD(Tj, Tk);
|
||||
Tm = VMUL(LDK(KP866025403), VSUB(Ti, Tl));
|
||||
TF = VADD(Ti, Tl);
|
||||
Tt = VFNMS(LDK(KP500000000), Ts, Tr);
|
||||
Tx = VFNMS(LDK(KP500000000), Tw, Tv);
|
||||
Ty = VSUB(Tt, Tx);
|
||||
TD = VADD(Tt, Tx);
|
||||
T11 = VADD(Tr, Ts);
|
||||
T12 = VADD(Tv, Tw);
|
||||
T13 = VBYI(VSUB(T11, T12));
|
||||
T1h = VADD(T11, T12);
|
||||
{
|
||||
V T1n, T1i, T14, T1a, TA, T1m, TS, T18, TO, T1e, TI, TW, T1g, T1f, T10;
|
||||
V TX, T19, Tn, Tz, T1, T1l, TQ, TR, TP, T17, TM, TN, TL, T1d, TE;
|
||||
V TH, TB, TV, TJ, T1p, T1k, TT, T1o, TK, TU, T1j, T1b, T16, T1c, T15;
|
||||
/* Eleven twiddles are loaded via LDW from W[TWVL * k], k = 0, 2, ..., 20,
   and applied with VZMUL / VZMULI. */
T1g = VADD(TY, TZ);
|
||||
T1n = VADD(T1g, T1h);
|
||||
T1f = LDW(&(W[TWVL * 10]));
|
||||
T1i = VZMUL(T1f, VSUB(T1g, T1h));
|
||||
T10 = VSUB(TY, TZ);
|
||||
TX = LDW(&(W[TWVL * 4]));
|
||||
T14 = VZMULI(TX, VSUB(T10, T13));
|
||||
T19 = LDW(&(W[TWVL * 16]));
|
||||
T1a = VZMULI(T19, VADD(T10, T13));
|
||||
Tn = VSUB(Tf, Tm);
|
||||
Tz = VBYI(VADD(Tq, Ty));
|
||||
T1 = LDW(&(W[TWVL * 20]));
|
||||
TA = VZMULI(T1, VSUB(Tn, Tz));
|
||||
T1l = LDW(&(W[0]));
|
||||
T1m = VZMULI(T1l, VADD(Tn, Tz));
|
||||
TQ = VBYI(VMUL(LDK(KP866025403), VADD(TG, TF)));
|
||||
TR = VADD(TC, TD);
|
||||
TP = LDW(&(W[TWVL * 6]));
|
||||
TS = VZMUL(TP, VADD(TQ, TR));
|
||||
T17 = LDW(&(W[TWVL * 14]));
|
||||
T18 = VZMUL(T17, VSUB(TR, TQ));
|
||||
TM = VADD(Tf, Tm);
|
||||
TN = VBYI(VSUB(Ty, Tq));
|
||||
TL = LDW(&(W[TWVL * 8]));
|
||||
TO = VZMULI(TL, VADD(TM, TN));
|
||||
T1d = LDW(&(W[TWVL * 12]));
|
||||
T1e = VZMULI(T1d, VSUB(TM, TN));
|
||||
TE = VSUB(TC, TD);
|
||||
TH = VBYI(VMUL(LDK(KP866025403), VSUB(TF, TG)));
|
||||
TB = LDW(&(W[TWVL * 18]));
|
||||
TI = VZMUL(TB, VSUB(TE, TH));
|
||||
TV = LDW(&(W[TWVL * 2]));
|
||||
TW = VZMUL(TV, VADD(TH, TE));
|
||||
/* Stores: each VADD of a twiddled pair goes to Rp, each VCONJ(VSUB(...))
   of the same pair goes to Rm. */
TJ = VADD(TA, TI);
|
||||
ST(&(Rp[WS(rs, 5)]), TJ, ms, &(Rp[WS(rs, 1)]));
|
||||
T1p = VCONJ(VSUB(T1n, T1m));
|
||||
ST(&(Rm[0]), T1p, -ms, &(Rm[0]));
|
||||
T1k = VCONJ(VSUB(T1i, T1e));
|
||||
ST(&(Rm[WS(rs, 3)]), T1k, -ms, &(Rm[WS(rs, 1)]));
|
||||
TT = VADD(TO, TS);
|
||||
ST(&(Rp[WS(rs, 2)]), TT, ms, &(Rp[0]));
|
||||
T1o = VADD(T1m, T1n);
|
||||
ST(&(Rp[0]), T1o, ms, &(Rp[0]));
|
||||
TK = VCONJ(VSUB(TI, TA));
|
||||
ST(&(Rm[WS(rs, 5)]), TK, -ms, &(Rm[WS(rs, 1)]));
|
||||
TU = VCONJ(VSUB(TS, TO));
|
||||
ST(&(Rm[WS(rs, 2)]), TU, -ms, &(Rm[0]));
|
||||
T1j = VADD(T1e, T1i);
|
||||
ST(&(Rp[WS(rs, 3)]), T1j, ms, &(Rp[WS(rs, 1)]));
|
||||
T1b = VCONJ(VSUB(T18, T1a));
|
||||
ST(&(Rm[WS(rs, 4)]), T1b, -ms, &(Rm[0]));
|
||||
T16 = VADD(TW, T14);
|
||||
ST(&(Rp[WS(rs, 1)]), T16, ms, &(Rp[WS(rs, 1)]));
|
||||
T1c = VADD(T18, T1a);
|
||||
ST(&(Rp[WS(rs, 4)]), T1c, ms, &(Rp[0]));
|
||||
T15 = VCONJ(VSUB(TW, T14));
|
||||
ST(&(Rm[WS(rs, 1)]), T15, -ms, &(Rm[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for the size-12 codelet (non-FMA branch):
   entries VTW(1, 1) .. VTW(1, 11), terminated by a { TW_NEXT, VL, 0 }
   entry. Referenced by the hc2c_desc below. */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor: size 12, name string, twiddle table, genus, and the
   operation counts { 67, 26, 4, 0 }, which match the "67 additions,
   26 multiplications, 4 fused multiply/add" figures in the generator
   comment for this branch. */
static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cbdftv_12"), twinstr, &GENUS, { 67, 26, 4, 0 } };
|
||||
|
||||
/* Registers the non-FMA hc2cbdftv_12 and its descriptor with planner p
   through X(khc2c_register), passing HC2C_VIA_DFT. */
void XSIMD(codelet_hc2cbdftv_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_12, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
428
fftw-3.3.10/rdft/simd/common/hc2cbdftv_16.c
Normal file
428
fftw-3.3.10/rdft/simd/common/hc2cbdftv_16.c
Normal file
@@ -0,0 +1,428 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 103 FP additions, 80 FP multiplications,
|
||||
* (or, 53 additions, 30 multiplications, 50 fused multiply/add),
|
||||
* 79 stack variables, 3 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_16 -- FMA variant (this branch is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Rp, Rm: read and overwritten in place. Each iteration loads elements
 *         0..7 (rs-strided via WS) of both arrays and stores back to the
 *         same sixteen locations.
 * Ip, Im: advanced by the loop header together with Rp/Rm but never
 *         dereferenced in this body.
 * W:      twiddle array; offset by (mb - 1) * ((TWVL / VL) * 30) on entry
 *         and advanced by TWVL * 30 per iteration.
 * rs:     stride handed to WS() when indexing Rp/Rm.
 * mb, me: the loop runs m from mb while m < me, in steps of VL.
 * ms:     Rp/Ip move forward and Rm/Im move backward by VL * ms per
 *         iteration; also passed to LD/ST.
 *
 * NOTE(review): LD/ST/LDW and the V* arithmetic macros are defined in the
 * SIMD headers, not in this file; their exact semantics are not shown here.
 */
static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
V T8, Tv, TE, T1t, TP, T1w, T10, T1p, Tn, Tw, T13, T1q, TL, T1x, TS;
|
||||
V T1u;
|
||||
/* First scope: load the sixteen inputs, combine each Rp/Rm pair through
   the VFMACONJ / VFNMSCONJ / VFMSCONJ macros, and reduce them to the
   sixteen intermediates declared just above. */
{
|
||||
V T4, TA, Tu, TC, T7, TN, Tr, TB, T2, T3, Ts, Tt, T5, T6, Tp;
|
||||
V Tq, TD, TO, TY, TZ, Tb, TF, Tl, TJ, Te, TG, Ti, TI, T9, Ta;
|
||||
V Tj, Tk, Tc, Td, Tg, Th, Tf, Tm, T11, T12, TH, TK, TQ, TR;
|
||||
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = VFMACONJ(T3, T2);
|
||||
TA = VFNMSCONJ(T3, T2);
|
||||
Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tu = VFMACONJ(Tt, Ts);
|
||||
TC = VFMSCONJ(Tt, Ts);
|
||||
T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T7 = VFMACONJ(T6, T5);
|
||||
TN = VFNMSCONJ(T6, T5);
|
||||
Tp = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Tq = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tr = VFMACONJ(Tq, Tp);
|
||||
TB = VFNMSCONJ(Tq, Tp);
|
||||
T8 = VSUB(T4, T7);
|
||||
Tv = VSUB(Tr, Tu);
|
||||
TD = VADD(TB, TC);
|
||||
TE = VFMA(LDK(KP707106781), TD, TA);
|
||||
T1t = VFNMS(LDK(KP707106781), TD, TA);
|
||||
TO = VSUB(TB, TC);
|
||||
TP = VFMA(LDK(KP707106781), TO, TN);
|
||||
T1w = VFNMS(LDK(KP707106781), TO, TN);
|
||||
TY = VADD(T4, T7);
|
||||
TZ = VADD(Tr, Tu);
|
||||
T10 = VADD(TY, TZ);
|
||||
T1p = VSUB(TY, TZ);
|
||||
T9 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Ta = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
Tb = VFMACONJ(Ta, T9);
|
||||
TF = VFNMSCONJ(Ta, T9);
|
||||
Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tk = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
Tl = VFMACONJ(Tk, Tj);
|
||||
TJ = VFNMSCONJ(Tk, Tj);
|
||||
Tc = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
Te = VFMACONJ(Td, Tc);
|
||||
TG = VFNMSCONJ(Td, Tc);
|
||||
Tg = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Th = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Ti = VFMACONJ(Th, Tg);
|
||||
TI = VFMSCONJ(Th, Tg);
|
||||
Tf = VSUB(Tb, Te);
|
||||
Tm = VSUB(Ti, Tl);
|
||||
Tn = VADD(Tf, Tm);
|
||||
Tw = VSUB(Tf, Tm);
|
||||
T11 = VADD(Tb, Te);
|
||||
T12 = VADD(Ti, Tl);
|
||||
T13 = VADD(T11, T12);
|
||||
T1q = VSUB(T11, T12);
|
||||
TH = VFNMS(LDK(KP414213562), TG, TF);
|
||||
TK = VFMA(LDK(KP414213562), TJ, TI);
|
||||
TL = VADD(TH, TK);
|
||||
T1x = VSUB(TH, TK);
|
||||
TQ = VFMA(LDK(KP414213562), TF, TG);
|
||||
TR = VFNMS(LDK(KP414213562), TI, TJ);
|
||||
TS = VADD(TQ, TR);
|
||||
T1u = VSUB(TQ, TR);
|
||||
}
|
||||
/* Second scope: fifteen twiddles are loaded via LDW from W[TWVL * k],
   k = 0, 2, ..., 28, and applied with VZMUL / VZMULI. Each VADD of a
   twiddled pair is stored to Rp, each VCONJ(VSUB(...)) of the same pair
   to Rm. */
{
|
||||
V T1j, T1R, T1c, T1J, T1g, T1l, T1N, T1T, T1Q, T1a, T1b, T19, T1I, T1e, T1f;
|
||||
V T1d, T1k, T1L, T1M, T1K, T1S, T1h, T1U, T1V, T1i, T1m, T1O, T1P, T1n, T14;
|
||||
V T1r, Ty, T1D, TU, T16, T1z, T1F, TX, T1o, To, Tx, T1, T1C, TM, TT;
|
||||
V Tz, T15, T1v, T1y, T1s, T1E, TV, T1G, T1H, TW, T17, T1A, T1B, T18;
|
||||
T1j = VADD(T10, T13);
|
||||
T1Q = LDW(&(W[TWVL * 22]));
|
||||
T1R = VZMUL(T1Q, VFNMSI(T1q, T1p));
|
||||
T1a = VFMA(LDK(KP707106781), Tn, T8);
|
||||
T1b = VFMA(LDK(KP707106781), Tw, Tv);
|
||||
T19 = LDW(&(W[TWVL * 26]));
|
||||
T1c = VZMUL(T19, VFNMSI(T1b, T1a));
|
||||
T1I = LDW(&(W[TWVL * 2]));
|
||||
T1J = VZMUL(T1I, VFMAI(T1b, T1a));
|
||||
T1e = VFMA(LDK(KP923879532), TL, TE);
|
||||
T1f = VFMA(LDK(KP923879532), TS, TP);
|
||||
T1d = LDW(&(W[TWVL * 28]));
|
||||
T1g = VZMULI(T1d, VFNMSI(T1f, T1e));
|
||||
T1k = LDW(&(W[0]));
|
||||
T1l = VZMULI(T1k, VFMAI(T1f, T1e));
|
||||
T1L = VFMA(LDK(KP923879532), T1u, T1t);
|
||||
T1M = VFNMS(LDK(KP923879532), T1x, T1w);
|
||||
T1K = LDW(&(W[TWVL * 4]));
|
||||
T1N = VZMULI(T1K, VFNMSI(T1M, T1L));
|
||||
T1S = LDW(&(W[TWVL * 24]));
|
||||
T1T = VZMULI(T1S, VFMAI(T1M, T1L));
|
||||
T1h = VCONJ(VSUB(T1c, T1g));
|
||||
ST(&(Rm[WS(rs, 7)]), T1h, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1U = VCONJ(VSUB(T1R, T1T));
|
||||
ST(&(Rm[WS(rs, 6)]), T1U, -ms, &(Rm[0]));
|
||||
T1V = VADD(T1R, T1T);
|
||||
ST(&(Rp[WS(rs, 6)]), T1V, ms, &(Rp[0]));
|
||||
T1i = VADD(T1c, T1g);
|
||||
ST(&(Rp[WS(rs, 7)]), T1i, ms, &(Rp[WS(rs, 1)]));
|
||||
T1m = VCONJ(VSUB(T1j, T1l));
|
||||
ST(&(Rm[0]), T1m, -ms, &(Rm[0]));
|
||||
T1O = VCONJ(VSUB(T1J, T1N));
|
||||
ST(&(Rm[WS(rs, 1)]), T1O, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1P = VADD(T1J, T1N);
|
||||
ST(&(Rp[WS(rs, 1)]), T1P, ms, &(Rp[WS(rs, 1)]));
|
||||
T1n = VADD(T1j, T1l);
|
||||
ST(&(Rp[0]), T1n, ms, &(Rp[0]));
|
||||
TX = LDW(&(W[TWVL * 14]));
|
||||
T14 = VZMUL(TX, VSUB(T10, T13));
|
||||
T1o = LDW(&(W[TWVL * 6]));
|
||||
T1r = VZMUL(T1o, VFMAI(T1q, T1p));
|
||||
To = VFNMS(LDK(KP707106781), Tn, T8);
|
||||
Tx = VFNMS(LDK(KP707106781), Tw, Tv);
|
||||
T1 = LDW(&(W[TWVL * 10]));
|
||||
Ty = VZMUL(T1, VFNMSI(Tx, To));
|
||||
T1C = LDW(&(W[TWVL * 18]));
|
||||
T1D = VZMUL(T1C, VFMAI(Tx, To));
|
||||
TM = VFNMS(LDK(KP923879532), TL, TE);
|
||||
TT = VFNMS(LDK(KP923879532), TS, TP);
|
||||
Tz = LDW(&(W[TWVL * 12]));
|
||||
TU = VZMULI(Tz, VFNMSI(TT, TM));
|
||||
T15 = LDW(&(W[TWVL * 16]));
|
||||
T16 = VZMULI(T15, VFMAI(TT, TM));
|
||||
T1v = VFNMS(LDK(KP923879532), T1u, T1t);
|
||||
T1y = VFMA(LDK(KP923879532), T1x, T1w);
|
||||
T1s = LDW(&(W[TWVL * 8]));
|
||||
T1z = VZMULI(T1s, VFMAI(T1y, T1v));
|
||||
T1E = LDW(&(W[TWVL * 20]));
|
||||
T1F = VZMULI(T1E, VFNMSI(T1y, T1v));
|
||||
TV = VCONJ(VSUB(Ty, TU));
|
||||
ST(&(Rm[WS(rs, 3)]), TV, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1G = VCONJ(VSUB(T1D, T1F));
|
||||
ST(&(Rm[WS(rs, 5)]), T1G, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1H = VADD(T1D, T1F);
|
||||
ST(&(Rp[WS(rs, 5)]), T1H, ms, &(Rp[WS(rs, 1)]));
|
||||
TW = VADD(Ty, TU);
|
||||
ST(&(Rp[WS(rs, 3)]), TW, ms, &(Rp[WS(rs, 1)]));
|
||||
T17 = VCONJ(VSUB(T14, T16));
|
||||
ST(&(Rm[WS(rs, 4)]), T17, -ms, &(Rm[0]));
|
||||
T1A = VCONJ(VSUB(T1r, T1z));
|
||||
ST(&(Rm[WS(rs, 2)]), T1A, -ms, &(Rm[0]));
|
||||
T1B = VADD(T1r, T1z);
|
||||
ST(&(Rp[WS(rs, 2)]), T1B, ms, &(Rp[0]));
|
||||
T18 = VADD(T14, T16);
|
||||
ST(&(Rp[WS(rs, 4)]), T18, ms, &(Rp[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/* Twiddle instruction table for the size-16 codelet (FMA branch): entries
   VTW(1, 1) .. VTW(1, 15), terminated by a { TW_NEXT, VL, 0 } entry.
   Referenced by the hc2c_desc below. */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor: size 16, name string, twiddle table, genus, and the
   operation counts { 53, 30, 50, 0 }, which match the "53 additions,
   30 multiplications, 50 fused multiply/add" figures in the generator
   comment for this branch. */
static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, { 53, 30, 50, 0 } };
|
||||
|
||||
/* Registers hc2cbdftv_16 and its descriptor with planner p through
   X(khc2c_register), passing HC2C_VIA_DFT. */
void XSIMD(codelet_hc2cbdftv_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 103 FP additions, 42 FP multiplications,
|
||||
* (or, 99 additions, 38 multiplications, 4 fused multiply/add),
|
||||
* 83 stack variables, 3 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_16 -- generated SIMD codelet, non-FMA variant (genfft flags are
 * listed in the "Generated by" comment above: -n 16 -dif -sign 1).
 *
 * For m = mb; m < me; m += VL, each iteration:
 *   - loads 8 vectors from Rp[WS(rs, 0..7)] and 8 from Rm[WS(rs, 0..7)],
 *     applying VCONJ to every Rm load;
 *   - combines them using the constants KP382683432, KP923879532 and
 *     KP707106781;
 *   - multiplies the results (VZMUL / VZMULI) by 15 twiddle vectors read
 *     with LDW from W[TWVL * k], k = 0, 2, ..., 28;
 *   - stores sums to Rp[WS(rs, 0..7)] and VCONJ of differences to
 *     Rm[WS(rs, 0..7)], i.e. the same locations that were loaded.
 *
 * Rp/Ip advance by VL * ms and Rm/Im retreat by VL * ms per iteration.  W is
 * first offset by (mb - 1) * ((TWVL / VL) * 30) and then advances by
 * TWVL * 30.  Ip and Im are only stepped, never dereferenced, in this body.
 *
 * The statement order is the generator's schedule; this file is marked
 * "automatically generated --- DO NOT EDIT", so regenerate rather than
 * hand-edit the arithmetic.
 */
static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me;
               m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms),
               Rm = Rm - (VL * ms), Im = Im - (VL * ms),
               W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
               /* Values produced by the load/butterfly stage and consumed
                  by the twiddle/store stage. */
               V Tf, T16, TZ, T1C, TI, T1a, TV, T1D, T1F, T1G, Ty, T19, TC, T17, TS;
               V T10;
               {
                    /* Stage 1: load inputs and form sums/differences. */
                    V T2, TD, T4, TF, Tc, Tb, Td, T6, T8, T9, T3, TE, Ta, T7, T5;
                    V Te, TX, TY, TG, TH, TT, TU, Tj, TM, Tw, TQ, Tn, TN, Ts, TP;
                    V Tg, Ti, Th, Tt, Tv, Tu, Tk, Tm, Tl, Tr, Tq, Tp, To, Tx, TA;
                    V TB, TO, TR;
                    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
                    TD = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
                    T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
                    T4 = VCONJ(T3);
                    TE = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
                    TF = VCONJ(TE);
                    Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
                    Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
                    Tb = VCONJ(Ta);
                    Td = VSUB(Tb, Tc);
                    T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
                    T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
                    T8 = VCONJ(T7);
                    T9 = VSUB(T6, T8);
                    T5 = VSUB(T2, T4);
                    Te = VMUL(LDK(KP707106781), VADD(T9, Td));
                    Tf = VADD(T5, Te);
                    T16 = VSUB(T5, Te);
                    TX = VADD(T2, T4);
                    TY = VADD(TD, TF);
                    TZ = VSUB(TX, TY);
                    T1C = VADD(TX, TY);
                    TG = VSUB(TD, TF);
                    TH = VMUL(LDK(KP707106781), VSUB(T9, Td));
                    TI = VADD(TG, TH);
                    T1a = VSUB(TH, TG);
                    TT = VADD(T6, T8);
                    TU = VADD(Tb, Tc);
                    TV = VSUB(TT, TU);
                    T1D = VADD(TT, TU);
                    Tg = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
                    Th = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
                    Ti = VCONJ(Th);
                    Tj = VSUB(Tg, Ti);
                    TM = VADD(Tg, Ti);
                    Tt = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
                    Tu = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
                    Tv = VCONJ(Tu);
                    Tw = VSUB(Tt, Tv);
                    TQ = VADD(Tt, Tv);
                    Tk = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
                    Tl = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
                    Tm = VCONJ(Tl);
                    Tn = VSUB(Tk, Tm);
                    TN = VADD(Tk, Tm);
                    Tr = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
                    Tp = LD(&(Rm[0]), -ms, &(Rm[0]));
                    Tq = VCONJ(Tp);
                    Ts = VSUB(Tq, Tr);
                    TP = VADD(Tq, Tr);
                    T1F = VADD(TM, TN);
                    T1G = VADD(TP, TQ);
                    To = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj));
                    Tx = VFMA(LDK(KP923879532), Ts, VMUL(LDK(KP382683432), Tw));
                    Ty = VADD(To, Tx);
                    T19 = VSUB(To, Tx);
                    TA = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn));
                    TB = VFNMS(LDK(KP382683432), Ts, VMUL(LDK(KP923879532), Tw));
                    TC = VADD(TA, TB);
                    T17 = VSUB(TA, TB);
                    TO = VSUB(TM, TN);
                    TR = VSUB(TP, TQ);
                    TS = VMUL(LDK(KP707106781), VSUB(TO, TR));
                    T10 = VMUL(LDK(KP707106781), VADD(TO, TR));
               }
               {
                    /* Stage 2: twiddle multiplications, then stores. */
                    V T21, T1W, T1u, T20, T1I, T1O, TK, T1S, T12, T1e, T1k, T1A, T1o, T1w, T1c;
                    V T1M, T1U, T1V, T1T, T1s, T1t, T1r, T1Z, T1E, T1H, T1B, T1N, Tz, TJ, T1;
                    V T1R, TW, T11, TL, T1d, T1i, T1j, T1h, T1z, T1m, T1n, T1l, T1v, T18, T1b;
                    V T15, T1L, T13, T1g, T1X, T23, T14, T1f, T1Y, T22, T1p, T1y, T1J, T1Q, T1q;
                    V T1x, T1K, T1P;
                    T1U = VADD(T1C, T1D);
                    T1V = VADD(T1F, T1G);
                    T21 = VADD(T1U, T1V);
                    T1T = LDW(&(W[TWVL * 14]));
                    T1W = VZMUL(T1T, VSUB(T1U, T1V));
                    T1s = VADD(Tf, Ty);
                    T1t = VBYI(VADD(TI, TC));
                    T1r = LDW(&(W[TWVL * 28]));
                    T1u = VZMULI(T1r, VSUB(T1s, T1t));
                    T1Z = LDW(&(W[0]));
                    T20 = VZMULI(T1Z, VADD(T1s, T1t));
                    T1E = VSUB(T1C, T1D);
                    T1H = VBYI(VSUB(T1F, T1G));
                    T1B = LDW(&(W[TWVL * 22]));
                    T1I = VZMUL(T1B, VSUB(T1E, T1H));
                    T1N = LDW(&(W[TWVL * 6]));
                    T1O = VZMUL(T1N, VADD(T1E, T1H));
                    Tz = VSUB(Tf, Ty);
                    TJ = VBYI(VSUB(TC, TI));
                    T1 = LDW(&(W[TWVL * 12]));
                    TK = VZMULI(T1, VADD(Tz, TJ));
                    T1R = LDW(&(W[TWVL * 16]));
                    T1S = VZMULI(T1R, VSUB(Tz, TJ));
                    TW = VBYI(VSUB(TS, TV));
                    T11 = VSUB(TZ, T10);
                    TL = LDW(&(W[TWVL * 10]));
                    T12 = VZMUL(TL, VADD(TW, T11));
                    T1d = LDW(&(W[TWVL * 18]));
                    T1e = VZMUL(T1d, VSUB(T11, TW));
                    T1i = VBYI(VADD(T1a, T19));
                    T1j = VADD(T16, T17);
                    T1h = LDW(&(W[TWVL * 4]));
                    T1k = VZMULI(T1h, VADD(T1i, T1j));
                    T1z = LDW(&(W[TWVL * 24]));
                    T1A = VZMULI(T1z, VSUB(T1j, T1i));
                    T1m = VBYI(VADD(TV, TS));
                    T1n = VADD(TZ, T10);
                    T1l = LDW(&(W[TWVL * 2]));
                    T1o = VZMUL(T1l, VADD(T1m, T1n));
                    T1v = LDW(&(W[TWVL * 26]));
                    T1w = VZMUL(T1v, VSUB(T1n, T1m));
                    T18 = VSUB(T16, T17);
                    T1b = VBYI(VSUB(T19, T1a));
                    T15 = LDW(&(W[TWVL * 20]));
                    T1c = VZMULI(T15, VSUB(T18, T1b));
                    T1L = LDW(&(W[TWVL * 8]));
                    T1M = VZMULI(T1L, VADD(T1b, T18));
                    /* Each twiddled pair yields a sum stored to Rp and a
                       conjugated difference stored to Rm. */
                    T13 = VADD(TK, T12);
                    ST(&(Rp[WS(rs, 3)]), T13, ms, &(Rp[WS(rs, 1)]));
                    T1g = VCONJ(VSUB(T1e, T1c));
                    ST(&(Rm[WS(rs, 5)]), T1g, -ms, &(Rm[WS(rs, 1)]));
                    T1X = VADD(T1S, T1W);
                    ST(&(Rp[WS(rs, 4)]), T1X, ms, &(Rp[0]));
                    T23 = VCONJ(VSUB(T21, T20));
                    ST(&(Rm[0]), T23, -ms, &(Rm[0]));
                    T14 = VCONJ(VSUB(T12, TK));
                    ST(&(Rm[WS(rs, 3)]), T14, -ms, &(Rm[WS(rs, 1)]));
                    T1f = VADD(T1c, T1e);
                    ST(&(Rp[WS(rs, 5)]), T1f, ms, &(Rp[WS(rs, 1)]));
                    T1Y = VCONJ(VSUB(T1W, T1S));
                    ST(&(Rm[WS(rs, 4)]), T1Y, -ms, &(Rm[0]));
                    T22 = VADD(T20, T21);
                    ST(&(Rp[0]), T22, ms, &(Rp[0]));
                    T1p = VADD(T1k, T1o);
                    ST(&(Rp[WS(rs, 1)]), T1p, ms, &(Rp[WS(rs, 1)]));
                    T1y = VCONJ(VSUB(T1w, T1u));
                    ST(&(Rm[WS(rs, 7)]), T1y, -ms, &(Rm[WS(rs, 1)]));
                    T1J = VADD(T1A, T1I);
                    ST(&(Rp[WS(rs, 6)]), T1J, ms, &(Rp[0]));
                    T1Q = VCONJ(VSUB(T1O, T1M));
                    ST(&(Rm[WS(rs, 2)]), T1Q, -ms, &(Rm[0]));
                    T1q = VCONJ(VSUB(T1o, T1k));
                    ST(&(Rm[WS(rs, 1)]), T1q, -ms, &(Rm[WS(rs, 1)]));
                    T1x = VADD(T1u, T1w);
                    ST(&(Rp[WS(rs, 7)]), T1x, ms, &(Rp[WS(rs, 1)]));
                    T1K = VCONJ(VSUB(T1I, T1A));
                    ST(&(Rm[WS(rs, 6)]), T1K, -ms, &(Rm[0]));
                    T1P = VADD(T1M, T1O);
                    ST(&(Rp[WS(rs, 2)]), T1P, ms, &(Rp[0]));
               }
          }
     }
     VLEAVE();
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to khc2c_register for the non-FMA variant of
 * hc2cbdftv_16: 16, the codelet name string, the twinstr table, the genus,
 * and four counts.  The first three counts (99, 38, 4) equal the additions,
 * multiplications and fused multiply/adds quoted in this variant's
 * statistics comment.  NOTE(review): 16 is presumably the transform size and
 * the fourth count an "other operations" slot -- confirm against hc2c_desc.
 */
static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, { 99, 38, 4, 0 } };
|
||||
|
||||
/*
 * Registers the non-FMA variant of hc2cbdftv_16 with planner `p`: passes the
 * kernel, its descriptor `desc` and the HC2C_VIA_DFT tag to khc2c_register.
 */
void XSIMD(codelet_hc2cbdftv_16) (planner *p) {
     X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
109
fftw-3.3.10/rdft/simd/common/hc2cbdftv_2.c
Normal file
109
fftw-3.3.10/rdft/simd/common/hc2cbdftv_2.c
Normal file
@@ -0,0 +1,109 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 2 -dif -sign 1 -name hc2cbdftv_2 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 5 FP additions, 4 FP multiplications,
|
||||
* (or, 3 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 8 stack variables, 0 constants, and 4 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_2 -- generated SIMD codelet, FMA variant (genfft flags are
 * listed in the "Generated by" comment above: -fma -n 2 -dif -sign 1).
 *
 * For m = mb; m < me; m += VL, each iteration loads one vector from Rp[0]
 * and one from Rm[0], combines them with VFMACONJ / VFNMSCONJ, multiplies
 * the second combination by the twiddle vector read from W[0] (VZMULI), then
 * stores the sum of both results to Rp[0] and the VCONJ of their difference
 * to Rm[0].
 * NOTE(review): VFMACONJ(T3, T2) / VFNMSCONJ(T3, T2) presumably compute
 * T2 + conj(T3) and T2 - conj(T3), as spelled out with VCONJ/VADD/VSUB in
 * the non-FMA variant of this file -- confirm against the SIMD header.
 *
 * Rp/Ip advance by VL * ms and Rm/Im retreat by VL * ms per iteration.  W is
 * first offset by (mb - 1) * ((TWVL / VL) * 2) and then advances by
 * TWVL * 2.  Ip and Im are only stepped, never dereferenced, in this body.
 */
static void hc2cbdftv_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 2)); m < me;
               m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms),
               Rm = Rm - (VL * ms), Im = Im - (VL * ms),
               W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(8, rs)) {
               V T5, T4, T2, T3, T1, T6, T7;
               T2 = LD(&(Rp[0]), ms, &(Rp[0]));
               T3 = LD(&(Rm[0]), -ms, &(Rm[0]));
               T5 = VFMACONJ(T3, T2);
               T1 = LDW(&(W[0]));
               T4 = VZMULI(T1, VFNMSCONJ(T3, T2));
               T6 = VADD(T4, T5);
               ST(&(Rp[0]), T6, ms, &(Rp[0]));
               T7 = VCONJ(VSUB(T5, T4));
               ST(&(Rm[0]), T7, -ms, &(Rm[0]));
          }
     }
     VLEAVE();
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to khc2c_register for the FMA variant of hc2cbdftv_2:
 * 2, the codelet name string, the twinstr table, the genus, and four counts.
 * The first three counts (3, 2, 2) equal the additions, multiplications and
 * fused multiply/adds quoted in this variant's statistics comment.
 * NOTE(review): 2 is presumably the transform size and the fourth count an
 * "other operations" slot -- confirm against hc2c_desc.
 */
static const hc2c_desc desc = { 2, XSIMD_STRING("hc2cbdftv_2"), twinstr, &GENUS, { 3, 2, 2, 0 } };
|
||||
|
||||
/*
 * Registers the FMA variant of hc2cbdftv_2 with planner `p`: passes the
 * kernel, its descriptor `desc` and the HC2C_VIA_DFT tag to khc2c_register.
 */
void XSIMD(codelet_hc2cbdftv_2) (planner *p) {
     X(khc2c_register) (p, hc2cbdftv_2, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 2 -dif -sign 1 -name hc2cbdftv_2 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 5 FP additions, 2 FP multiplications,
|
||||
* (or, 5 additions, 2 multiplications, 0 fused multiply/add),
|
||||
* 9 stack variables, 0 constants, and 4 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_2 -- generated SIMD codelet, non-FMA variant (genfft flags are
 * listed in the "Generated by" comment above: -n 2 -dif -sign 1).
 *
 * For m = mb; m < me; m += VL, each iteration:
 *   - loads T2 from Rp[0] and T3 from Rm[0], and conjugates T3 into T4;
 *   - forms T6 = T2 + T4 and T5 = VZMULI(twiddle W[0], T2 - T4);
 *   - stores T5 + T6 to Rp[0] and VCONJ(T6 - T5) to Rm[0].
 *
 * Rp/Ip advance by VL * ms and Rm/Im retreat by VL * ms per iteration.  W is
 * first offset by (mb - 1) * ((TWVL / VL) * 2) and then advances by
 * TWVL * 2.  Ip and Im are only stepped, never dereferenced, in this body.
 */
static void hc2cbdftv_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 2)); m < me;
               m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms),
               Rm = Rm - (VL * ms), Im = Im - (VL * ms),
               W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(8, rs)) {
               V T6, T5, T2, T4, T3, T1, T7, T8;
               T2 = LD(&(Rp[0]), ms, &(Rp[0]));
               T3 = LD(&(Rm[0]), -ms, &(Rm[0]));
               T4 = VCONJ(T3);
               T6 = VADD(T2, T4);
               T1 = LDW(&(W[0]));
               T5 = VZMULI(T1, VSUB(T2, T4));
               T7 = VADD(T5, T6);
               ST(&(Rp[0]), T7, ms, &(Rp[0]));
               T8 = VCONJ(VSUB(T6, T5));
               ST(&(Rm[0]), T8, -ms, &(Rm[0]));
          }
     }
     VLEAVE();
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to khc2c_register for the non-FMA variant of
 * hc2cbdftv_2: 2, the codelet name string, the twinstr table, the genus, and
 * four counts.  The first three counts (5, 2, 0) equal the additions,
 * multiplications and fused multiply/adds quoted in this variant's
 * statistics comment.  NOTE(review): 2 is presumably the transform size and
 * the fourth count an "other operations" slot -- confirm against hc2c_desc.
 */
static const hc2c_desc desc = { 2, XSIMD_STRING("hc2cbdftv_2"), twinstr, &GENUS, { 5, 2, 0, 0 } };
|
||||
|
||||
/*
 * Registers the non-FMA variant of hc2cbdftv_2 with planner `p`: passes the
 * kernel, its descriptor `desc` and the HC2C_VIA_DFT tag to khc2c_register.
 */
void XSIMD(codelet_hc2cbdftv_2) (planner *p) {
     X(khc2c_register) (p, hc2cbdftv_2, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
545
fftw-3.3.10/rdft/simd/common/hc2cbdftv_20.c
Normal file
545
fftw-3.3.10/rdft/simd/common/hc2cbdftv_20.c
Normal file
@@ -0,0 +1,545 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 143 FP additions, 108 FP multiplications,
|
||||
* (or, 77 additions, 42 multiplications, 66 fused multiply/add),
|
||||
* 110 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_20 -- generated SIMD codelet, FMA variant (genfft flags are
 * listed in the "Generated by" comment above: -fma -n 20 -dif -sign 1).
 *
 * For m = mb; m < me; m += VL, each iteration:
 *   - loads 10 vectors from Rp[WS(rs, 0..9)] and 10 from Rm[WS(rs, 0..9)]
 *     and combines each Rp/Rm pair with VFMACONJ / VFNMSCONJ / VFMSCONJ;
 *   - combines the results using the constants KP559016994, KP951056516,
 *     KP618033988 and KP250000000;
 *   - multiplies the results (VZMUL / VZMULI) by 19 twiddle vectors read
 *     with LDW from W[TWVL * k], k = 0, 2, ..., 36;
 *   - stores sums to Rp[WS(rs, 0..9)] and VCONJ of differences to
 *     Rm[WS(rs, 0..9)], i.e. the same locations that were loaded.
 *
 * Rp/Ip advance by VL * ms and Rm/Im retreat by VL * ms per iteration.  W is
 * first offset by (mb - 1) * ((TWVL / VL) * 38) and then advances by
 * TWVL * 38.  Ip and Im are only stepped, never dereferenced, in this body.
 *
 * The statement order is the generator's schedule; this file is marked
 * "automatically generated --- DO NOT EDIT", so regenerate rather than
 * hand-edit the arithmetic.
 */
static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me;
               m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms),
               Rm = Rm - (VL * ms), Im = Im - (VL * ms),
               W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
               /* Values produced by the load/butterfly stage and consumed
                  by the twiddle/store stage. */
               V T4, TF, Tl, T2a, T1d, T1Y, T29, TK, TU, T1e, Tj, Tk, TI, TJ, T19;
               V T1b, T25, T27, TB, T1l, TO, T1o;
               {
                    /* Stage 1: load inputs and form sums/differences. */
                    V TS, TT, T7, Tz, Ta, Tw, Tb, TG, T20, T1Z, T10, TX, Te, Ts, Th;
                    V Tp, Ti, TH, T23, T22, T17, T14, T2, T3, TD, TE, TV, TZ, TY, TW;
                    V T5, T6, Tx, Ty, T8, T9, Tu, Tv, T12, T16, T15, T13, Tc, Td, Tq;
                    V Tr, Tf, Tg, Tn, To, T11, T18, T21, T24, Tt, TA, TM, TN;
                    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
                    T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
                    T4 = VFNMSCONJ(T3, T2);
                    TS = VFMACONJ(T3, T2);
                    TD = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
                    TE = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
                    TF = VFNMSCONJ(TE, TD);
                    TT = VFMACONJ(TE, TD);
                    T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
                    T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
                    T7 = VFNMSCONJ(T6, T5);
                    TV = VFMACONJ(T6, T5);
                    Tx = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
                    Ty = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
                    Tz = VFNMSCONJ(Ty, Tx);
                    TZ = VFMACONJ(Ty, Tx);
                    T8 = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
                    T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
                    Ta = VFMSCONJ(T9, T8);
                    TY = VFMACONJ(T9, T8);
                    Tu = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
                    Tv = LD(&(Rm[0]), -ms, &(Rm[0]));
                    Tw = VFNMSCONJ(Tv, Tu);
                    TW = VFMACONJ(Tv, Tu);
                    Tb = VADD(T7, Ta);
                    TG = VADD(Tw, Tz);
                    T20 = VADD(TY, TZ);
                    T1Z = VADD(TV, TW);
                    T10 = VSUB(TY, TZ);
                    TX = VSUB(TV, TW);
                    Tc = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
                    Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
                    Te = VFNMSCONJ(Td, Tc);
                    T12 = VFMACONJ(Td, Tc);
                    Tq = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
                    Tr = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
                    Ts = VFMSCONJ(Tr, Tq);
                    T16 = VFMACONJ(Tr, Tq);
                    Tf = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
                    Tg = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
                    Th = VFMSCONJ(Tg, Tf);
                    T15 = VFMACONJ(Tg, Tf);
                    Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
                    To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
                    Tp = VFMSCONJ(To, Tn);
                    T13 = VFMACONJ(To, Tn);
                    Ti = VADD(Te, Th);
                    TH = VADD(Tp, Ts);
                    T23 = VADD(T15, T16);
                    T22 = VADD(T12, T13);
                    T17 = VSUB(T15, T16);
                    T14 = VSUB(T12, T13);
                    Tl = VSUB(Tb, Ti);
                    T2a = VSUB(T22, T23);
                    T1d = VSUB(T14, T17);
                    T1Y = VADD(TS, TT);
                    T29 = VSUB(T1Z, T20);
                    TK = VSUB(TG, TH);
                    TU = VSUB(TS, TT);
                    T1e = VSUB(TX, T10);
                    Tj = VADD(Tb, Ti);
                    Tk = VFNMS(LDK(KP250000000), Tj, T4);
                    TI = VADD(TG, TH);
                    TJ = VFNMS(LDK(KP250000000), TI, TF);
                    T11 = VADD(TX, T10);
                    T18 = VADD(T14, T17);
                    T19 = VADD(T11, T18);
                    T1b = VSUB(T11, T18);
                    T21 = VADD(T1Z, T20);
                    T24 = VADD(T22, T23);
                    T25 = VADD(T21, T24);
                    T27 = VSUB(T21, T24);
                    Tt = VSUB(Tp, Ts);
                    TA = VSUB(Tw, Tz);
                    TB = VFNMS(LDK(KP618033988), TA, Tt);
                    T1l = VFMA(LDK(KP618033988), Tt, TA);
                    TM = VSUB(Te, Th);
                    TN = VSUB(T7, Ta);
                    TO = VFNMS(LDK(KP618033988), TN, TM);
                    T1o = VFMA(LDK(KP618033988), TM, TN);
               }
               {
                    /* Stage 2: twiddle multiplications, then stores. */
                    V T2B, T1S, T1I, T1W, T2c, T2w, T2i, T2q, T1g, T1K, T1s, T1C, T1q, T2A, T1Q;
                    V T2m, TQ, T2u, T1y, T2g, T1R, T1G, T1H, T1F, T1V, T1h, T1i, T2s, T2D, T1D;
                    V T2x, T2y, T2C, T1u, T1t, T1E, T1L, T2d, T2r, T1U, T2e, T2j, T2k, T1T, T1M;
                    T2B = VADD(T1Y, T25);
                    T1R = LDW(&(W[TWVL * 18]));
                    T1S = VZMUL(T1R, VADD(TU, T19));
                    T1G = VADD(T4, Tj);
                    T1H = VADD(TF, TI);
                    T1F = LDW(&(W[TWVL * 28]));
                    T1I = VZMULI(T1F, VFNMSI(T1H, T1G));
                    T1V = LDW(&(W[TWVL * 8]));
                    T1W = VZMULI(T1V, VFMAI(T1H, T1G));
                    {
                         V T2b, T2p, T28, T2o, T26, T1X, T2v, T2h, T2n, T1f, T1B, T1c, T1A, T1a, TR;
                         V T1J, T1r, T1z, T1m, T1O, T1p, T1P, T1k, T1n, T1j, T2z, T1N, T2l, TC, T1w;
                         V TP, T1x, Tm, TL, T1, T2t, T1v, T2f;
                         T2b = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2a, T29));
                         T2p = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T29, T2a));
                         T26 = VFNMS(LDK(KP250000000), T25, T1Y);
                         T28 = VFMA(LDK(KP559016994), T27, T26);
                         T2o = VFNMS(LDK(KP559016994), T27, T26);
                         T1X = LDW(&(W[TWVL * 6]));
                         T2c = VZMUL(T1X, VFNMSI(T2b, T28));
                         T2v = LDW(&(W[TWVL * 22]));
                         T2w = VZMUL(T2v, VFNMSI(T2p, T2o));
                         T2h = LDW(&(W[TWVL * 30]));
                         T2i = VZMUL(T2h, VFMAI(T2b, T28));
                         T2n = LDW(&(W[TWVL * 14]));
                         T2q = VZMUL(T2n, VFMAI(T2p, T2o));
                         T1f = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1e, T1d));
                         T1B = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1d, T1e));
                         T1a = VFNMS(LDK(KP250000000), T19, TU);
                         T1c = VFNMS(LDK(KP559016994), T1b, T1a);
                         T1A = VFMA(LDK(KP559016994), T1b, T1a);
                         TR = LDW(&(W[TWVL * 2]));
                         T1g = VZMUL(TR, VFNMSI(T1f, T1c));
                         T1J = LDW(&(W[TWVL * 26]));
                         T1K = VZMUL(T1J, VFNMSI(T1B, T1A));
                         T1r = LDW(&(W[TWVL * 34]));
                         T1s = VZMUL(T1r, VFMAI(T1f, T1c));
                         T1z = LDW(&(W[TWVL * 10]));
                         T1C = VZMUL(T1z, VFMAI(T1B, T1A));
                         T1k = VFMA(LDK(KP559016994), Tl, Tk);
                         T1m = VFNMS(LDK(KP951056516), T1l, T1k);
                         T1O = VFMA(LDK(KP951056516), T1l, T1k);
                         T1n = VFMA(LDK(KP559016994), TK, TJ);
                         T1p = VFMA(LDK(KP951056516), T1o, T1n);
                         T1P = VFNMS(LDK(KP951056516), T1o, T1n);
                         T1j = LDW(&(W[TWVL * 36]));
                         T1q = VZMULI(T1j, VFNMSI(T1p, T1m));
                         T2z = LDW(&(W[0]));
                         T2A = VZMULI(T2z, VFMAI(T1p, T1m));
                         T1N = LDW(&(W[TWVL * 20]));
                         T1Q = VZMULI(T1N, VFNMSI(T1P, T1O));
                         T2l = LDW(&(W[TWVL * 16]));
                         T2m = VZMULI(T2l, VFMAI(T1P, T1O));
                         Tm = VFNMS(LDK(KP559016994), Tl, Tk);
                         TC = VFMA(LDK(KP951056516), TB, Tm);
                         T1w = VFNMS(LDK(KP951056516), TB, Tm);
                         TL = VFNMS(LDK(KP559016994), TK, TJ);
                         TP = VFNMS(LDK(KP951056516), TO, TL);
                         T1x = VFMA(LDK(KP951056516), TO, TL);
                         T1 = LDW(&(W[TWVL * 4]));
                         TQ = VZMULI(T1, VFNMSI(TP, TC));
                         T2t = LDW(&(W[TWVL * 24]));
                         T2u = VZMULI(T2t, VFMAI(T1x, T1w));
                         T1v = LDW(&(W[TWVL * 12]));
                         T1y = VZMULI(T1v, VFNMSI(T1x, T1w));
                         T2f = LDW(&(W[TWVL * 32]));
                         T2g = VZMULI(T2f, VFMAI(TP, TC));
                    }
                    /* Each twiddled pair yields a sum stored to Rp and a
                       conjugated difference stored to Rm. */
                    T1h = VADD(TQ, T1g);
                    ST(&(Rp[WS(rs, 1)]), T1h, ms, &(Rp[WS(rs, 1)]));
                    T1i = VCONJ(VSUB(T1g, TQ));
                    ST(&(Rm[WS(rs, 1)]), T1i, -ms, &(Rm[WS(rs, 1)]));
                    T2s = VCONJ(VSUB(T2q, T2m));
                    ST(&(Rm[WS(rs, 4)]), T2s, -ms, &(Rm[0]));
                    T2D = VCONJ(VSUB(T2B, T2A));
                    ST(&(Rm[0]), T2D, -ms, &(Rm[0]));
                    T1D = VADD(T1y, T1C);
                    ST(&(Rp[WS(rs, 3)]), T1D, ms, &(Rp[WS(rs, 1)]));
                    T2x = VADD(T2u, T2w);
                    ST(&(Rp[WS(rs, 6)]), T2x, ms, &(Rp[0]));
                    T2y = VCONJ(VSUB(T2w, T2u));
                    ST(&(Rm[WS(rs, 6)]), T2y, -ms, &(Rm[0]));
                    T2C = VADD(T2A, T2B);
                    ST(&(Rp[0]), T2C, ms, &(Rp[0]));
                    T1u = VCONJ(VSUB(T1s, T1q));
                    ST(&(Rm[WS(rs, 9)]), T1u, -ms, &(Rm[WS(rs, 1)]));
                    T1t = VADD(T1q, T1s);
                    ST(&(Rp[WS(rs, 9)]), T1t, ms, &(Rp[WS(rs, 1)]));
                    T1E = VCONJ(VSUB(T1C, T1y));
                    ST(&(Rm[WS(rs, 3)]), T1E, -ms, &(Rm[WS(rs, 1)]));
                    T1L = VADD(T1I, T1K);
                    ST(&(Rp[WS(rs, 7)]), T1L, ms, &(Rp[WS(rs, 1)]));
                    T2d = VADD(T1W, T2c);
                    ST(&(Rp[WS(rs, 2)]), T2d, ms, &(Rp[0]));
                    T2r = VADD(T2m, T2q);
                    ST(&(Rp[WS(rs, 4)]), T2r, ms, &(Rp[0]));
                    T1U = VCONJ(VSUB(T1S, T1Q));
                    ST(&(Rm[WS(rs, 5)]), T1U, -ms, &(Rm[WS(rs, 1)]));
                    T2e = VCONJ(VSUB(T2c, T1W));
                    ST(&(Rm[WS(rs, 2)]), T2e, -ms, &(Rm[0]));
                    T2j = VADD(T2g, T2i);
                    ST(&(Rp[WS(rs, 8)]), T2j, ms, &(Rp[0]));
                    T2k = VCONJ(VSUB(T2i, T2g));
                    ST(&(Rm[WS(rs, 8)]), T2k, -ms, &(Rm[0]));
                    T1T = VADD(T1Q, T1S);
                    ST(&(Rp[WS(rs, 5)]), T1T, ms, &(Rp[WS(rs, 1)]));
                    T1M = VCONJ(VSUB(T1K, T1I));
                    ST(&(Rm[WS(rs, 7)]), T1M, -ms, &(Rm[WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
VTW(1, 16),
|
||||
VTW(1, 17),
|
||||
VTW(1, 18),
|
||||
VTW(1, 19),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to khc2c_register for the FMA variant of hc2cbdftv_20:
 * 20, the codelet name string, the twinstr table, the genus, and four
 * counts.  The first three counts (77, 42, 66) equal the additions,
 * multiplications and fused multiply/adds quoted in this variant's
 * statistics comment.  NOTE(review): 20 is presumably the transform size and
 * the fourth count an "other operations" slot -- confirm against hc2c_desc.
 */
static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cbdftv_20"), twinstr, &GENUS, { 77, 42, 66, 0 } };
|
||||
|
||||
/*
 * Registers the FMA variant of hc2cbdftv_20 with planner `p`: passes the
 * kernel, its descriptor `desc` and the HC2C_VIA_DFT tag to khc2c_register.
 */
void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
     X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 143 FP additions, 62 FP multiplications,
|
||||
* (or, 131 additions, 50 multiplications, 12 fused multiply/add),
|
||||
* 114 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_20 -- generated SIMD codelet, non-FMA variant (genfft flags are
 * listed in the "Generated by" comment above: -n 20 -dif -sign 1).
 *
 * For m = mb; m < me; m += VL, each iteration:
 *   - loads 10 vectors from Rp[WS(rs, 0..9)] and 10 from Rm[WS(rs, 0..9)],
 *     applying VCONJ to every Rm load;
 *   - combines them using the constants KP250000000, KP559016994,
 *     KP951056516 and KP587785252;
 *   - multiplies the results (VZMUL / VZMULI) by 19 twiddle vectors read
 *     with LDW from W[TWVL * k], k = 0, 2, ..., 36;
 *   - stores sums to Rp[WS(rs, 0..9)] and VCONJ of differences to
 *     Rm[WS(rs, 0..9)], i.e. the same locations that were loaded.
 *
 * Rp/Ip advance by VL * ms and Rm/Im retreat by VL * ms per iteration.  W is
 * first offset by (mb - 1) * ((TWVL / VL) * 38) and then advances by
 * TWVL * 38.  Ip and Im are only stepped, never dereferenced, in this body.
 *
 * The statement order is the generator's schedule; this file is marked
 * "automatically generated --- DO NOT EDIT", so regenerate rather than
 * hand-edit the arithmetic.
 */
static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me;
               m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms),
               Rm = Rm - (VL * ms), Im = Im - (VL * ms),
               W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
               /* Values produced by the load/butterfly stage and consumed
                  by the twiddle/store stage. */
               V TK, T1v, TY, T1x, T1j, T2f, TS, TT, TO, TU, T5, To, Tp, Tq, T2a;
               V T2d, T2g, T2k, T2j, T1k, T1l, T18, T1m, T1f;
               {
                    /* Stage 1: load inputs and form sums/differences. */
                    V T2, TP, T4, TR, TI, T1d, T9, T12, Td, T15, TE, T1a, Tv, T13, Tm;
                    V T1c, Tz, T16, Ti, T19, T3, TQ, TH, TG, TF, T6, T8, T7, Tc, Tb;
                    V Ta, TD, TC, TB, Ts, Tu, Tt, Tl, Tk, Tj, Tw, Ty, Tx, Tf, Th;
                    V Tg, TA, TJ, TW, TX, T1h, T1i, TM, TN, Te, Tn, T28, T29, T2b, T2c;
                    V T14, T17, T1b, T1e;
                    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
                    TP = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
                    T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
                    T4 = VCONJ(T3);
                    TQ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
                    TR = VCONJ(TQ);
                    TH = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
                    TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
                    TG = VCONJ(TF);
                    TI = VSUB(TG, TH);
                    T1d = VADD(TG, TH);
                    T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
                    T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
                    T8 = VCONJ(T7);
                    T9 = VSUB(T6, T8);
                    T12 = VADD(T6, T8);
                    Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
                    Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
                    Tb = VCONJ(Ta);
                    Td = VSUB(Tb, Tc);
                    T15 = VADD(Tb, Tc);
                    TD = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
                    TB = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
                    TC = VCONJ(TB);
                    TE = VSUB(TC, TD);
                    T1a = VADD(TC, TD);
                    Ts = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
                    Tt = LD(&(Rm[0]), -ms, &(Rm[0]));
                    Tu = VCONJ(Tt);
                    Tv = VSUB(Ts, Tu);
                    T13 = VADD(Ts, Tu);
                    Tl = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
                    Tj = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
                    Tk = VCONJ(Tj);
                    Tm = VSUB(Tk, Tl);
                    T1c = VADD(Tk, Tl);
                    Tw = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
                    Tx = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
                    Ty = VCONJ(Tx);
                    Tz = VSUB(Tw, Ty);
                    T16 = VADD(Tw, Ty);
                    Tf = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
                    Tg = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
                    Th = VCONJ(Tg);
                    Ti = VSUB(Tf, Th);
                    T19 = VADD(Tf, Th);
                    TA = VSUB(Tv, Tz);
                    TJ = VSUB(TE, TI);
                    TK = VFNMS(LDK(KP951056516), TJ, VMUL(LDK(KP587785252), TA));
                    T1v = VFMA(LDK(KP951056516), TA, VMUL(LDK(KP587785252), TJ));
                    TW = VSUB(T9, Td);
                    TX = VSUB(Ti, Tm);
                    TY = VFNMS(LDK(KP951056516), TX, VMUL(LDK(KP587785252), TW));
                    T1x = VFMA(LDK(KP951056516), TW, VMUL(LDK(KP587785252), TX));
                    T1h = VADD(T2, T4);
                    T1i = VADD(TP, TR);
                    T1j = VSUB(T1h, T1i);
                    T2f = VADD(T1h, T1i);
                    TS = VSUB(TP, TR);
                    TM = VADD(Tv, Tz);
                    TN = VADD(TE, TI);
                    TT = VADD(TM, TN);
                    TO = VMUL(LDK(KP559016994), VSUB(TM, TN));
                    TU = VFNMS(LDK(KP250000000), TT, TS);
                    T5 = VSUB(T2, T4);
                    Te = VADD(T9, Td);
                    Tn = VADD(Ti, Tm);
                    To = VADD(Te, Tn);
                    Tp = VFNMS(LDK(KP250000000), To, T5);
                    Tq = VMUL(LDK(KP559016994), VSUB(Te, Tn));
                    T28 = VADD(T12, T13);
                    T29 = VADD(T15, T16);
                    T2a = VADD(T28, T29);
                    T2b = VADD(T19, T1a);
                    T2c = VADD(T1c, T1d);
                    T2d = VADD(T2b, T2c);
                    T2g = VADD(T2a, T2d);
                    T2k = VSUB(T2b, T2c);
                    T2j = VSUB(T28, T29);
                    T14 = VSUB(T12, T13);
                    T17 = VSUB(T15, T16);
                    T1k = VADD(T14, T17);
                    T1b = VSUB(T19, T1a);
                    T1e = VSUB(T1c, T1d);
                    T1l = VADD(T1b, T1e);
                    T18 = VSUB(T14, T17);
                    T1m = VADD(T1k, T1l);
                    T1f = VSUB(T1b, T1e);
               }
               {
                    /* Stage 2: twiddle multiplications, then stores. */
                    V T2L, T22, T1S, T26, T2m, T2G, T2s, T2A, T1q, T1U, T1C, T1M, T10, T2E, T1I;
                    V T2q, T1A, T2K, T20, T2w, T21, T1Q, T1R, T1P, T25, T1r, T1s, T2C, T2N, T1N;
                    V T2H, T2I, T2M, T1E, T1D, T1O, T1V, T2n, T2B, T24, T2o, T2t, T2u, T23, T1W;
                    T2L = VADD(T2f, T2g);
                    T21 = LDW(&(W[TWVL * 18]));
                    T22 = VZMUL(T21, VADD(T1j, T1m));
                    T1Q = VADD(T5, To);
                    T1R = VBYI(VADD(TS, TT));
                    T1P = LDW(&(W[TWVL * 28]));
                    T1S = VZMULI(T1P, VSUB(T1Q, T1R));
                    T25 = LDW(&(W[TWVL * 8]));
                    T26 = VZMULI(T25, VADD(T1Q, T1R));
                    {
                         V T2l, T2z, T2i, T2y, T2e, T2h, T27, T2F, T2r, T2x, T1g, T1K, T1p, T1L, T1n;
                         V T1o, T11, T1T, T1B, T1J, TL, T1G, TZ, T1H, Tr, TV, T1, T2D, T1F, T2p;
                         V T1w, T1Y, T1z, T1Z, T1u, T1y, T1t, T2J, T1X, T2v;
                         T2l = VBYI(VFMA(LDK(KP951056516), T2j, VMUL(LDK(KP587785252), T2k)));
                         T2z = VBYI(VFNMS(LDK(KP951056516), T2k, VMUL(LDK(KP587785252), T2j)));
                         T2e = VMUL(LDK(KP559016994), VSUB(T2a, T2d));
                         T2h = VFNMS(LDK(KP250000000), T2g, T2f);
                         T2i = VADD(T2e, T2h);
                         T2y = VSUB(T2h, T2e);
                         T27 = LDW(&(W[TWVL * 6]));
                         T2m = VZMUL(T27, VSUB(T2i, T2l));
                         T2F = LDW(&(W[TWVL * 22]));
                         T2G = VZMUL(T2F, VADD(T2z, T2y));
                         T2r = LDW(&(W[TWVL * 30]));
                         T2s = VZMUL(T2r, VADD(T2l, T2i));
                         T2x = LDW(&(W[TWVL * 14]));
                         T2A = VZMUL(T2x, VSUB(T2y, T2z));
                         T1g = VBYI(VFNMS(LDK(KP951056516), T1f, VMUL(LDK(KP587785252), T18)));
                         T1K = VBYI(VFMA(LDK(KP951056516), T18, VMUL(LDK(KP587785252), T1f)));
                         T1n = VFNMS(LDK(KP250000000), T1m, T1j);
                         T1o = VMUL(LDK(KP559016994), VSUB(T1k, T1l));
                         T1p = VSUB(T1n, T1o);
                         T1L = VADD(T1o, T1n);
                         T11 = LDW(&(W[TWVL * 2]));
                         T1q = VZMUL(T11, VADD(T1g, T1p));
                         T1T = LDW(&(W[TWVL * 26]));
                         T1U = VZMUL(T1T, VSUB(T1L, T1K));
                         T1B = LDW(&(W[TWVL * 34]));
                         T1C = VZMUL(T1B, VSUB(T1p, T1g));
                         T1J = LDW(&(W[TWVL * 10]));
                         T1M = VZMUL(T1J, VADD(T1K, T1L));
                         Tr = VSUB(Tp, Tq);
                         TL = VSUB(Tr, TK);
                         T1G = VADD(Tr, TK);
                         TV = VSUB(TO, TU);
                         TZ = VBYI(VSUB(TV, TY));
                         T1H = VBYI(VADD(TY, TV));
                         T1 = LDW(&(W[TWVL * 4]));
                         T10 = VZMULI(T1, VADD(TL, TZ));
                         T2D = LDW(&(W[TWVL * 24]));
                         T2E = VZMULI(T2D, VSUB(T1G, T1H));
                         T1F = LDW(&(W[TWVL * 12]));
                         T1I = VZMULI(T1F, VADD(T1G, T1H));
                         T2p = LDW(&(W[TWVL * 32]));
                         T2q = VZMULI(T2p, VSUB(TL, TZ));
                         T1u = VADD(Tq, Tp);
                         T1w = VSUB(T1u, T1v);
                         T1Y = VADD(T1u, T1v);
                         T1y = VADD(TO, TU);
                         T1z = VBYI(VADD(T1x, T1y));
                         T1Z = VBYI(VSUB(T1y, T1x));
                         T1t = LDW(&(W[TWVL * 36]));
                         T1A = VZMULI(T1t, VSUB(T1w, T1z));
                         T2J = LDW(&(W[0]));
                         T2K = VZMULI(T2J, VADD(T1w, T1z));
                         T1X = LDW(&(W[TWVL * 20]));
                         T20 = VZMULI(T1X, VSUB(T1Y, T1Z));
                         T2v = LDW(&(W[TWVL * 16]));
                         T2w = VZMULI(T2v, VADD(T1Y, T1Z));
                    }
                    /* Each twiddled pair yields a sum stored to Rp and a
                       conjugated difference stored to Rm. */
                    T1r = VADD(T10, T1q);
                    ST(&(Rp[WS(rs, 1)]), T1r, ms, &(Rp[WS(rs, 1)]));
                    T1s = VCONJ(VSUB(T1q, T10));
                    ST(&(Rm[WS(rs, 1)]), T1s, -ms, &(Rm[WS(rs, 1)]));
                    T2C = VCONJ(VSUB(T2A, T2w));
                    ST(&(Rm[WS(rs, 4)]), T2C, -ms, &(Rm[0]));
                    T2N = VCONJ(VSUB(T2L, T2K));
                    ST(&(Rm[0]), T2N, -ms, &(Rm[0]));
                    T1N = VADD(T1I, T1M);
                    ST(&(Rp[WS(rs, 3)]), T1N, ms, &(Rp[WS(rs, 1)]));
                    T2H = VADD(T2E, T2G);
                    ST(&(Rp[WS(rs, 6)]), T2H, ms, &(Rp[0]));
                    T2I = VCONJ(VSUB(T2G, T2E));
                    ST(&(Rm[WS(rs, 6)]), T2I, -ms, &(Rm[0]));
                    T2M = VADD(T2K, T2L);
                    ST(&(Rp[0]), T2M, ms, &(Rp[0]));
                    T1E = VCONJ(VSUB(T1C, T1A));
                    ST(&(Rm[WS(rs, 9)]), T1E, -ms, &(Rm[WS(rs, 1)]));
                    T1D = VADD(T1A, T1C);
                    ST(&(Rp[WS(rs, 9)]), T1D, ms, &(Rp[WS(rs, 1)]));
                    T1O = VCONJ(VSUB(T1M, T1I));
                    ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
                    T1V = VADD(T1S, T1U);
                    ST(&(Rp[WS(rs, 7)]), T1V, ms, &(Rp[WS(rs, 1)]));
                    T2n = VADD(T26, T2m);
                    ST(&(Rp[WS(rs, 2)]), T2n, ms, &(Rp[0]));
                    T2B = VADD(T2w, T2A);
                    ST(&(Rp[WS(rs, 4)]), T2B, ms, &(Rp[0]));
                    T24 = VCONJ(VSUB(T22, T20));
                    ST(&(Rm[WS(rs, 5)]), T24, -ms, &(Rm[WS(rs, 1)]));
                    T2o = VCONJ(VSUB(T2m, T26));
                    ST(&(Rm[WS(rs, 2)]), T2o, -ms, &(Rm[0]));
                    T2t = VADD(T2q, T2s);
                    ST(&(Rp[WS(rs, 8)]), T2t, ms, &(Rp[0]));
                    T2u = VCONJ(VSUB(T2s, T2q));
                    ST(&(Rm[WS(rs, 8)]), T2u, -ms, &(Rm[0]));
                    T23 = VADD(T20, T22);
                    ST(&(Rp[WS(rs, 5)]), T23, ms, &(Rp[WS(rs, 1)]));
                    T1W = VCONJ(VSUB(T1U, T1S));
                    ST(&(Rm[WS(rs, 7)]), T1W, -ms, &(Rm[WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}
|
||||
|
||||
/*
 * Twiddle-instruction table for the size-20 codelet; it is referenced by
 * `desc` below.  It holds one VTW(1, k) entry for each k = 1 .. 19 and is
 * closed by a { TW_NEXT, VL, 0 } entry.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * each VTW entry describes one vector of twiddle factors -- confirm
 * against the SIMD headers.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
VTW(1, 16),
|
||||
VTW(1, 17),
|
||||
VTW(1, 18),
|
||||
VTW(1, 19),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor passed to X(khc2c_register) below: the literal 20,
 * the codelet name string, the twiddle-instruction table and &GENUS.
 * NOTE(review): the trailing { 131, 50, 12, 0 } presumably holds the
 * operation counts (additions, multiplications, fused multiply/adds,
 * other) from this codelet's generated header comment -- that header is
 * not visible here, confirm.
 */
static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cbdftv_20"), twinstr, &GENUS, { 131, 50, 12, 0 } };
|
||||
|
||||
/*
 * Registration entry point for the size-20 codelet.  It hands the planner
 * `p` the static function hc2cbdftv_20 together with its descriptor `desc`,
 * registering it with kind HC2C_VIA_DFT.  The exported symbol name is
 * produced by the XSIMD() macro.
 */
void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
872
fftw-3.3.10/rdft/simd/common/hc2cbdftv_32.c
Normal file
872
fftw-3.3.10/rdft/simd/common/hc2cbdftv_32.c
Normal file
@@ -0,0 +1,872 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 249 FP additions, 192 FP multiplications,
|
||||
* (or, 119 additions, 62 multiplications, 130 fused multiply/add),
|
||||
* 143 stack variables, 7 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_32 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (see the enclosing #if).
 *
 * Each loop iteration:
 *   - loads Rp[WS(rs, k)] and Rm[WS(rs, k)] for k = 0 .. 15 with LD;
 *   - combines them with conjugating add/subtract operations
 *     (VFMACONJ / VFNMSCONJ / VFMSCONJ) and FMA butterflies scaled by the
 *     constants declared below;
 *   - multiplies the results by twiddle vectors read with LDW from W at
 *     offsets TWVL * 0, TWVL * 2, ..., TWVL * 60 (VZMUL / VZMULI);
 *   - stores back in place: for each pair (a, b), VADD(a, b) goes to
 *     Rp[WS(rs, k)] and VCONJ(VSUB(b, a)) goes to Rm[WS(rs, k)].
 *
 * Parameters:
 *   Rp, Rm - data arrays, read and overwritten in place; Rp moves forward
 *            and Rm backward by VL * ms per iteration.
 *   Ip, Im - advanced in the loop header alongside Rp / Rm but never
 *            dereferenced in this function.
 *   W      - twiddle table; offset by (mb - 1) * ((TWVL / VL) * 62) on
 *            entry and advanced by TWVL * 62 per iteration.
 *   rs     - stride handed to WS() when indexing Rp / Rm.
 *   mb, me - loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms     - stride passed to LD / ST (negated for Rm) and used to advance
 *            the data pointers.
 */
static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* cos(pi/16) */
DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
/* cos(3*pi/16) */
DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
/* tan(pi/16) */
DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
|
||||
/* tan(3*pi/16) */
DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
|
||||
/* cos(pi/8) */
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
/* sqrt(2)/2 = cos(pi/4) */
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
/* tan(pi/8) = sqrt(2) - 1 */
DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
{
|
||||
INT m;
|
||||
/*
 * One pass handles VL values of m.  Rp / Ip walk upward and Rm / Im walk
 * downward by VL * ms; W advances by TWVL * 62 per pass.
 * NOTE(review): MAKE_VOLATILE_STRIDE is defined elsewhere; its purpose is
 * not visible here.
 */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
|
||||
V Ts, T1S, T3p, T45, T3A, T48, T1b, T1V, T1o, T2G, T2o, T2Y, T2z, T31, T1L;
|
||||
V T2H, T2J, T2K, TJ, T1c, T3D, T46, T10, T1d, T2r, T2A, T3w, T49, T1D, T1M;
|
||||
V T2u, T2B;
|
||||
{
|
||||
V T4, T1i, T15, T1j, Tb, T1m, T16, T1l, T1G, T1F, Tj, T3m, T18, T1J, T1I;
|
||||
V Tq, T3n, T19, T2, T3, T13, T14, T5, T6, T7, T8, T9, Ta, Tf, Ti;
|
||||
V Td, Te, Tg, Th, Tm, Tp, Tk, Tl, Tn, To, Tc, Tr, T3l, T3o, T3y;
|
||||
V T3z, T17, T1a, T1k, T1n, T2m, T2n, T2x, T2y, T1H, T1K;
|
||||
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = VFNMSCONJ(T3, T2);
|
||||
T1i = VFMACONJ(T3, T2);
|
||||
T13 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
|
||||
T14 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T15 = VFNMSCONJ(T14, T13);
|
||||
T1j = VFMACONJ(T14, T13);
|
||||
T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T7 = VFNMSCONJ(T6, T5);
|
||||
T8 = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
|
||||
T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ta = VFMSCONJ(T9, T8);
|
||||
Tb = VADD(T7, Ta);
|
||||
T1m = VFMACONJ(T9, T8);
|
||||
T16 = VSUB(T7, Ta);
|
||||
T1l = VFMACONJ(T6, T5);
|
||||
Td = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
|
||||
Te = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tf = VFNMSCONJ(Te, Td);
|
||||
T1G = VFMACONJ(Te, Td);
|
||||
Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ti = VFNMSCONJ(Th, Tg);
|
||||
T1F = VFMACONJ(Th, Tg);
|
||||
Tj = VFMA(LDK(KP414213562), Ti, Tf);
|
||||
T3m = VSUB(T1F, T1G);
|
||||
T18 = VFNMS(LDK(KP414213562), Tf, Ti);
|
||||
Tk = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
Tl = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tm = VFNMSCONJ(Tl, Tk);
|
||||
T1J = VFMACONJ(Tl, Tk);
|
||||
Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
|
||||
To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tp = VFMSCONJ(To, Tn);
|
||||
T1I = VFMACONJ(To, Tn);
|
||||
Tq = VFNMS(LDK(KP414213562), Tp, Tm);
|
||||
T3n = VSUB(T1I, T1J);
|
||||
T19 = VFMA(LDK(KP414213562), Tm, Tp);
|
||||
Tc = VFNMS(LDK(KP707106781), Tb, T4);
|
||||
Tr = VSUB(Tj, Tq);
|
||||
Ts = VFMA(LDK(KP923879532), Tr, Tc);
|
||||
T1S = VFNMS(LDK(KP923879532), Tr, Tc);
|
||||
T3l = VSUB(T1i, T1j);
|
||||
T3o = VADD(T3m, T3n);
|
||||
T3p = VFMA(LDK(KP707106781), T3o, T3l);
|
||||
T45 = VFNMS(LDK(KP707106781), T3o, T3l);
|
||||
T3y = VSUB(T1l, T1m);
|
||||
T3z = VSUB(T3m, T3n);
|
||||
T3A = VFMA(LDK(KP707106781), T3z, T3y);
|
||||
T48 = VFNMS(LDK(KP707106781), T3z, T3y);
|
||||
T17 = VFNMS(LDK(KP707106781), T16, T15);
|
||||
T1a = VSUB(T18, T19);
|
||||
T1b = VFNMS(LDK(KP923879532), T1a, T17);
|
||||
T1V = VFMA(LDK(KP923879532), T1a, T17);
|
||||
T1k = VADD(T1i, T1j);
|
||||
T1n = VADD(T1l, T1m);
|
||||
T1o = VSUB(T1k, T1n);
|
||||
T2G = VADD(T1k, T1n);
|
||||
T2m = VFMA(LDK(KP707106781), Tb, T4);
|
||||
T2n = VADD(T18, T19);
|
||||
T2o = VFNMS(LDK(KP923879532), T2n, T2m);
|
||||
T2Y = VFMA(LDK(KP923879532), T2n, T2m);
|
||||
T2x = VFMA(LDK(KP707106781), T16, T15);
|
||||
T2y = VADD(Tj, Tq);
|
||||
T2z = VFNMS(LDK(KP923879532), T2y, T2x);
|
||||
T31 = VFMA(LDK(KP923879532), T2y, T2x);
|
||||
T1H = VADD(T1F, T1G);
|
||||
T1K = VADD(T1I, T1J);
|
||||
T1L = VSUB(T1H, T1K);
|
||||
T2H = VADD(T1H, T1K);
|
||||
}
|
||||
{
|
||||
V Tv, T3q, TG, T1r, TM, T3t, TX, T1y, TC, T3r, TH, T1u, TT, T3u, TY;
|
||||
V T1B, Tt, Tu, T1p, TE, TF, T1q, TK, TL, T1w, TV, TW, T1x, Ty, T1s;
|
||||
V TB, T1t, Tw, Tx, Tz, TA, TP, T1z, TS, T1A, TN, TO, TQ, TR, TD;
|
||||
V TI, T3B, T3C, TU, TZ, T2p, T2q, T3s, T3v, T1v, T1C, T2s, T2t;
|
||||
Tt = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tu = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
|
||||
T1p = VFMACONJ(Tu, Tt);
|
||||
TE = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TF = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
T1q = VFMACONJ(TF, TE);
|
||||
Tv = VFNMSCONJ(Tu, Tt);
|
||||
T3q = VSUB(T1p, T1q);
|
||||
TG = VFNMSCONJ(TF, TE);
|
||||
T1r = VADD(T1p, T1q);
|
||||
TK = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TL = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T1w = VFMACONJ(TL, TK);
|
||||
TV = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TW = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
|
||||
T1x = VFMACONJ(TW, TV);
|
||||
TM = VFMSCONJ(TL, TK);
|
||||
T3t = VSUB(T1w, T1x);
|
||||
TX = VFNMSCONJ(TW, TV);
|
||||
T1y = VADD(T1w, T1x);
|
||||
Tw = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tx = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
|
||||
Ty = VFNMSCONJ(Tx, Tw);
|
||||
T1s = VFMACONJ(Tx, Tw);
|
||||
Tz = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TA = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
TB = VFMSCONJ(TA, Tz);
|
||||
T1t = VFMACONJ(TA, Tz);
|
||||
TC = VADD(Ty, TB);
|
||||
T3r = VSUB(T1s, T1t);
|
||||
TH = VSUB(Ty, TB);
|
||||
T1u = VADD(T1s, T1t);
|
||||
TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TO = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
|
||||
TP = VFNMSCONJ(TO, TN);
|
||||
T1z = VFMACONJ(TO, TN);
|
||||
TQ = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TR = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
TS = VFMSCONJ(TR, TQ);
|
||||
T1A = VFMACONJ(TR, TQ);
|
||||
TT = VADD(TP, TS);
|
||||
T3u = VSUB(T1A, T1z);
|
||||
TY = VSUB(TS, TP);
|
||||
T1B = VADD(T1z, T1A);
|
||||
T2J = VADD(T1r, T1u);
|
||||
T2K = VADD(T1y, T1B);
|
||||
TD = VFNMS(LDK(KP707106781), TC, Tv);
|
||||
TI = VFNMS(LDK(KP707106781), TH, TG);
|
||||
TJ = VFMA(LDK(KP668178637), TI, TD);
|
||||
T1c = VFNMS(LDK(KP668178637), TD, TI);
|
||||
T3B = VFMA(LDK(KP414213562), T3q, T3r);
|
||||
T3C = VFMA(LDK(KP414213562), T3t, T3u);
|
||||
T3D = VSUB(T3B, T3C);
|
||||
T46 = VADD(T3B, T3C);
|
||||
TU = VFNMS(LDK(KP707106781), TT, TM);
|
||||
TZ = VFMA(LDK(KP707106781), TY, TX);
|
||||
T10 = VFNMS(LDK(KP668178637), TZ, TU);
|
||||
T1d = VFMA(LDK(KP668178637), TU, TZ);
|
||||
T2p = VFMA(LDK(KP707106781), TH, TG);
|
||||
T2q = VFMA(LDK(KP707106781), TC, Tv);
|
||||
T2r = VFMA(LDK(KP198912367), T2q, T2p);
|
||||
T2A = VFNMS(LDK(KP198912367), T2p, T2q);
|
||||
T3s = VFNMS(LDK(KP414213562), T3r, T3q);
|
||||
T3v = VFNMS(LDK(KP414213562), T3u, T3t);
|
||||
T3w = VADD(T3s, T3v);
|
||||
T49 = VSUB(T3s, T3v);
|
||||
T1v = VSUB(T1r, T1u);
|
||||
T1C = VSUB(T1y, T1B);
|
||||
T1D = VADD(T1v, T1C);
|
||||
T1M = VSUB(T1v, T1C);
|
||||
T2s = VFNMS(LDK(KP707106781), TY, TX);
|
||||
T2t = VFMA(LDK(KP707106781), TT, TM);
|
||||
T2u = VFNMS(LDK(KP198912367), T2t, T2s);
|
||||
T2B = VFMA(LDK(KP198912367), T2s, T2t);
|
||||
}
|
||||
{
|
||||
V T3f, T38, T4p, T4v, T3T, T3Z, T2a, T2i, T4b, T4h, T1O, T20, T2M, T2U, T3F;
|
||||
V T3L, T1g, T3X, T2g, T3J, T2E, T4l, T2S, T4f, T1Y, T4t, T26, T43, T34, T3P;
|
||||
V T3e, T3j, T36, T37, T35, T4n, T4o, T4m, T4u, T3R, T3S, T3Q, T3Y, T28, T29;
|
||||
V T27, T2h, T47, T4a, T44, T4g, T1E, T1N, T1h, T1Z;
|
||||
T36 = VADD(T2G, T2H);
|
||||
T37 = VADD(T2J, T2K);
|
||||
T3f = VADD(T36, T37);
|
||||
T35 = LDW(&(W[TWVL * 30]));
|
||||
T38 = VZMUL(T35, VSUB(T36, T37));
|
||||
T4n = VFMA(LDK(KP923879532), T46, T45);
|
||||
T4o = VFNMS(LDK(KP923879532), T49, T48);
|
||||
T4m = LDW(&(W[TWVL * 10]));
|
||||
T4p = VZMUL(T4m, VFNMSI(T4o, T4n));
|
||||
T4u = LDW(&(W[TWVL * 50]));
|
||||
T4v = VZMUL(T4u, VFMAI(T4o, T4n));
|
||||
T3R = VFMA(LDK(KP923879532), T3w, T3p);
|
||||
T3S = VFMA(LDK(KP923879532), T3D, T3A);
|
||||
T3Q = LDW(&(W[TWVL * 58]));
|
||||
T3T = VZMUL(T3Q, VFNMSI(T3S, T3R));
|
||||
T3Y = LDW(&(W[TWVL * 2]));
|
||||
T3Z = VZMUL(T3Y, VFMAI(T3S, T3R));
|
||||
T28 = VFMA(LDK(KP707106781), T1D, T1o);
|
||||
T29 = VFMA(LDK(KP707106781), T1M, T1L);
|
||||
T27 = LDW(&(W[TWVL * 6]));
|
||||
T2a = VZMUL(T27, VFMAI(T29, T28));
|
||||
T2h = LDW(&(W[TWVL * 54]));
|
||||
T2i = VZMUL(T2h, VFNMSI(T29, T28));
|
||||
T47 = VFNMS(LDK(KP923879532), T46, T45);
|
||||
T4a = VFMA(LDK(KP923879532), T49, T48);
|
||||
T44 = LDW(&(W[TWVL * 18]));
|
||||
T4b = VZMUL(T44, VFMAI(T4a, T47));
|
||||
T4g = LDW(&(W[TWVL * 42]));
|
||||
T4h = VZMUL(T4g, VFNMSI(T4a, T47));
|
||||
T1E = VFNMS(LDK(KP707106781), T1D, T1o);
|
||||
T1N = VFNMS(LDK(KP707106781), T1M, T1L);
|
||||
T1h = LDW(&(W[TWVL * 22]));
|
||||
T1O = VZMUL(T1h, VFNMSI(T1N, T1E));
|
||||
T1Z = LDW(&(W[TWVL * 38]));
|
||||
T20 = VZMUL(T1Z, VFMAI(T1N, T1E));
|
||||
{
|
||||
V T2I, T2L, T2F, T2T, T3x, T3E, T3k, T3K, T12, T2e, T1f, T2f, T11, T1e, T1;
|
||||
V T3W, T2d, T3I, T2w, T2Q, T2D, T2R, T2v, T2C, T2l, T4k, T2P, T4e, T1U, T24;
|
||||
V T1X, T25, T1T, T1W, T1R, T4s, T23, T42, T30, T3c, T33, T3d, T2Z, T32, T2X;
|
||||
V T3O, T3b, T3i;
|
||||
T2I = VSUB(T2G, T2H);
|
||||
T2L = VSUB(T2J, T2K);
|
||||
T2F = LDW(&(W[TWVL * 46]));
|
||||
T2M = VZMUL(T2F, VFNMSI(T2L, T2I));
|
||||
T2T = LDW(&(W[TWVL * 14]));
|
||||
T2U = VZMUL(T2T, VFMAI(T2L, T2I));
|
||||
T3x = VFNMS(LDK(KP923879532), T3w, T3p);
|
||||
T3E = VFNMS(LDK(KP923879532), T3D, T3A);
|
||||
T3k = LDW(&(W[TWVL * 26]));
|
||||
T3F = VZMUL(T3k, VFNMSI(T3E, T3x));
|
||||
T3K = LDW(&(W[TWVL * 34]));
|
||||
T3L = VZMUL(T3K, VFMAI(T3E, T3x));
|
||||
T11 = VADD(TJ, T10);
|
||||
T12 = VFNMS(LDK(KP831469612), T11, Ts);
|
||||
T2e = VFMA(LDK(KP831469612), T11, Ts);
|
||||
T1e = VADD(T1c, T1d);
|
||||
T1f = VFNMS(LDK(KP831469612), T1e, T1b);
|
||||
T2f = VFMA(LDK(KP831469612), T1e, T1b);
|
||||
T1 = LDW(&(W[TWVL * 24]));
|
||||
T1g = VZMULI(T1, VFMAI(T1f, T12));
|
||||
T3W = LDW(&(W[TWVL * 4]));
|
||||
T3X = VZMULI(T3W, VFNMSI(T2f, T2e));
|
||||
T2d = LDW(&(W[TWVL * 56]));
|
||||
T2g = VZMULI(T2d, VFMAI(T2f, T2e));
|
||||
T3I = LDW(&(W[TWVL * 36]));
|
||||
T3J = VZMULI(T3I, VFNMSI(T1f, T12));
|
||||
T2v = VSUB(T2r, T2u);
|
||||
T2w = VFMA(LDK(KP980785280), T2v, T2o);
|
||||
T2Q = VFNMS(LDK(KP980785280), T2v, T2o);
|
||||
T2C = VSUB(T2A, T2B);
|
||||
T2D = VFNMS(LDK(KP980785280), T2C, T2z);
|
||||
T2R = VFMA(LDK(KP980785280), T2C, T2z);
|
||||
T2l = LDW(&(W[TWVL * 48]));
|
||||
T2E = VZMULI(T2l, VFMAI(T2D, T2w));
|
||||
T4k = LDW(&(W[TWVL * 12]));
|
||||
T4l = VZMULI(T4k, VFNMSI(T2D, T2w));
|
||||
T2P = LDW(&(W[TWVL * 16]));
|
||||
T2S = VZMULI(T2P, VFMAI(T2R, T2Q));
|
||||
T4e = LDW(&(W[TWVL * 44]));
|
||||
T4f = VZMULI(T4e, VFNMSI(T2R, T2Q));
|
||||
T1T = VSUB(T1d, T1c);
|
||||
T1U = VFNMS(LDK(KP831469612), T1T, T1S);
|
||||
T24 = VFMA(LDK(KP831469612), T1T, T1S);
|
||||
T1W = VSUB(TJ, T10);
|
||||
T1X = VFNMS(LDK(KP831469612), T1W, T1V);
|
||||
T25 = VFMA(LDK(KP831469612), T1W, T1V);
|
||||
T1R = LDW(&(W[TWVL * 40]));
|
||||
T1Y = VZMULI(T1R, VFMAI(T1X, T1U));
|
||||
T4s = LDW(&(W[TWVL * 52]));
|
||||
T4t = VZMULI(T4s, VFNMSI(T25, T24));
|
||||
T23 = LDW(&(W[TWVL * 8]));
|
||||
T26 = VZMULI(T23, VFMAI(T25, T24));
|
||||
T42 = LDW(&(W[TWVL * 20]));
|
||||
T43 = VZMULI(T42, VFNMSI(T1X, T1U));
|
||||
T2Z = VADD(T2A, T2B);
|
||||
T30 = VFNMS(LDK(KP980785280), T2Z, T2Y);
|
||||
T3c = VFMA(LDK(KP980785280), T2Z, T2Y);
|
||||
T32 = VADD(T2r, T2u);
|
||||
T33 = VFNMS(LDK(KP980785280), T32, T31);
|
||||
T3d = VFMA(LDK(KP980785280), T32, T31);
|
||||
T2X = LDW(&(W[TWVL * 32]));
|
||||
T34 = VZMULI(T2X, VFMAI(T33, T30));
|
||||
T3O = LDW(&(W[TWVL * 60]));
|
||||
T3P = VZMULI(T3O, VFNMSI(T3d, T3c));
|
||||
T3b = LDW(&(W[0]));
|
||||
T3e = VZMULI(T3b, VFMAI(T3d, T3c));
|
||||
T3i = LDW(&(W[TWVL * 28]));
|
||||
T3j = VZMULI(T3i, VFNMSI(T33, T30));
|
||||
}
|
||||
{
|
||||
V T1P, T4w, T2j, T4c, T4x, T1Q, T4d, T2k, T21, T4q, T2b, T4i, T4r, T22, T4j;
|
||||
V T2c, T2N, T40, T3g, T3G, T41, T2O, T3H, T3h, T2V, T3U, T39, T3M, T3V, T2W;
|
||||
V T3N, T3a;
|
||||
T1P = VADD(T1g, T1O);
|
||||
ST(&(Rp[WS(rs, 6)]), T1P, ms, &(Rp[0]));
|
||||
T4w = VADD(T4t, T4v);
|
||||
ST(&(Rp[WS(rs, 13)]), T4w, ms, &(Rp[WS(rs, 1)]));
|
||||
T2j = VADD(T2g, T2i);
|
||||
ST(&(Rp[WS(rs, 14)]), T2j, ms, &(Rp[0]));
|
||||
T4c = VADD(T43, T4b);
|
||||
ST(&(Rp[WS(rs, 5)]), T4c, ms, &(Rp[WS(rs, 1)]));
|
||||
T4x = VCONJ(VSUB(T4v, T4t));
|
||||
ST(&(Rm[WS(rs, 13)]), T4x, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1Q = VCONJ(VSUB(T1O, T1g));
|
||||
ST(&(Rm[WS(rs, 6)]), T1Q, -ms, &(Rm[0]));
|
||||
T4d = VCONJ(VSUB(T4b, T43));
|
||||
ST(&(Rm[WS(rs, 5)]), T4d, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2k = VCONJ(VSUB(T2i, T2g));
|
||||
ST(&(Rm[WS(rs, 14)]), T2k, -ms, &(Rm[0]));
|
||||
T21 = VADD(T1Y, T20);
|
||||
ST(&(Rp[WS(rs, 10)]), T21, ms, &(Rp[0]));
|
||||
T4q = VADD(T4l, T4p);
|
||||
ST(&(Rp[WS(rs, 3)]), T4q, ms, &(Rp[WS(rs, 1)]));
|
||||
T2b = VADD(T26, T2a);
|
||||
ST(&(Rp[WS(rs, 2)]), T2b, ms, &(Rp[0]));
|
||||
T4i = VADD(T4f, T4h);
|
||||
ST(&(Rp[WS(rs, 11)]), T4i, ms, &(Rp[WS(rs, 1)]));
|
||||
T4r = VCONJ(VSUB(T4p, T4l));
|
||||
ST(&(Rm[WS(rs, 3)]), T4r, -ms, &(Rm[WS(rs, 1)]));
|
||||
T22 = VCONJ(VSUB(T20, T1Y));
|
||||
ST(&(Rm[WS(rs, 10)]), T22, -ms, &(Rm[0]));
|
||||
T4j = VCONJ(VSUB(T4h, T4f));
|
||||
ST(&(Rm[WS(rs, 11)]), T4j, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2c = VCONJ(VSUB(T2a, T26));
|
||||
ST(&(Rm[WS(rs, 2)]), T2c, -ms, &(Rm[0]));
|
||||
T2N = VADD(T2E, T2M);
|
||||
ST(&(Rp[WS(rs, 12)]), T2N, ms, &(Rp[0]));
|
||||
T40 = VADD(T3X, T3Z);
|
||||
ST(&(Rp[WS(rs, 1)]), T40, ms, &(Rp[WS(rs, 1)]));
|
||||
T3g = VADD(T3e, T3f);
|
||||
ST(&(Rp[0]), T3g, ms, &(Rp[0]));
|
||||
T3G = VADD(T3j, T3F);
|
||||
ST(&(Rp[WS(rs, 7)]), T3G, ms, &(Rp[WS(rs, 1)]));
|
||||
T41 = VCONJ(VSUB(T3Z, T3X));
|
||||
ST(&(Rm[WS(rs, 1)]), T41, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2O = VCONJ(VSUB(T2M, T2E));
|
||||
ST(&(Rm[WS(rs, 12)]), T2O, -ms, &(Rm[0]));
|
||||
T3H = VCONJ(VSUB(T3F, T3j));
|
||||
ST(&(Rm[WS(rs, 7)]), T3H, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3h = VCONJ(VSUB(T3f, T3e));
|
||||
ST(&(Rm[0]), T3h, -ms, &(Rm[0]));
|
||||
T2V = VADD(T2S, T2U);
|
||||
ST(&(Rp[WS(rs, 4)]), T2V, ms, &(Rp[0]));
|
||||
T3U = VADD(T3P, T3T);
|
||||
ST(&(Rp[WS(rs, 15)]), T3U, ms, &(Rp[WS(rs, 1)]));
|
||||
T39 = VADD(T34, T38);
|
||||
ST(&(Rp[WS(rs, 8)]), T39, ms, &(Rp[0]));
|
||||
T3M = VADD(T3J, T3L);
|
||||
ST(&(Rp[WS(rs, 9)]), T3M, ms, &(Rp[WS(rs, 1)]));
|
||||
T3V = VCONJ(VSUB(T3T, T3P));
|
||||
ST(&(Rm[WS(rs, 15)]), T3V, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2W = VCONJ(VSUB(T2U, T2S));
|
||||
ST(&(Rm[WS(rs, 4)]), T2W, -ms, &(Rm[0]));
|
||||
T3N = VCONJ(VSUB(T3L, T3J));
|
||||
ST(&(Rm[WS(rs, 9)]), T3N, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3a = VCONJ(VSUB(T38, T34));
|
||||
ST(&(Rm[WS(rs, 8)]), T3a, -ms, &(Rm[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table for the size-32 codelet (FMA branch); it is
 * referenced by `desc` below.  It holds one VTW(1, k) entry for each
 * k = 1 .. 31 and is closed by a { TW_NEXT, VL, 0 } entry.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * each VTW entry describes one vector of twiddle factors -- confirm
 * against the SIMD headers.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
VTW(1, 16),
|
||||
VTW(1, 17),
|
||||
VTW(1, 18),
|
||||
VTW(1, 19),
|
||||
VTW(1, 20),
|
||||
VTW(1, 21),
|
||||
VTW(1, 22),
|
||||
VTW(1, 23),
|
||||
VTW(1, 24),
|
||||
VTW(1, 25),
|
||||
VTW(1, 26),
|
||||
VTW(1, 27),
|
||||
VTW(1, 28),
|
||||
VTW(1, 29),
|
||||
VTW(1, 30),
|
||||
VTW(1, 31),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor passed to X(khc2c_register) below: the literal 32
 * (matching the generator's "-n 32"), the codelet name string, the
 * twiddle-instruction table and &GENUS.  The first three values of the
 * trailing { 119, 62, 130, 0 } match the counts in this variant's
 * generated header comment: 119 additions, 62 multiplications and
 * 130 fused multiply/adds.
 * NOTE(review): the meaning of the fourth value (0) is not shown in this
 * file.
 */
static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cbdftv_32"), twinstr, &GENUS, { 119, 62, 130, 0 } };
|
||||
|
||||
/*
 * Registration entry point for the size-32 codelet (FMA branch).  It hands
 * the planner `p` the static function hc2cbdftv_32 together with its
 * descriptor `desc`, registering it with kind HC2C_VIA_DFT.  The exported
 * symbol name is produced by the XSIMD() macro.
 */
void XSIMD(codelet_hc2cbdftv_32) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 249 FP additions, 104 FP multiplications,
|
||||
* (or, 233 additions, 88 multiplications, 16 fused multiply/add),
|
||||
* 161 stack variables, 7 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_32 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * Each loop iteration:
 *   - loads Rp[WS(rs, k)] and Rm[WS(rs, k)] for k = 0 .. 15 with LD and
 *     conjugates every Rm value with VCONJ;
 *   - combines them with VADD / VSUB butterflies and rotations built from
 *     the sine/cosine constants declared below;
 *   - multiplies the results by twiddle vectors read with LDW from W at
 *     offsets TWVL * 0, TWVL * 2, ..., TWVL * 60 (VZMUL / VZMULI);
 *   - stores back in place: for each pair (a, b), VADD(a, b) goes to
 *     Rp[WS(rs, k)] and VCONJ(VSUB(b, a)) goes to Rm[WS(rs, k)].
 *
 * Parameters:
 *   Rp, Rm - data arrays, read and overwritten in place; Rp moves forward
 *            and Rm backward by VL * ms per iteration.
 *   Ip, Im - advanced in the loop header alongside Rp / Rm but never
 *            dereferenced in this function.
 *   W      - twiddle table; offset by (mb - 1) * ((TWVL / VL) * 62) on
 *            entry and advanced by TWVL * 62 per iteration.
 *   rs     - stride handed to WS() when indexing Rp / Rm.
 *   mb, me - loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms     - stride passed to LD / ST (negated for Rm) and used to advance
 *            the data pointers.
 */
static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* sin(pi/16) */
DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
|
||||
/* cos(pi/16) */
DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
/* sin(3*pi/16) */
DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
|
||||
/* cos(3*pi/16) */
DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
/* cos(pi/8) */
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
/* sin(pi/8) */
DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
/* sqrt(2)/2 = cos(pi/4) */
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/*
 * One pass handles VL values of m.  Rp / Ip walk upward and Rm / Im walk
 * downward by VL * ms; W advances by TWVL * 62 per pass.
 * NOTE(review): MAKE_VOLATILE_STRIDE is defined elsewhere; its purpose is
 * not visible here.
 */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
|
||||
V T1W, T21, Tf, T2c, T1t, T2r, T3T, T4m, Ty, T2q, T3P, T4n, T1n, T2d, T1T;
|
||||
V T22, T1E, T24, T3I, T4p, TU, T2n, T1i, T2h, T1L, T25, T3L, T4q, T1f, T2o;
|
||||
V T1j, T2k;
|
||||
{
|
||||
V T2, T4, T1Z, T1p, T1r, T20, T9, T1U, Td, T1V, T3, T1q, T6, T8, T7;
|
||||
V Tc, Tb, Ta, T5, Te, T1o, T1s, T3R, T3S, Tj, T1N, Tw, T1Q, Tn, T1O;
|
||||
V Ts, T1R, Tg, Ti, Th, Tv, Tu, Tt, Tk, Tm, Tl, Tp, Tr, Tq, To;
|
||||
V Tx, T3N, T3O, T1l, T1m, T1P, T1S;
|
||||
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = VCONJ(T3);
|
||||
T1Z = VADD(T2, T4);
|
||||
T1p = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
|
||||
T1q = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T1r = VCONJ(T1q);
|
||||
T20 = VADD(T1p, T1r);
|
||||
T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T7 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T8 = VCONJ(T7);
|
||||
T9 = VSUB(T6, T8);
|
||||
T1U = VADD(T6, T8);
|
||||
Tc = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
|
||||
Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tb = VCONJ(Ta);
|
||||
Td = VSUB(Tb, Tc);
|
||||
T1V = VADD(Tb, Tc);
|
||||
T1W = VSUB(T1U, T1V);
|
||||
T21 = VSUB(T1Z, T20);
|
||||
T5 = VSUB(T2, T4);
|
||||
Te = VMUL(LDK(KP707106781), VADD(T9, Td));
|
||||
Tf = VSUB(T5, Te);
|
||||
T2c = VADD(T5, Te);
|
||||
T1o = VMUL(LDK(KP707106781), VSUB(T9, Td));
|
||||
T1s = VSUB(T1p, T1r);
|
||||
T1t = VSUB(T1o, T1s);
|
||||
T2r = VADD(T1s, T1o);
|
||||
T3R = VADD(T1Z, T20);
|
||||
T3S = VADD(T1U, T1V);
|
||||
T3T = VSUB(T3R, T3S);
|
||||
T4m = VADD(T3R, T3S);
|
||||
Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ti = VCONJ(Th);
|
||||
Tj = VSUB(Tg, Ti);
|
||||
T1N = VADD(Tg, Ti);
|
||||
Tv = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
|
||||
Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tu = VCONJ(Tt);
|
||||
Tw = VSUB(Tu, Tv);
|
||||
T1Q = VADD(Tu, Tv);
|
||||
Tk = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
|
||||
Tl = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tm = VCONJ(Tl);
|
||||
Tn = VSUB(Tk, Tm);
|
||||
T1O = VADD(Tk, Tm);
|
||||
Tp = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
Tq = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tr = VCONJ(Tq);
|
||||
Ts = VSUB(Tp, Tr);
|
||||
T1R = VADD(Tp, Tr);
|
||||
To = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn));
|
||||
Tx = VFNMS(LDK(KP382683432), Tw, VMUL(LDK(KP923879532), Ts));
|
||||
Ty = VSUB(To, Tx);
|
||||
T2q = VADD(To, Tx);
|
||||
T3N = VADD(T1N, T1O);
|
||||
T3O = VADD(T1Q, T1R);
|
||||
T3P = VSUB(T3N, T3O);
|
||||
T4n = VADD(T3N, T3O);
|
||||
T1l = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj));
|
||||
T1m = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), Ts));
|
||||
T1n = VSUB(T1l, T1m);
|
||||
T2d = VADD(T1l, T1m);
|
||||
T1P = VSUB(T1N, T1O);
|
||||
T1S = VSUB(T1Q, T1R);
|
||||
T1T = VMUL(LDK(KP707106781), VSUB(T1P, T1S));
|
||||
T22 = VMUL(LDK(KP707106781), VADD(T1P, T1S));
|
||||
}
|
||||
{
|
||||
V TD, T1B, TR, T1y, TH, T1C, TM, T1z, TA, TC, TB, TO, TQ, TP, TG;
|
||||
V TF, TE, TJ, TL, TK, T1A, T1D, T3G, T3H, TN, T2f, TT, T2g, TI, TS;
|
||||
V TY, T1I, T1c, T1F, T12, T1J, T17, T1G, TV, TX, TW, T1b, T1a, T19, T11;
|
||||
V T10, TZ, T14, T16, T15, T1H, T1K, T3J, T3K, T18, T2i, T1e, T2j, T13, T1d;
|
||||
TA = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TB = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
|
||||
TC = VCONJ(TB);
|
||||
TD = VSUB(TA, TC);
|
||||
T1B = VADD(TA, TC);
|
||||
TO = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TP = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
|
||||
TQ = VCONJ(TP);
|
||||
TR = VSUB(TO, TQ);
|
||||
T1y = VADD(TO, TQ);
|
||||
TG = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
TF = VCONJ(TE);
|
||||
TH = VSUB(TF, TG);
|
||||
T1C = VADD(TF, TG);
|
||||
TJ = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
TL = VCONJ(TK);
|
||||
TM = VSUB(TJ, TL);
|
||||
T1z = VADD(TJ, TL);
|
||||
T1A = VSUB(T1y, T1z);
|
||||
T1D = VSUB(T1B, T1C);
|
||||
T1E = VFNMS(LDK(KP382683432), T1D, VMUL(LDK(KP923879532), T1A));
|
||||
T24 = VFMA(LDK(KP382683432), T1A, VMUL(LDK(KP923879532), T1D));
|
||||
T3G = VADD(T1y, T1z);
|
||||
T3H = VADD(T1B, T1C);
|
||||
T3I = VSUB(T3G, T3H);
|
||||
T4p = VADD(T3G, T3H);
|
||||
TI = VMUL(LDK(KP707106781), VSUB(TD, TH));
|
||||
TN = VSUB(TI, TM);
|
||||
T2f = VADD(TM, TI);
|
||||
TS = VMUL(LDK(KP707106781), VADD(TD, TH));
|
||||
TT = VSUB(TR, TS);
|
||||
T2g = VADD(TR, TS);
|
||||
TU = VFMA(LDK(KP831469612), TN, VMUL(LDK(KP555570233), TT));
|
||||
T2n = VFNMS(LDK(KP195090322), T2f, VMUL(LDK(KP980785280), T2g));
|
||||
T1i = VFNMS(LDK(KP555570233), TN, VMUL(LDK(KP831469612), TT));
|
||||
T2h = VFMA(LDK(KP980785280), T2f, VMUL(LDK(KP195090322), T2g));
|
||||
TV = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TW = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
|
||||
TX = VCONJ(TW);
|
||||
TY = VSUB(TV, TX);
|
||||
T1I = VADD(TV, TX);
|
||||
T1b = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T19 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T1a = VCONJ(T19);
|
||||
T1c = VSUB(T1a, T1b);
|
||||
T1F = VADD(T1a, T1b);
|
||||
T11 = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TZ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
T10 = VCONJ(TZ);
|
||||
T12 = VSUB(T10, T11);
|
||||
T1J = VADD(T10, T11);
|
||||
T14 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T15 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
|
||||
T16 = VCONJ(T15);
|
||||
T17 = VSUB(T14, T16);
|
||||
T1G = VADD(T14, T16);
|
||||
T1H = VSUB(T1F, T1G);
|
||||
T1K = VSUB(T1I, T1J);
|
||||
T1L = VFMA(LDK(KP923879532), T1H, VMUL(LDK(KP382683432), T1K));
|
||||
T25 = VFNMS(LDK(KP382683432), T1H, VMUL(LDK(KP923879532), T1K));
|
||||
T3J = VADD(T1F, T1G);
|
||||
T3K = VADD(T1I, T1J);
|
||||
T3L = VSUB(T3J, T3K);
|
||||
T4q = VADD(T3J, T3K);
|
||||
T13 = VMUL(LDK(KP707106781), VSUB(TY, T12));
|
||||
T18 = VSUB(T13, T17);
|
||||
T2i = VADD(T17, T13);
|
||||
T1d = VMUL(LDK(KP707106781), VADD(TY, T12));
|
||||
T1e = VSUB(T1c, T1d);
|
||||
T2j = VADD(T1c, T1d);
|
||||
T1f = VFNMS(LDK(KP555570233), T1e, VMUL(LDK(KP831469612), T18));
|
||||
T2o = VFMA(LDK(KP195090322), T2i, VMUL(LDK(KP980785280), T2j));
|
||||
T1j = VFMA(LDK(KP555570233), T18, VMUL(LDK(KP831469612), T1e));
|
||||
T2k = VFNMS(LDK(KP195090322), T2j, VMUL(LDK(KP980785280), T2i));
|
||||
}
|
||||
{
|
||||
V T4L, T4G, T4s, T4y, T3W, T4g, T42, T4a, T3g, T4e, T3o, T3E, T1w, T46, T2M;
|
||||
V T40, T2u, T4w, T2C, T4k, T36, T3A, T3i, T3s, T28, T2O, T2w, T2G, T2Y, T4K;
|
||||
V T3y, T4C;
|
||||
{
|
||||
V T4E, T4F, T4D, T4o, T4r, T4l, T4x, T3Q, T48, T3V, T49, T3M, T3U, T3F, T4f;
|
||||
V T41, T47, T3c, T3n, T3f, T3m, T3a, T3b, T3d, T3e, T39, T4d, T3l, T3D, T1h;
|
||||
V T2K, T1v, T2L, Tz, T1g, T1k, T1u, T1, T45, T2J, T3Z, T2m, T2A, T2t, T2B;
|
||||
V T2e, T2l, T2p, T2s, T2b, T4v, T2z, T4j;
|
||||
T4E = VADD(T4m, T4n);
|
||||
T4F = VADD(T4p, T4q);
|
||||
T4L = VADD(T4E, T4F);
|
||||
T4D = LDW(&(W[TWVL * 30]));
|
||||
T4G = VZMUL(T4D, VSUB(T4E, T4F));
|
||||
T4o = VSUB(T4m, T4n);
|
||||
T4r = VBYI(VSUB(T4p, T4q));
|
||||
T4l = LDW(&(W[TWVL * 46]));
|
||||
T4s = VZMUL(T4l, VSUB(T4o, T4r));
|
||||
T4x = LDW(&(W[TWVL * 14]));
|
||||
T4y = VZMUL(T4x, VADD(T4o, T4r));
|
||||
T3M = VMUL(LDK(KP707106781), VSUB(T3I, T3L));
|
||||
T3Q = VBYI(VSUB(T3M, T3P));
|
||||
T48 = VBYI(VADD(T3P, T3M));
|
||||
T3U = VMUL(LDK(KP707106781), VADD(T3I, T3L));
|
||||
T3V = VSUB(T3T, T3U);
|
||||
T49 = VADD(T3T, T3U);
|
||||
T3F = LDW(&(W[TWVL * 22]));
|
||||
T3W = VZMUL(T3F, VADD(T3Q, T3V));
|
||||
T4f = LDW(&(W[TWVL * 54]));
|
||||
T4g = VZMUL(T4f, VSUB(T49, T48));
|
||||
T41 = LDW(&(W[TWVL * 38]));
|
||||
T42 = VZMUL(T41, VSUB(T3V, T3Q));
|
||||
T47 = LDW(&(W[TWVL * 6]));
|
||||
T4a = VZMUL(T47, VADD(T48, T49));
|
||||
T3a = VADD(T1t, T1n);
|
||||
T3b = VADD(TU, T1f);
|
||||
T3c = VBYI(VADD(T3a, T3b));
|
||||
T3n = VBYI(VSUB(T3b, T3a));
|
||||
T3d = VADD(Tf, Ty);
|
||||
T3e = VADD(T1i, T1j);
|
||||
T3f = VADD(T3d, T3e);
|
||||
T3m = VSUB(T3d, T3e);
|
||||
T39 = LDW(&(W[TWVL * 4]));
|
||||
T3g = VZMULI(T39, VADD(T3c, T3f));
|
||||
T4d = LDW(&(W[TWVL * 56]));
|
||||
T4e = VZMULI(T4d, VSUB(T3f, T3c));
|
||||
T3l = LDW(&(W[TWVL * 36]));
|
||||
T3o = VZMULI(T3l, VSUB(T3m, T3n));
|
||||
T3D = LDW(&(W[TWVL * 24]));
|
||||
T3E = VZMULI(T3D, VADD(T3n, T3m));
|
||||
Tz = VSUB(Tf, Ty);
|
||||
T1g = VSUB(TU, T1f);
|
||||
T1h = VSUB(Tz, T1g);
|
||||
T2K = VADD(Tz, T1g);
|
||||
T1k = VSUB(T1i, T1j);
|
||||
T1u = VSUB(T1n, T1t);
|
||||
T1v = VBYI(VSUB(T1k, T1u));
|
||||
T2L = VBYI(VADD(T1u, T1k));
|
||||
T1 = LDW(&(W[TWVL * 20]));
|
||||
T1w = VZMULI(T1, VADD(T1h, T1v));
|
||||
T45 = LDW(&(W[TWVL * 8]));
|
||||
T46 = VZMULI(T45, VADD(T2K, T2L));
|
||||
T2J = LDW(&(W[TWVL * 52]));
|
||||
T2M = VZMULI(T2J, VSUB(T2K, T2L));
|
||||
T3Z = LDW(&(W[TWVL * 40]));
|
||||
T40 = VZMULI(T3Z, VSUB(T1h, T1v));
|
||||
T2e = VSUB(T2c, T2d);
|
||||
T2l = VSUB(T2h, T2k);
|
||||
T2m = VSUB(T2e, T2l);
|
||||
T2A = VADD(T2e, T2l);
|
||||
T2p = VSUB(T2n, T2o);
|
||||
T2s = VSUB(T2q, T2r);
|
||||
T2t = VBYI(VSUB(T2p, T2s));
|
||||
T2B = VBYI(VADD(T2s, T2p));
|
||||
T2b = LDW(&(W[TWVL * 44]));
|
||||
T2u = VZMULI(T2b, VSUB(T2m, T2t));
|
||||
T4v = LDW(&(W[TWVL * 16]));
|
||||
T4w = VZMULI(T4v, VADD(T2m, T2t));
|
||||
T2z = LDW(&(W[TWVL * 12]));
|
||||
T2C = VZMULI(T2z, VADD(T2A, T2B));
|
||||
T4j = LDW(&(W[TWVL * 48]));
|
||||
T4k = VZMULI(T4j, VSUB(T2A, T2B));
|
||||
{
|
||||
V T32, T3q, T35, T3r, T30, T31, T33, T34, T2Z, T3z, T3h, T3p, T1Y, T2E, T27;
|
||||
V T2F, T1M, T1X, T23, T26, T1x, T2N, T2v, T2D, T2U, T3x, T2X, T3w, T2S, T2T;
|
||||
V T2V, T2W, T2R, T4J, T3v, T4B;
|
||||
T30 = VADD(T21, T22);
|
||||
T31 = VADD(T1E, T1L);
|
||||
T32 = VADD(T30, T31);
|
||||
T3q = VSUB(T30, T31);
|
||||
T33 = VADD(T1W, T1T);
|
||||
T34 = VADD(T24, T25);
|
||||
T35 = VBYI(VADD(T33, T34));
|
||||
T3r = VBYI(VSUB(T34, T33));
|
||||
T2Z = LDW(&(W[TWVL * 58]));
|
||||
T36 = VZMUL(T2Z, VSUB(T32, T35));
|
||||
T3z = LDW(&(W[TWVL * 26]));
|
||||
T3A = VZMUL(T3z, VADD(T3q, T3r));
|
||||
T3h = LDW(&(W[TWVL * 2]));
|
||||
T3i = VZMUL(T3h, VADD(T32, T35));
|
||||
T3p = LDW(&(W[TWVL * 34]));
|
||||
T3s = VZMUL(T3p, VSUB(T3q, T3r));
|
||||
T1M = VSUB(T1E, T1L);
|
||||
T1X = VSUB(T1T, T1W);
|
||||
T1Y = VBYI(VSUB(T1M, T1X));
|
||||
T2E = VBYI(VADD(T1X, T1M));
|
||||
T23 = VSUB(T21, T22);
|
||||
T26 = VSUB(T24, T25);
|
||||
T27 = VSUB(T23, T26);
|
||||
T2F = VADD(T23, T26);
|
||||
T1x = LDW(&(W[TWVL * 18]));
|
||||
T28 = VZMUL(T1x, VADD(T1Y, T27));
|
||||
T2N = LDW(&(W[TWVL * 50]));
|
||||
T2O = VZMUL(T2N, VSUB(T2F, T2E));
|
||||
T2v = LDW(&(W[TWVL * 42]));
|
||||
T2w = VZMUL(T2v, VSUB(T27, T1Y));
|
||||
T2D = LDW(&(W[TWVL * 10]));
|
||||
T2G = VZMUL(T2D, VADD(T2E, T2F));
|
||||
T2S = VADD(T2c, T2d);
|
||||
T2T = VADD(T2n, T2o);
|
||||
T2U = VADD(T2S, T2T);
|
||||
T3x = VSUB(T2S, T2T);
|
||||
T2V = VADD(T2r, T2q);
|
||||
T2W = VADD(T2h, T2k);
|
||||
T2X = VBYI(VADD(T2V, T2W));
|
||||
T3w = VBYI(VSUB(T2W, T2V));
|
||||
T2R = LDW(&(W[TWVL * 60]));
|
||||
T2Y = VZMULI(T2R, VSUB(T2U, T2X));
|
||||
T4J = LDW(&(W[0]));
|
||||
T4K = VZMULI(T4J, VADD(T2X, T2U));
|
||||
T3v = LDW(&(W[TWVL * 28]));
|
||||
T3y = VZMULI(T3v, VADD(T3w, T3x));
|
||||
T4B = LDW(&(W[TWVL * 32]));
|
||||
T4C = VZMULI(T4B, VSUB(T3x, T3w));
|
||||
}
|
||||
}
|
||||
{
|
||||
V T29, T4M, T2P, T4t, T4N, T2a, T4u, T2Q, T2x, T4H, T2H, T4z, T4I, T2y, T4A;
|
||||
V T2I, T37, T4h, T3B, T3X, T4i, T38, T3Y, T3C, T3j, T4b, T3t, T43, T4c, T3k;
|
||||
V T44, T3u;
|
||||
T29 = VADD(T1w, T28);
|
||||
ST(&(Rp[WS(rs, 5)]), T29, ms, &(Rp[WS(rs, 1)]));
|
||||
T4M = VADD(T4K, T4L);
|
||||
ST(&(Rp[0]), T4M, ms, &(Rp[0]));
|
||||
T2P = VADD(T2M, T2O);
|
||||
ST(&(Rp[WS(rs, 13)]), T2P, ms, &(Rp[WS(rs, 1)]));
|
||||
T4t = VADD(T4k, T4s);
|
||||
ST(&(Rp[WS(rs, 12)]), T4t, ms, &(Rp[0]));
|
||||
T4N = VCONJ(VSUB(T4L, T4K));
|
||||
ST(&(Rm[0]), T4N, -ms, &(Rm[0]));
|
||||
T2a = VCONJ(VSUB(T28, T1w));
|
||||
ST(&(Rm[WS(rs, 5)]), T2a, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4u = VCONJ(VSUB(T4s, T4k));
|
||||
ST(&(Rm[WS(rs, 12)]), T4u, -ms, &(Rm[0]));
|
||||
T2Q = VCONJ(VSUB(T2O, T2M));
|
||||
ST(&(Rm[WS(rs, 13)]), T2Q, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2x = VADD(T2u, T2w);
|
||||
ST(&(Rp[WS(rs, 11)]), T2x, ms, &(Rp[WS(rs, 1)]));
|
||||
T4H = VADD(T4C, T4G);
|
||||
ST(&(Rp[WS(rs, 8)]), T4H, ms, &(Rp[0]));
|
||||
T2H = VADD(T2C, T2G);
|
||||
ST(&(Rp[WS(rs, 3)]), T2H, ms, &(Rp[WS(rs, 1)]));
|
||||
T4z = VADD(T4w, T4y);
|
||||
ST(&(Rp[WS(rs, 4)]), T4z, ms, &(Rp[0]));
|
||||
T4I = VCONJ(VSUB(T4G, T4C));
|
||||
ST(&(Rm[WS(rs, 8)]), T4I, -ms, &(Rm[0]));
|
||||
T2y = VCONJ(VSUB(T2w, T2u));
|
||||
ST(&(Rm[WS(rs, 11)]), T2y, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4A = VCONJ(VSUB(T4y, T4w));
|
||||
ST(&(Rm[WS(rs, 4)]), T4A, -ms, &(Rm[0]));
|
||||
T2I = VCONJ(VSUB(T2G, T2C));
|
||||
ST(&(Rm[WS(rs, 3)]), T2I, -ms, &(Rm[WS(rs, 1)]));
|
||||
T37 = VADD(T2Y, T36);
|
||||
ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
|
||||
T4h = VADD(T4e, T4g);
|
||||
ST(&(Rp[WS(rs, 14)]), T4h, ms, &(Rp[0]));
|
||||
T3B = VADD(T3y, T3A);
|
||||
ST(&(Rp[WS(rs, 7)]), T3B, ms, &(Rp[WS(rs, 1)]));
|
||||
T3X = VADD(T3E, T3W);
|
||||
ST(&(Rp[WS(rs, 6)]), T3X, ms, &(Rp[0]));
|
||||
T4i = VCONJ(VSUB(T4g, T4e));
|
||||
ST(&(Rm[WS(rs, 14)]), T4i, -ms, &(Rm[0]));
|
||||
T38 = VCONJ(VSUB(T36, T2Y));
|
||||
ST(&(Rm[WS(rs, 15)]), T38, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3Y = VCONJ(VSUB(T3W, T3E));
|
||||
ST(&(Rm[WS(rs, 6)]), T3Y, -ms, &(Rm[0]));
|
||||
T3C = VCONJ(VSUB(T3A, T3y));
|
||||
ST(&(Rm[WS(rs, 7)]), T3C, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3j = VADD(T3g, T3i);
|
||||
ST(&(Rp[WS(rs, 1)]), T3j, ms, &(Rp[WS(rs, 1)]));
|
||||
T4b = VADD(T46, T4a);
|
||||
ST(&(Rp[WS(rs, 2)]), T4b, ms, &(Rp[0]));
|
||||
T3t = VADD(T3o, T3s);
|
||||
ST(&(Rp[WS(rs, 9)]), T3t, ms, &(Rp[WS(rs, 1)]));
|
||||
T43 = VADD(T40, T42);
|
||||
ST(&(Rp[WS(rs, 10)]), T43, ms, &(Rp[0]));
|
||||
T4c = VCONJ(VSUB(T4a, T46));
|
||||
ST(&(Rm[WS(rs, 2)]), T4c, -ms, &(Rm[0]));
|
||||
T3k = VCONJ(VSUB(T3i, T3g));
|
||||
ST(&(Rm[WS(rs, 1)]), T3k, -ms, &(Rm[WS(rs, 1)]));
|
||||
T44 = VCONJ(VSUB(T42, T40));
|
||||
ST(&(Rm[WS(rs, 10)]), T44, -ms, &(Rm[0]));
|
||||
T3u = VCONJ(VSUB(T3s, T3o));
|
||||
ST(&(Rm[WS(rs, 9)]), T3u, -ms, &(Rm[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for hc2cbdftv_32: one VTW(1, k) entry for each
 * k = 1..31, closed by a TW_NEXT entry.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
VTW(1, 16),
|
||||
VTW(1, 17),
|
||||
VTW(1, 18),
|
||||
VTW(1, 19),
|
||||
VTW(1, 20),
|
||||
VTW(1, 21),
|
||||
VTW(1, 22),
|
||||
VTW(1, 23),
|
||||
VTW(1, 24),
|
||||
VTW(1, 25),
|
||||
VTW(1, 26),
|
||||
VTW(1, 27),
|
||||
VTW(1, 28),
|
||||
VTW(1, 29),
|
||||
VTW(1, 30),
|
||||
VTW(1, 31),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: leading 32 (presumably the transform size), the codelet
 * name string, the twiddle table above, &GENUS, and a four-entry count list.
 * NOTE(review): the counts look like (adds, muls, fmas, other) operation
 * totals -- confirm against the hc2c_desc declaration.
 */
static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cbdftv_32"), twinstr, &GENUS, { 233, 88, 16, 0 } };
|
||||
|
||||
/*
 * Registration hook: passes the hc2cbdftv_32 kernel and its descriptor to the
 * planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cbdftv_32) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
144
fftw-3.3.10/rdft/simd/common/hc2cbdftv_4.c
Normal file
144
fftw-3.3.10/rdft/simd/common/hc2cbdftv_4.c
Normal file
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 4 -dif -sign 1 -name hc2cbdftv_4 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 15 FP additions, 12 FP multiplications,
|
||||
* (or, 9 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 20 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_4: generated straight-line SIMD kernel, FMA variant (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * Rp, Rm - read and then overwritten in place; Rp steps forward and Rm steps
 *          backward by VL * ms on every loop iteration.
 * Ip, Im - stepped in the loop header but never dereferenced in this body.
 * W      - twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 6) and
 *          advanced by TWVL * 6 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - per-iteration stride; also passed through to LD / ST.
 *
 * All four LDs are issued before the first ST, and the stores hit the same
 * Rp / Rm slots that were loaded, so the loads must stay ahead of the stores.
 */
static void hc2cbdftv_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 6)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
V Th, Tg, T8, Tc, T4, Ta, T7, Tb, T2, T3, T5, T6, Tf, T1, T9;
|
||||
V Td, Tj, Te, Ti;
|
||||
/* Pair Rp[0] with Rm[1] and Rp[1] with Rm[0]; each pair is combined with
   VFNMSCONJ and VFMACONJ (SIMD-header macros not visible in this file).
   NOTE(review): the non-FMA variant of this kernel uses VSUB / VADD against
   VCONJ(Rm input) at the same points -- presumably equivalent; confirm. */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = VFNMSCONJ(T3, T2);
|
||||
Ta = VFMACONJ(T3, T2);
|
||||
T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T6 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T7 = VFNMSCONJ(T6, T5);
|
||||
Tb = VFMACONJ(T6, T5);
|
||||
Th = VADD(Ta, Tb);
|
||||
/* Three twiddle vectors (W[0], W[TWVL * 4], W[TWVL * 2]) are applied with
   VZMULI / VZMUL; Th is used without a twiddle. */
Tf = LDW(&(W[0]));
|
||||
Tg = VZMULI(Tf, VFMAI(T7, T4));
|
||||
T1 = LDW(&(W[TWVL * 4]));
|
||||
T8 = VZMULI(T1, VFNMSI(T7, T4));
|
||||
T9 = LDW(&(W[TWVL * 2]));
|
||||
Tc = VZMUL(T9, VSUB(Ta, Tb));
|
||||
/* Outputs: Rp slots receive sums, Rm slots receive VCONJ of differences. */
Td = VADD(T8, Tc);
|
||||
ST(&(Rp[WS(rs, 1)]), Td, ms, &(Rp[WS(rs, 1)]));
|
||||
Tj = VCONJ(VSUB(Th, Tg));
|
||||
ST(&(Rm[0]), Tj, -ms, &(Rm[0]));
|
||||
Te = VCONJ(VSUB(Tc, T8));
|
||||
ST(&(Rm[WS(rs, 1)]), Te, -ms, &(Rm[WS(rs, 1)]));
|
||||
Ti = VADD(Tg, Th);
|
||||
ST(&(Rp[0]), Ti, ms, &(Rp[0]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for hc2cbdftv_4: VTW(1, k) for k = 1..3, closed
 * by a TW_NEXT entry. Three entries match the three LDW reads the kernel
 * performs per iteration.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: leading 4 (presumably the transform size, matching the
 * generator's -n 4), the codelet name string, the twiddle table, &GENUS, and
 * the counts { 9, 6, 6, 0 }. The first three equal the additions /
 * multiplications / fused multiply-adds in the generator's operation-count
 * comment for this variant; the role of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 4, XSIMD_STRING("hc2cbdftv_4"), twinstr, &GENUS, { 9, 6, 6, 0 } };
|
||||
|
||||
/*
 * Registration hook (FMA build): passes the hc2cbdftv_4 kernel and its
 * descriptor to the planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cbdftv_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_4, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 4 -dif -sign 1 -name hc2cbdftv_4 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 15 FP additions, 6 FP multiplications,
|
||||
* (or, 15 additions, 6 multiplications, 0 fused multiply/add),
|
||||
* 22 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_4: generated straight-line SIMD kernel, non-FMA variant (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * Rp, Rm - read and then overwritten in place; Rp steps forward and Rm steps
 *          backward by VL * ms on every loop iteration.
 * Ip, Im - stepped in the loop header but never dereferenced in this body.
 * W      - twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 6) and
 *          advanced by TWVL * 6 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - per-iteration stride; also passed through to LD / ST.
 *
 * All four LDs are issued before the first ST, and the stores hit the same
 * Rp / Rm slots that were loaded, so the loads must stay ahead of the stores.
 */
static void hc2cbdftv_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 6)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
V T5, Tc, T9, Td, T2, T4, T3, T6, T8, T7, Tj, Ti, Th, Tk, Tl;
|
||||
V Ta, Te, T1, Tb, Tf, Tg;
|
||||
/* Pair Rp[0] with Rm[1] and Rp[1] with Rm[0]: apply VCONJ to the Rm input,
   then take VSUB / VADD against the Rp input. The second difference is
   additionally passed through VBYI. */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = VCONJ(T3);
|
||||
T5 = VSUB(T2, T4);
|
||||
Tc = VADD(T2, T4);
|
||||
T6 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T7 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T8 = VCONJ(T7);
|
||||
T9 = VBYI(VSUB(T6, T8));
|
||||
Td = VADD(T6, T8);
|
||||
Tj = VADD(Tc, Td);
|
||||
/* Slot 0 outputs: twiddle W[0] applied with VZMULI, combined with the
   untwiddled sum Tj. */
Th = LDW(&(W[0]));
|
||||
Ti = VZMULI(Th, VADD(T5, T9));
|
||||
Tk = VADD(Ti, Tj);
|
||||
ST(&(Rp[0]), Tk, ms, &(Rp[0]));
|
||||
Tl = VCONJ(VSUB(Tj, Ti));
|
||||
ST(&(Rm[0]), Tl, -ms, &(Rm[0]));
|
||||
/* Slot 1 outputs: twiddles W[TWVL * 4] (VZMULI) and W[TWVL * 2] (VZMUL). */
T1 = LDW(&(W[TWVL * 4]));
|
||||
Ta = VZMULI(T1, VSUB(T5, T9));
|
||||
Tb = LDW(&(W[TWVL * 2]));
|
||||
Te = VZMUL(Tb, VSUB(Tc, Td));
|
||||
Tf = VADD(Ta, Te);
|
||||
ST(&(Rp[WS(rs, 1)]), Tf, ms, &(Rp[WS(rs, 1)]));
|
||||
Tg = VCONJ(VSUB(Te, Ta));
|
||||
ST(&(Rm[WS(rs, 1)]), Tg, -ms, &(Rm[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for hc2cbdftv_4: VTW(1, k) for k = 1..3, closed
 * by a TW_NEXT entry. Three entries match the three LDW reads the kernel
 * performs per iteration.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: leading 4 (presumably the transform size, matching the
 * generator's -n 4), the codelet name string, the twiddle table, &GENUS, and
 * the counts { 15, 6, 0, 0 }. The first three equal the additions /
 * multiplications / fused multiply-adds in the generator's operation-count
 * comment for this variant; the role of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 4, XSIMD_STRING("hc2cbdftv_4"), twinstr, &GENUS, { 15, 6, 0, 0 } };
|
||||
|
||||
/*
 * Registration hook (non-FMA build): passes the hc2cbdftv_4 kernel and its
 * descriptor to the planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cbdftv_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_4, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
191
fftw-3.3.10/rdft/simd/common/hc2cbdftv_6.c
Normal file
191
fftw-3.3.10/rdft/simd/common/hc2cbdftv_6.c
Normal file
@@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dif -sign 1 -name hc2cbdftv_6 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 29 FP additions, 24 FP multiplications,
|
||||
* (or, 17 additions, 12 multiplications, 12 fused multiply/add),
|
||||
* 38 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_6: generated straight-line SIMD kernel, FMA variant (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * Rp, Rm - read and then overwritten in place; Rp steps forward and Rm steps
 *          backward by VL * ms on every loop iteration.
 * Ip, Im - stepped in the loop header but never dereferenced in this body.
 * W      - twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 10) and
 *          advanced by TWVL * 10 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - per-iteration stride; also passed through to LD / ST.
 *
 * All six LDs are issued before the first ST, and the stores hit the same
 * Rp / Rm slots that were loaded, so the loads must stay ahead of the stores.
 */
static void hc2cbdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants 0.5 and 0.866025403... (numerically sqrt(3)/2), read via LDK(). */
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
V T4, Te, Tj, Tp, Tb, To, Th, Ti, Ta, Tg, T7, Tf, T2, T3, T8;
|
||||
V T9, T5, T6, Tx, Tw, Tv, Ty, Tz, Tq, Ts, Tn, Tr, Tt, Tu, Tc;
|
||||
V Tk, T1, Td, Tl, Tm;
|
||||
/* Pair Rp[0]/Rm[2], Rp[1]/Rm[1] and Rp[2]/Rm[0]; each pair is combined with
   the VFNMSCONJ / VFMSCONJ / VFMACONJ macros (defined outside this file). */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T4 = VFNMSCONJ(T3, T2);
|
||||
Te = VFMACONJ(T3, T2);
|
||||
T8 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T9 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ta = VFMSCONJ(T9, T8);
|
||||
Tg = VFMACONJ(T9, T8);
|
||||
T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T7 = VFNMSCONJ(T6, T5);
|
||||
Tf = VFMACONJ(T6, T5);
|
||||
/* Combine the pair results using the 0.866... and 0.5 constants. */
Tj = VMUL(LDK(KP866025403), VSUB(Tf, Tg));
|
||||
Tp = VMUL(LDK(KP866025403), VSUB(T7, Ta));
|
||||
Tb = VADD(T7, Ta);
|
||||
To = VFNMS(LDK(KP500000000), Tb, T4);
|
||||
Th = VADD(Tf, Tg);
|
||||
Ti = VFNMS(LDK(KP500000000), Th, Te);
|
||||
Tx = VADD(Te, Th);
|
||||
/* Slot 0 outputs: twiddle W[0]; Tx is used without a twiddle. */
Tv = LDW(&(W[0]));
|
||||
Tw = VZMULI(Tv, VFMAI(Tp, To));
|
||||
Ty = VADD(Tw, Tx);
|
||||
ST(&(Rp[0]), Ty, ms, &(Rp[0]));
|
||||
Tz = VCONJ(VSUB(Tx, Tw));
|
||||
ST(&(Rm[0]), Tz, -ms, &(Rm[0]));
|
||||
/* Slot 2 outputs: twiddles W[TWVL * 8] and W[TWVL * 6]. */
Tn = LDW(&(W[TWVL * 8]));
|
||||
Tq = VZMULI(Tn, VFNMSI(Tp, To));
|
||||
Tr = LDW(&(W[TWVL * 6]));
|
||||
Ts = VZMUL(Tr, VFMAI(Tj, Ti));
|
||||
Tt = VADD(Tq, Ts);
|
||||
ST(&(Rp[WS(rs, 2)]), Tt, ms, &(Rp[0]));
|
||||
Tu = VCONJ(VSUB(Ts, Tq));
|
||||
ST(&(Rm[WS(rs, 2)]), Tu, -ms, &(Rm[0]));
|
||||
/* Slot 1 outputs: twiddles W[TWVL * 4] and W[TWVL * 2]. */
T1 = LDW(&(W[TWVL * 4]));
|
||||
Tc = VZMULI(T1, VADD(T4, Tb));
|
||||
Td = LDW(&(W[TWVL * 2]));
|
||||
Tk = VZMUL(Td, VFNMSI(Tj, Ti));
|
||||
Tl = VADD(Tc, Tk);
|
||||
ST(&(Rp[WS(rs, 1)]), Tl, ms, &(Rp[WS(rs, 1)]));
|
||||
Tm = VCONJ(VSUB(Tk, Tc));
|
||||
ST(&(Rm[WS(rs, 1)]), Tm, -ms, &(Rm[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for hc2cbdftv_6: VTW(1, k) for k = 1..5, closed
 * by a TW_NEXT entry. Five entries match the five LDW reads the kernel
 * performs per iteration.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: leading 6 (presumably the transform size, matching the
 * generator's -n 6), the codelet name string, the twiddle table, &GENUS, and
 * the counts { 17, 12, 12, 0 }. The first three equal the additions /
 * multiplications / fused multiply-adds in the generator's operation-count
 * comment for this variant; the role of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 6, XSIMD_STRING("hc2cbdftv_6"), twinstr, &GENUS, { 17, 12, 12, 0 } };
|
||||
|
||||
/*
 * Registration hook (FMA build): passes the hc2cbdftv_6 kernel and its
 * descriptor to the planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cbdftv_6) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_6, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dif -sign 1 -name hc2cbdftv_6 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 29 FP additions, 14 FP multiplications,
|
||||
* (or, 27 additions, 12 multiplications, 2 fused multiply/add),
|
||||
* 41 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_6: generated straight-line SIMD kernel, non-FMA variant (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * Rp, Rm - read and then overwritten in place; Rp steps forward and Rm steps
 *          backward by VL * ms on every loop iteration.
 * Ip, Im - stepped in the loop header but never dereferenced in this body.
 * W      - twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 10) and
 *          advanced by TWVL * 10 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - per-iteration stride; also passed through to LD / ST.
 *
 * All six LDs are issued before the first ST, and the stores hit the same
 * Rp / Rm slots that were loaded, so the loads must stay ahead of the stores.
 */
static void hc2cbdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants 0.5 and 0.866025403... (numerically sqrt(3)/2), read via LDK(). */
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
V T5, Th, Te, Ts, Tk, Tm, T2, T4, T3, T6, Tc, T8, Tb, T7, Ta;
|
||||
V T9, Td, Ti, Tj, TA, Tf, Tn, Tv, Tt, Tz, T1, Tl, Tg, Tu, Tr;
|
||||
V Tq, Ty, To, Tp, TC, TB, Tx, Tw;
|
||||
/* Load Rp[0..2] and Rm[0..2]; each Rm input goes through VCONJ before
   being added to / subtracted from its Rp partner. */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T4 = VCONJ(T3);
|
||||
T5 = VSUB(T2, T4);
|
||||
Th = VADD(T2, T4);
|
||||
T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T7 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T8 = VCONJ(T7);
|
||||
Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tb = VCONJ(Ta);
|
||||
T9 = VSUB(T6, T8);
|
||||
Td = VSUB(Tb, Tc);
|
||||
Te = VADD(T9, Td);
|
||||
Ts = VBYI(VMUL(LDK(KP866025403), VSUB(T9, Td)));
|
||||
Ti = VADD(T6, T8);
|
||||
Tj = VADD(Tb, Tc);
|
||||
Tk = VADD(Ti, Tj);
|
||||
Tm = VBYI(VMUL(LDK(KP866025403), VSUB(Ti, Tj)));
|
||||
TA = VADD(Th, Tk);
|
||||
/* Apply the five twiddles (W[0], W[TWVL * 2], ..., W[TWVL * 8]) with
   VZMULI / VZMUL; TA is used without a twiddle. */
T1 = LDW(&(W[TWVL * 4]));
|
||||
Tf = VZMULI(T1, VADD(T5, Te));
|
||||
Tl = VFNMS(LDK(KP500000000), Tk, Th);
|
||||
Tg = LDW(&(W[TWVL * 2]));
|
||||
Tn = VZMUL(Tg, VSUB(Tl, Tm));
|
||||
Tu = LDW(&(W[TWVL * 6]));
|
||||
Tv = VZMUL(Tu, VADD(Tm, Tl));
|
||||
Tr = VFNMS(LDK(KP500000000), Te, T5);
|
||||
Tq = LDW(&(W[TWVL * 8]));
|
||||
Tt = VZMULI(Tq, VSUB(Tr, Ts));
|
||||
Ty = LDW(&(W[0]));
|
||||
Tz = VZMULI(Ty, VADD(Ts, Tr));
|
||||
/* Outputs: Rp slots receive sums, Rm slots receive VCONJ of differences. */
To = VADD(Tf, Tn);
|
||||
ST(&(Rp[WS(rs, 1)]), To, ms, &(Rp[WS(rs, 1)]));
|
||||
Tp = VCONJ(VSUB(Tn, Tf));
|
||||
ST(&(Rm[WS(rs, 1)]), Tp, -ms, &(Rm[WS(rs, 1)]));
|
||||
TC = VCONJ(VSUB(TA, Tz));
|
||||
ST(&(Rm[0]), TC, -ms, &(Rm[0]));
|
||||
TB = VADD(Tz, TA);
|
||||
ST(&(Rp[0]), TB, ms, &(Rp[0]));
|
||||
Tx = VCONJ(VSUB(Tv, Tt));
|
||||
ST(&(Rm[WS(rs, 2)]), Tx, -ms, &(Rm[0]));
|
||||
Tw = VADD(Tt, Tv);
|
||||
ST(&(Rp[WS(rs, 2)]), Tw, ms, &(Rp[0]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for hc2cbdftv_6: VTW(1, k) for k = 1..5, closed
 * by a TW_NEXT entry. Five entries match the five LDW reads the kernel
 * performs per iteration.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: leading 6 (presumably the transform size, matching the
 * generator's -n 6), the codelet name string, the twiddle table, &GENUS, and
 * the counts { 27, 12, 2, 0 }. The first three equal the additions /
 * multiplications / fused multiply-adds in the generator's operation-count
 * comment for this variant; the role of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 6, XSIMD_STRING("hc2cbdftv_6"), twinstr, &GENUS, { 27, 12, 2, 0 } };
|
||||
|
||||
/*
 * Registration hook (non-FMA build): passes the hc2cbdftv_6 kernel and its
 * descriptor to the planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cbdftv_6) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_6, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
228
fftw-3.3.10/rdft/simd/common/hc2cbdftv_8.c
Normal file
228
fftw-3.3.10/rdft/simd/common/hc2cbdftv_8.c
Normal file
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dif -sign 1 -name hc2cbdftv_8 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 41 FP additions, 32 FP multiplications,
|
||||
* (or, 23 additions, 14 multiplications, 18 fused multiply/add),
|
||||
* 51 stack variables, 1 constant, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_8: generated straight-line SIMD kernel, FMA variant (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * Rp, Rm - read and then overwritten in place; Rp steps forward and Rm steps
 *          backward by VL * ms on every loop iteration.
 * Ip, Im - stepped in the loop header but never dereferenced in this body.
 * W      - twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 14) and
 *          advanced by TWVL * 14 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - per-iteration stride; also passed through to LD / ST.
 *
 * All eight LDs are issued before the first ST, and the stores hit the same
 * Rp / Rm slots that were loaded, so the loads must stay ahead of the stores.
 */
static void hc2cbdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constant 0.707106781... (numerically 1/sqrt(2)), read via LDK(). */
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
V Tm, Tp, TF, TE, Th, Tv, Tc, Tu, T4, Tk, Tf, Tl, T7, Tn, Ta;
|
||||
V To, T2, T3, Td, Te, T5, T6, T8, T9, Tg, Tb, TL, TK, TJ, TM;
|
||||
V TN, TC, TG, TB, TD, TH, TI, Ti, Tq, T1, Tj, Tr, Ts, Tw, Ty;
|
||||
V Tt, Tx, Tz, TA;
|
||||
/* Pair Rp[0]/Rm[3], Rp[2]/Rm[1], Rp[1]/Rm[2] and Rp[3]/Rm[0]; each pair is
   combined with the VFNMSCONJ / VFMSCONJ / VFMACONJ macros (defined outside
   this file). */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = VFNMSCONJ(T3, T2);
|
||||
Tk = VFMACONJ(T3, T2);
|
||||
Td = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Te = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tf = VFNMSCONJ(Te, Td);
|
||||
Tl = VFMACONJ(Te, Td);
|
||||
T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T7 = VFNMSCONJ(T6, T5);
|
||||
Tn = VFMACONJ(T6, T5);
|
||||
T8 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T9 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Ta = VFMSCONJ(T9, T8);
|
||||
To = VFMACONJ(T9, T8);
|
||||
/* Combine the pair results; the 0.707... constant enters through
   VFMA / VFNMS. */
Tm = VSUB(Tk, Tl);
|
||||
Tp = VSUB(Tn, To);
|
||||
TF = VADD(Tn, To);
|
||||
TE = VADD(Tk, Tl);
|
||||
Tg = VSUB(T7, Ta);
|
||||
Th = VFMA(LDK(KP707106781), Tg, Tf);
|
||||
Tv = VFNMS(LDK(KP707106781), Tg, Tf);
|
||||
Tb = VADD(T7, Ta);
|
||||
Tc = VFMA(LDK(KP707106781), Tb, T4);
|
||||
Tu = VFNMS(LDK(KP707106781), Tb, T4);
|
||||
/* Slot 0 outputs: twiddle W[0]; TL is used without a twiddle. */
TL = VADD(TE, TF);
|
||||
TJ = LDW(&(W[0]));
|
||||
TK = VZMULI(TJ, VFMAI(Th, Tc));
|
||||
TM = VADD(TK, TL);
|
||||
ST(&(Rp[0]), TM, ms, &(Rp[0]));
|
||||
TN = VCONJ(VSUB(TL, TK));
|
||||
ST(&(Rm[0]), TN, -ms, &(Rm[0]));
|
||||
/* Slot 2 outputs: twiddles W[TWVL * 8] and W[TWVL * 6]. */
TB = LDW(&(W[TWVL * 8]));
|
||||
TC = VZMULI(TB, VFMAI(Tv, Tu));
|
||||
TD = LDW(&(W[TWVL * 6]));
|
||||
TG = VZMUL(TD, VSUB(TE, TF));
|
||||
TH = VADD(TC, TG);
|
||||
ST(&(Rp[WS(rs, 2)]), TH, ms, &(Rp[0]));
|
||||
TI = VCONJ(VSUB(TG, TC));
|
||||
ST(&(Rm[WS(rs, 2)]), TI, -ms, &(Rm[0]));
|
||||
/* Slot 3 outputs: twiddles W[TWVL * 12] and W[TWVL * 10]. */
T1 = LDW(&(W[TWVL * 12]));
|
||||
Ti = VZMULI(T1, VFNMSI(Th, Tc));
|
||||
Tj = LDW(&(W[TWVL * 10]));
|
||||
Tq = VZMUL(Tj, VFNMSI(Tp, Tm));
|
||||
Tr = VADD(Ti, Tq);
|
||||
ST(&(Rp[WS(rs, 3)]), Tr, ms, &(Rp[WS(rs, 1)]));
|
||||
Ts = VCONJ(VSUB(Tq, Ti));
|
||||
ST(&(Rm[WS(rs, 3)]), Ts, -ms, &(Rm[WS(rs, 1)]));
|
||||
/* Slot 1 outputs: twiddles W[TWVL * 4] and W[TWVL * 2]. */
Tt = LDW(&(W[TWVL * 4]));
|
||||
Tw = VZMULI(Tt, VFNMSI(Tv, Tu));
|
||||
Tx = LDW(&(W[TWVL * 2]));
|
||||
Ty = VZMUL(Tx, VFMAI(Tp, Tm));
|
||||
Tz = VADD(Tw, Ty);
|
||||
ST(&(Rp[WS(rs, 1)]), Tz, ms, &(Rp[WS(rs, 1)]));
|
||||
TA = VCONJ(VSUB(Ty, Tw));
|
||||
ST(&(Rm[WS(rs, 1)]), TA, -ms, &(Rm[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for hc2cbdftv_8: VTW(1, k) for k = 1..7, closed
 * by a TW_NEXT entry. Seven entries match the seven LDW reads the kernel
 * performs per iteration.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: leading 8 (presumably the transform size, matching the
 * generator's -n 8), the codelet name string, the twiddle table, &GENUS, and
 * the counts { 23, 14, 18, 0 }. The first three equal the additions /
 * multiplications / fused multiply-adds in the generator's operation-count
 * comment for this variant; the role of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cbdftv_8"), twinstr, &GENUS, { 23, 14, 18, 0 } };
|
||||
|
||||
/*
 * Registration hook (FMA build): passes the hc2cbdftv_8 kernel and its
 * descriptor to the planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cbdftv_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_8, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dif -sign 1 -name hc2cbdftv_8 -include rdft/simd/hc2cbv.h */
|
||||
|
||||
/*
|
||||
* This function contains 41 FP additions, 16 FP multiplications,
|
||||
* (or, 41 additions, 16 multiplications, 0 fused multiply/add),
|
||||
* 55 stack variables, 1 constant, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cbv.h"
|
||||
|
||||
/*
 * hc2cbdftv_8: generated straight-line SIMD kernel, non-FMA variant (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * Rp, Rm - read and then overwritten in place; Rp steps forward and Rm steps
 *          backward by VL * ms on every loop iteration.
 * Ip, Im - stepped in the loop header but never dereferenced in this body.
 * W      - twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 14) and
 *          advanced by TWVL * 14 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - per-iteration stride; also passed through to LD / ST.
 *
 * All eight LDs are issued before the first ST, and the stores hit the same
 * Rp / Rm slots that were loaded, so the loads must stay ahead of the stores.
 */
static void hc2cbdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constant 0.707106781... (numerically 1/sqrt(2)), read via LDK(). */
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
V T5, Tj, Tq, TI, Te, Tk, Tt, TJ, T2, Tg, T4, Ti, T3, Th, To;
|
||||
V Tp, T6, Tc, T8, Tb, T7, Ta, T9, Td, Tr, Ts, TP, Tu, Tm, TO;
|
||||
V Tn, Tf, Tl, T1, TN, Tv, TR, Tw, TQ, TC, TK, TA, TG, TB, TH;
|
||||
V Ty, Tz, Tx, TF, TD, TM, TE, TL;
|
||||
/* Even-indexed Rp inputs (0, 2) with Rm[3], Rm[1]: VCONJ the Rm value, then
   VSUB / VADD against the Rp value. */
T2 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T3 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = VCONJ(T3);
|
||||
Th = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ti = VCONJ(Th);
|
||||
T5 = VSUB(T2, T4);
|
||||
Tj = VSUB(Tg, Ti);
|
||||
To = VADD(T2, T4);
|
||||
Tp = VADD(Tg, Ti);
|
||||
Tq = VSUB(To, Tp);
|
||||
TI = VADD(To, Tp);
|
||||
/* Odd-indexed Rp inputs (1, 3) with Rm[2], Rm[0]; the 0.707... constant is
   applied to the sum and difference of the two differences. */
T6 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tc = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T8 = VCONJ(T7);
|
||||
Ta = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Tb = VCONJ(Ta);
|
||||
T9 = VSUB(T6, T8);
|
||||
Td = VSUB(Tb, Tc);
|
||||
Te = VMUL(LDK(KP707106781), VADD(T9, Td));
|
||||
Tk = VMUL(LDK(KP707106781), VSUB(T9, Td));
|
||||
Tr = VADD(T6, T8);
|
||||
Ts = VADD(Tb, Tc);
|
||||
Tt = VBYI(VSUB(Tr, Ts));
|
||||
TJ = VADD(Tr, Ts);
|
||||
/* Twiddles W[TWVL * 10], W[TWVL * 12] and W[0]; TP is used untwiddled. */
TP = VADD(TI, TJ);
|
||||
Tn = LDW(&(W[TWVL * 10]));
|
||||
Tu = VZMUL(Tn, VSUB(Tq, Tt));
|
||||
Tf = VADD(T5, Te);
|
||||
Tl = VBYI(VADD(Tj, Tk));
|
||||
T1 = LDW(&(W[TWVL * 12]));
|
||||
Tm = VZMULI(T1, VSUB(Tf, Tl));
|
||||
TN = LDW(&(W[0]));
|
||||
TO = VZMULI(TN, VADD(Tl, Tf));
|
||||
/* Outputs for slots 3 and 0. */
Tv = VADD(Tm, Tu);
|
||||
ST(&(Rp[WS(rs, 3)]), Tv, ms, &(Rp[WS(rs, 1)]));
|
||||
TR = VCONJ(VSUB(TP, TO));
|
||||
ST(&(Rm[0]), TR, -ms, &(Rm[0]));
|
||||
Tw = VCONJ(VSUB(Tu, Tm));
|
||||
ST(&(Rm[WS(rs, 3)]), Tw, -ms, &(Rm[WS(rs, 1)]));
|
||||
TQ = VADD(TO, TP);
|
||||
ST(&(Rp[0]), TQ, ms, &(Rp[0]));
|
||||
/* Twiddles W[TWVL * 2], W[TWVL * 6], W[TWVL * 4] and W[TWVL * 8]. */
TB = LDW(&(W[TWVL * 2]));
|
||||
TC = VZMUL(TB, VADD(Tq, Tt));
|
||||
TH = LDW(&(W[TWVL * 6]));
|
||||
TK = VZMUL(TH, VSUB(TI, TJ));
|
||||
Ty = VBYI(VSUB(Tk, Tj));
|
||||
Tz = VSUB(T5, Te);
|
||||
Tx = LDW(&(W[TWVL * 4]));
|
||||
TA = VZMULI(Tx, VADD(Ty, Tz));
|
||||
TF = LDW(&(W[TWVL * 8]));
|
||||
TG = VZMULI(TF, VSUB(Tz, Ty));
|
||||
/* Outputs for slots 1 and 2. */
TD = VADD(TA, TC);
|
||||
ST(&(Rp[WS(rs, 1)]), TD, ms, &(Rp[WS(rs, 1)]));
|
||||
TM = VCONJ(VSUB(TK, TG));
|
||||
ST(&(Rm[WS(rs, 2)]), TM, -ms, &(Rm[0]));
|
||||
TE = VCONJ(VSUB(TC, TA));
|
||||
ST(&(Rm[WS(rs, 1)]), TE, -ms, &(Rm[WS(rs, 1)]));
|
||||
TL = VADD(TG, TK);
|
||||
ST(&(Rp[WS(rs, 2)]), TL, ms, &(Rp[0]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for hc2cbdftv_8: VTW(1, k) for k = 1..7, closed
 * by a TW_NEXT entry. Seven entries match the seven LDW reads the kernel
 * performs per iteration.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: leading 8 (presumably the transform size, matching the
 * generator's -n 8), the codelet name string, the twiddle table, &GENUS, and
 * the counts { 41, 16, 0, 0 }. The first three equal the additions /
 * multiplications / fused multiply-adds in the generator's operation-count
 * comment for this variant; the role of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cbdftv_8"), twinstr, &GENUS, { 41, 16, 0, 0 } };
|
||||
|
||||
/*
 * Registration hook (non-FMA build): passes the hc2cbdftv_8 kernel and its
 * descriptor to the planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cbdftv_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cbdftv_8, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
297
fftw-3.3.10/rdft/simd/common/hc2cfdftv_10.c
Normal file
297
fftw-3.3.10/rdft/simd/common/hc2cfdftv_10.c
Normal file
@@ -0,0 +1,297 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dit -name hc2cfdftv_10 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 61 FP additions, 60 FP multiplications,
|
||||
* (or, 33 additions, 32 multiplications, 28 fused multiply/add),
|
||||
* 77 stack variables, 5 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_10: generated straight-line SIMD kernel, FMA variant (see the
 * "Generated by" comment above for the exact generator flags).
 *
 * Rp, Rm - read and then overwritten in place; Rp steps forward and Rm steps
 *          backward by VL * ms on every loop iteration.
 * Ip, Im - stepped in the loop header but never dereferenced in this body.
 * W      - twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 18) and
 *          advanced by TWVL * 18 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - per-iteration stride; also passed through to LD / ST.
 *
 * All ten LDs sit in the inner brace block, ahead of every ST, and the stores
 * hit the same Rp / Rm slots that were loaded, so the loads must stay ahead
 * of the stores. Every stored value carries a KP500000000 (0.5) factor.
 */
static void hc2cfdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants, read via LDK(): 0.559016994 (numerically sqrt(5)/4),
   0.25, 0.618033988 (numerically (sqrt(5) - 1)/2), 0.951056516 and 0.5. */
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
V T8, T11, T12, TG, TH, TP, Tp, TA, TB, TS, TV, TW, TC, TX, TI;
|
||||
V TM, TF, TL, TD, TE, TJ, TO, TK, TN, T13, T17, T10, T16, TY, TZ;
|
||||
V T14, T19, T15, T18;
|
||||
/* Input stage: temporaries declared in this inner block are local to it;
   only the values assigned to the outer variables survive past it. */
{
|
||||
V T3, To, TU, Th, TT, TR, Tz, Tu, TQ, T7, T1, T2, Tw, T5, T6;
|
||||
V Tr, Tc, Tj, Tg, Ty, Tn, Tt, Tv, Tq, Ta, Tb, T9, Ti, Te, Tf;
|
||||
V Td, Tx, Tl, Tm, Tk, Ts, T4;
|
||||
/* For each index i = 0..4, Rp[i] and Rm[i] are loaded and combined with
   VFMACONJ / VFNMSCONJ; the nine twiddles W[0], W[TWVL * 2], ...,
   W[TWVL * 16] are applied immediately with VZMULJ / VZMULIJ. Only T3 (the
   VFMACONJ of the index-0 pair) is left untwiddled. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Tv = LDW(&(W[0]));
|
||||
Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
|
||||
T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
Tq = LDW(&(W[TWVL * 6]));
|
||||
Tr = VZMULJ(Tq, VFMACONJ(T6, T5));
|
||||
Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T9 = LDW(&(W[TWVL * 2]));
|
||||
Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
|
||||
Ti = LDW(&(W[TWVL * 4]));
|
||||
Tj = VZMULIJ(Ti, VFNMSCONJ(Tb, Ta));
|
||||
Te = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tf = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Td = LDW(&(W[TWVL * 12]));
|
||||
Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
|
||||
Tx = LDW(&(W[TWVL * 10]));
|
||||
Ty = VZMULJ(Tx, VFMACONJ(Tf, Te));
|
||||
Tl = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
Tm = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
Tk = LDW(&(W[TWVL * 14]));
|
||||
Tn = VZMULJ(Tk, VFMACONJ(Tm, Tl));
|
||||
Ts = LDW(&(W[TWVL * 16]));
|
||||
Tt = VZMULIJ(Ts, VFNMSCONJ(Tm, Tl));
|
||||
/* Sums and differences of the twiddled terms. */
T3 = VFMACONJ(T2, T1);
|
||||
To = VSUB(Tj, Tn);
|
||||
TU = VADD(Tr, Tt);
|
||||
Th = VSUB(Tc, Tg);
|
||||
TT = VADD(Tw, Ty);
|
||||
TR = VADD(Tj, Tn);
|
||||
Tz = VSUB(Tw, Ty);
|
||||
Tu = VSUB(Tr, Tt);
|
||||
TQ = VADD(Tc, Tg);
|
||||
T4 = LDW(&(W[TWVL * 8]));
|
||||
T7 = VZMULIJ(T4, VFNMSCONJ(T6, T5));
|
||||
/* Values handed to the output stage below. */
T8 = VSUB(T3, T7);
|
||||
T11 = VSUB(TQ, TR);
|
||||
T12 = VSUB(TU, TT);
|
||||
TG = VADD(Tz, Tu);
|
||||
TH = VADD(Th, To);
|
||||
TP = VADD(T3, T7);
|
||||
Tp = VSUB(Th, To);
|
||||
TA = VSUB(Tu, Tz);
|
||||
TB = VADD(Tp, TA);
|
||||
TS = VADD(TQ, TR);
|
||||
TV = VADD(TT, TU);
|
||||
TW = VADD(TS, TV);
|
||||
}
|
||||
/* Outputs built from T8 / TB and from TP / TW. */
TC = VMUL(LDK(KP500000000), VADD(T8, TB));
|
||||
ST(&(Rp[0]), TC, ms, &(Rp[0]));
|
||||
TX = VCONJ(VMUL(LDK(KP500000000), VADD(TP, TW)));
|
||||
ST(&(Rm[WS(rs, 4)]), TX, -ms, &(Rm[0]));
|
||||
/* Outputs Rm[1], Rp[4], Rp[2], Rm[3], built from T8, Tp, TA, TG, TH. */
TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TH, TG));
|
||||
TM = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TG, TH));
|
||||
TD = VFNMS(LDK(KP250000000), TB, T8);
|
||||
TE = VSUB(Tp, TA);
|
||||
TF = VFNMS(LDK(KP559016994), TE, TD);
|
||||
TL = VFMA(LDK(KP559016994), TE, TD);
|
||||
TJ = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TI, TF)));
|
||||
ST(&(Rm[WS(rs, 1)]), TJ, -ms, &(Rm[WS(rs, 1)]));
|
||||
TO = VMUL(LDK(KP500000000), VFMAI(TM, TL));
|
||||
ST(&(Rp[WS(rs, 4)]), TO, ms, &(Rp[0]));
|
||||
TK = VMUL(LDK(KP500000000), VFMAI(TI, TF));
|
||||
ST(&(Rp[WS(rs, 2)]), TK, ms, &(Rp[0]));
|
||||
TN = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TM, TL)));
|
||||
ST(&(Rm[WS(rs, 3)]), TN, -ms, &(Rm[WS(rs, 1)]));
|
||||
/* Outputs Rp[1], Rm[2], Rm[0], Rp[3], built from TP, TS, TV, T11, T12. */
T13 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T12, T11));
|
||||
T17 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T11, T12));
|
||||
TY = VFNMS(LDK(KP250000000), TW, TP);
|
||||
TZ = VSUB(TS, TV);
|
||||
T10 = VFMA(LDK(KP559016994), TZ, TY);
|
||||
T16 = VFNMS(LDK(KP559016994), TZ, TY);
|
||||
T14 = VMUL(LDK(KP500000000), VFNMSI(T13, T10));
|
||||
ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
|
||||
T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16)));
|
||||
ST(&(Rm[WS(rs, 2)]), T19, -ms, &(Rm[0]));
|
||||
T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, T10)));
|
||||
ST(&(Rm[0]), T15, -ms, &(Rm[0]));
|
||||
T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16));
|
||||
ST(&(Rp[WS(rs, 3)]), T18, ms, &(Rp[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for hc2cfdftv_10: VTW(1, k) for k = 1..9, closed
 * by a TW_NEXT entry. Nine entries match the nine LDW reads the kernel
 * performs per iteration.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: leading 10 (presumably the transform size, matching the
 * generator's -n 10), the codelet name string, the twiddle table, &GENUS, and
 * the counts { 33, 32, 28, 0 }. The first three equal the additions /
 * multiplications / fused multiply-adds in the generator's operation-count
 * comment for this variant; the role of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cfdftv_10"), twinstr, &GENUS, { 33, 32, 28, 0 } };
|
||||
|
||||
/*
 * Registration hook (FMA build): passes the hc2cfdftv_10 kernel and its
 * descriptor to the planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdftv_10, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dit -name hc2cfdftv_10 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 61 FP additions, 38 FP multiplications,
|
||||
* (or, 55 additions, 32 multiplications, 6 fused multiply/add),
|
||||
* 82 stack variables, 5 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_10 -- size-10 SIMD kernel, non-FMA variant (the generator
 * command line recorded above has no -fma flag); registered below as an
 * hc2c codelet.
 *
 * Each loop iteration loads five vectors from Rp and five from Rm
 * (WS(rs, 0..4)), conjugates the Rm values with VCONJ, forms VADD/VSUB
 * combinations of each pair, multiplies most of them by twiddle vectors
 * read from W (VZMULJ / VZMULIJ), and finally stores five results to Rp
 * and five VCONJ'ed results to Rm.
 *
 * Rp, Rm  arrays that are both read (LD) and overwritten (ST).
 * Ip, Im  only stepped by the loop header; the visible body never hands
 *         them to LD or ST.
 * W       twiddle table read through LDW; advanced by TWVL * 18 per
 *         iteration after an initial offset derived from mb.
 * rs      stride passed to WS() to address elements within Rp/Rm.
 * mb, me  loop bounds: m starts at mb and runs while m < me, step VL.
 * ms      pointer step per iteration (scaled by VL) and the stride
 *         argument given to LD/ST.
 */
static void hc2cfdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants declared with DVK; used below through LDK(). */
DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
|
||||
DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
|
||||
DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* Rp/Ip walk forward and Rm/Im walk backward by VL * ms per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
V Tl, Tt, Tu, TY, TZ, T10, Tz, TE, TF, TV, TW, TX, Ta, TU, TN;
|
||||
V TR, TH, TQ, TK, TL, TM, TI, TG, TJ, TT, TO, TP, TS, T18, T1c;
|
||||
V T12, T1b, T15, T16, T17, T14, T11, T13, T1e, T19, T1a, T1d;
|
||||
/* Input stage: loads, conjugation of the Rm values, twiddle multiplies. */
{
|
||||
V T1, T3, Ty, T8, T7, TB, Tf, Ts, Tk, Tw, Tq, TD, T2, Tx, T6;
|
||||
V TA, Tc, Te, Td, Tb, Tr, Tj, Ti, Th, Tg, Tv, Tn, Tp, To, Tm;
|
||||
V TC, T4, T9, T5;
|
||||
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VCONJ(T2);
|
||||
Tx = LDW(&(W[0]));
|
||||
Ty = VZMULIJ(Tx, VSUB(T3, T1));
|
||||
T8 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T7 = VCONJ(T6);
|
||||
TA = LDW(&(W[TWVL * 6]));
|
||||
TB = VZMULJ(TA, VADD(T7, T8));
|
||||
Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Te = VCONJ(Td);
|
||||
Tb = LDW(&(W[TWVL * 2]));
|
||||
Tf = VZMULJ(Tb, VADD(Tc, Te));
|
||||
Tr = LDW(&(W[TWVL * 4]));
|
||||
Ts = VZMULIJ(Tr, VSUB(Te, Tc));
|
||||
Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Th = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ti = VCONJ(Th);
|
||||
Tg = LDW(&(W[TWVL * 12]));
|
||||
Tk = VZMULIJ(Tg, VSUB(Ti, Tj));
|
||||
Tv = LDW(&(W[TWVL * 10]));
|
||||
Tw = VZMULJ(Tv, VADD(Ti, Tj));
|
||||
Tn = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
To = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
Tp = VCONJ(To);
|
||||
Tm = LDW(&(W[TWVL * 14]));
|
||||
Tq = VZMULJ(Tm, VADD(Tn, Tp));
|
||||
TC = LDW(&(W[TWVL * 16]));
|
||||
TD = VZMULIJ(TC, VSUB(Tp, Tn));
|
||||
Tl = VSUB(Tf, Tk);
|
||||
Tt = VSUB(Tq, Ts);
|
||||
Tu = VADD(Tl, Tt);
|
||||
TY = VADD(Ty, Tw);
|
||||
TZ = VADD(TB, TD);
|
||||
T10 = VADD(TY, TZ);
|
||||
Tz = VSUB(Tw, Ty);
|
||||
TE = VSUB(TB, TD);
|
||||
TF = VADD(Tz, TE);
|
||||
TV = VADD(Tf, Tk);
|
||||
TW = VADD(Ts, Tq);
|
||||
TX = VADD(TV, TW);
|
||||
T4 = VADD(T1, T3);
|
||||
T5 = LDW(&(W[TWVL * 8]));
|
||||
T9 = VZMULIJ(T5, VSUB(T7, T8));
|
||||
Ta = VSUB(T4, T9);
|
||||
TU = VADD(T4, T9);
|
||||
}
|
||||
/* First output group, built from Ta, Tu, TF, Tl, Tt, Tz, TE: Rm[4], Rm[2], Rp[1], Rm[0], Rp[3]. */
TL = VSUB(Tl, Tt);
|
||||
TM = VSUB(TE, Tz);
|
||||
TN = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM))));
|
||||
TR = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), TL, VMUL(LDK(KP951056516), TM))));
|
||||
TI = VMUL(LDK(KP279508497), VSUB(Tu, TF));
|
||||
TG = VADD(Tu, TF);
|
||||
TJ = VFNMS(LDK(KP125000000), TG, VMUL(LDK(KP500000000), Ta));
|
||||
TH = VCONJ(VMUL(LDK(KP500000000), VADD(Ta, TG)));
|
||||
TQ = VSUB(TJ, TI);
|
||||
TK = VADD(TI, TJ);
|
||||
ST(&(Rm[WS(rs, 4)]), TH, -ms, &(Rm[0]));
|
||||
TT = VCONJ(VADD(TQ, TR));
|
||||
ST(&(Rm[WS(rs, 2)]), TT, -ms, &(Rm[0]));
|
||||
TO = VSUB(TK, TN);
|
||||
ST(&(Rp[WS(rs, 1)]), TO, ms, &(Rp[WS(rs, 1)]));
|
||||
TP = VCONJ(VADD(TK, TN));
|
||||
ST(&(Rm[0]), TP, -ms, &(Rm[0]));
|
||||
TS = VSUB(TQ, TR);
|
||||
ST(&(Rp[WS(rs, 3)]), TS, ms, &(Rp[WS(rs, 1)]));
|
||||
/* Second output group, built from TU, TX, T10, TV, TW, TY, TZ: Rp[0], Rp[4], Rm[1], Rp[2], Rm[3]. */
T16 = VSUB(TZ, TY);
|
||||
T17 = VSUB(TV, TW);
|
||||
T18 = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T17, VMUL(LDK(KP951056516), T16))));
|
||||
T1c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T16))));
|
||||
T14 = VMUL(LDK(KP279508497), VSUB(TX, T10));
|
||||
T11 = VADD(TX, T10);
|
||||
T13 = VFNMS(LDK(KP125000000), T11, VMUL(LDK(KP500000000), TU));
|
||||
T12 = VMUL(LDK(KP500000000), VADD(TU, T11));
|
||||
T1b = VADD(T14, T13);
|
||||
T15 = VSUB(T13, T14);
|
||||
ST(&(Rp[0]), T12, ms, &(Rp[0]));
|
||||
T1e = VADD(T1b, T1c);
|
||||
ST(&(Rp[WS(rs, 4)]), T1e, ms, &(Rp[0]));
|
||||
T19 = VCONJ(VSUB(T15, T18));
|
||||
ST(&(Rm[WS(rs, 1)]), T19, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1a = VADD(T15, T18);
|
||||
ST(&(Rp[WS(rs, 2)]), T1a, ms, &(Rp[0]));
|
||||
T1d = VCONJ(VSUB(T1b, T1c));
|
||||
ST(&(Rm[WS(rs, 3)]), T1d, -ms, &(Rm[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by `desc` below: VTW(1, k) for
 * k = 1..9, closed by a { TW_NEXT, VL, 0 } record.
 * NOTE(review): presumably one VTW entry per twiddle index 1..n-1 for
 * n = 10 -- confirm against the VTW definition.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor handed to X(khc2c_register): size 10, the name string
 * "hc2cfdftv_10", the twinstr table, the GENUS object, and the record
 * { 55, 32, 6, 0 }, whose first three numbers equal the additions,
 * multiplications and fused multiply/add counts quoted in this variant's
 * generated header comment.
 */
static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cfdftv_10"), twinstr, &GENUS, { 55, 32, 6, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_10 kernel and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdftv_10, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
330
fftw-3.3.10/rdft/simd/common/hc2cfdftv_12.c
Normal file
330
fftw-3.3.10/rdft/simd/common/hc2cfdftv_12.c
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 71 FP additions, 66 FP multiplications,
|
||||
* (or, 41 additions, 36 multiplications, 30 fused multiply/add),
|
||||
* 86 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_12 -- size-12 SIMD kernel, FMA variant (compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined); registered
 * below as an hc2c codelet.
 *
 * Each loop iteration loads six vectors from Rp and six from Rm
 * (WS(rs, 0..5)), combines every Rp/Rm pair with VFMACONJ and VFNMSCONJ,
 * multiplies all of those except the VFMACONJ combination of Rp[0]/Rm[0]
 * (T3) by twiddle vectors read from W (VZMULJ / VZMULIJ), and stores six
 * results to Rp and six VCONJ'ed results to Rm. Every stored value is
 * scaled by KP500000000.
 *
 * Rp, Rm  arrays that are both read (LD) and overwritten (ST).
 * Ip, Im  only stepped by the loop header; the visible body never hands
 *         them to LD or ST.
 * W       twiddle table read through LDW; advanced by TWVL * 22 per
 *         iteration after an initial offset derived from mb.
 * rs      stride passed to WS() to address elements within Rp/Rm.
 * mb, me  loop bounds: m starts at mb and runs while m < me, step VL.
 * ms      pointer step per iteration (scaled by VL) and the stride
 *         argument given to LD/ST.
 */
static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants declared with DVK; used below through LDK(). */
DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* Rp/Ip walk forward and Rm/Im walk backward by VL * ms per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
V Td, TQ, Tr, TR, TI, TY, TA, TX, T12, T1e, TV, T1d, TK, TL, Ts;
|
||||
V TJ, TO, TP, TM, TN, TW, T16, T13, T17, TS, TZ, T14, T19, T15, T18;
|
||||
V T1f, T1j, T1c, T1i, T1a, T1b, T1g, T1l, T1h, T1k;
|
||||
/* Input stage: loads, pair combination, twiddle multiplies, first sums. */
{
|
||||
V T3, Tu, T7, Tw, Tp, TH, Tl, TE, Th, TC, Tb, Tz, T1, T2, Tt;
|
||||
V T5, T6, T4, Tv, Tn, To, Tm, TG, Tj, Tk, Ti, TD, Tf, Tg, Te;
|
||||
V TB, T9, Ta, T8, Ty, Tc, Tq, TF, Tx, T10, T11, TT, TU;
|
||||
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VFMACONJ(T2, T1);
|
||||
Tt = LDW(&(W[0]));
|
||||
Tu = VZMULIJ(Tt, VFNMSCONJ(T2, T1));
|
||||
T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T4 = LDW(&(W[TWVL * 6]));
|
||||
T7 = VZMULJ(T4, VFMACONJ(T6, T5));
|
||||
Tv = LDW(&(W[TWVL * 8]));
|
||||
Tw = VZMULIJ(Tv, VFNMSCONJ(T6, T5));
|
||||
Tn = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tm = LDW(&(W[TWVL * 2]));
|
||||
Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
|
||||
TG = LDW(&(W[TWVL * 4]));
|
||||
TH = VZMULIJ(TG, VFNMSCONJ(To, Tn));
|
||||
Tj = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tk = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ti = LDW(&(W[TWVL * 18]));
|
||||
Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
|
||||
TD = LDW(&(W[TWVL * 20]));
|
||||
TE = VZMULIJ(TD, VFNMSCONJ(Tk, Tj));
|
||||
Tf = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Te = LDW(&(W[TWVL * 10]));
|
||||
Th = VZMULJ(Te, VFMACONJ(Tg, Tf));
|
||||
TB = LDW(&(W[TWVL * 12]));
|
||||
TC = VZMULIJ(TB, VFNMSCONJ(Tg, Tf));
|
||||
T9 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
Ta = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
T8 = LDW(&(W[TWVL * 14]));
|
||||
Tb = VZMULJ(T8, VFMACONJ(Ta, T9));
|
||||
Ty = LDW(&(W[TWVL * 16]));
|
||||
Tz = VZMULIJ(Ty, VFNMSCONJ(Ta, T9));
|
||||
Tc = VADD(T7, Tb);
|
||||
Td = VADD(T3, Tc);
|
||||
TQ = VFNMS(LDK(KP500000000), Tc, T3);
|
||||
Tq = VADD(Tl, Tp);
|
||||
Tr = VADD(Th, Tq);
|
||||
TR = VFNMS(LDK(KP500000000), Tq, Th);
|
||||
TF = VADD(TC, TE);
|
||||
TI = VADD(TF, TH);
|
||||
TY = VFNMS(LDK(KP500000000), TF, TH);
|
||||
Tx = VADD(Tu, Tw);
|
||||
TA = VADD(Tx, Tz);
|
||||
TX = VFNMS(LDK(KP500000000), Tx, Tz);
|
||||
T10 = VSUB(Tb, T7);
|
||||
T11 = VSUB(Tp, Tl);
|
||||
T12 = VSUB(T10, T11);
|
||||
T1e = VADD(T10, T11);
|
||||
TT = VSUB(TC, TE);
|
||||
TU = VSUB(Tu, Tw);
|
||||
TV = VSUB(TT, TU);
|
||||
T1d = VADD(TU, TT);
|
||||
}
|
||||
/* Outputs built from Td, Tr, TA, TI: Rp[3], Rm[2], Rp[0], Rm[5]. */
Ts = VSUB(Td, Tr);
|
||||
TJ = VSUB(TA, TI);
|
||||
TK = VMUL(LDK(KP500000000), VFMAI(TJ, Ts));
|
||||
TL = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TJ, Ts)));
|
||||
ST(&(Rp[WS(rs, 3)]), TK, ms, &(Rp[WS(rs, 1)]));
|
||||
ST(&(Rm[WS(rs, 2)]), TL, -ms, &(Rm[0]));
|
||||
TM = VADD(Td, Tr);
|
||||
TN = VADD(TA, TI);
|
||||
TO = VMUL(LDK(KP500000000), VSUB(TM, TN));
|
||||
TP = VCONJ(VMUL(LDK(KP500000000), VADD(TN, TM)));
|
||||
ST(&(Rp[0]), TO, ms, &(Rp[0]));
|
||||
ST(&(Rm[WS(rs, 5)]), TP, -ms, &(Rm[WS(rs, 1)]));
|
||||
/* Outputs built from TQ-TR, TX-TY, TV, T12: Rp[1], Rm[4], Rm[0], Rp[5]. */
TS = VSUB(TQ, TR);
|
||||
TW = VFMA(LDK(KP866025403), TV, TS);
|
||||
T16 = VFNMS(LDK(KP866025403), TV, TS);
|
||||
TZ = VSUB(TX, TY);
|
||||
T13 = VFNMS(LDK(KP866025403), T12, TZ);
|
||||
T17 = VFMA(LDK(KP866025403), T12, TZ);
|
||||
T14 = VMUL(LDK(KP500000000), VFNMSI(T13, TW));
|
||||
ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
|
||||
T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16)));
|
||||
ST(&(Rm[WS(rs, 4)]), T19, -ms, &(Rm[0]));
|
||||
T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, TW)));
|
||||
ST(&(Rm[0]), T15, -ms, &(Rm[0]));
|
||||
T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16));
|
||||
ST(&(Rp[WS(rs, 5)]), T18, ms, &(Rp[WS(rs, 1)]));
|
||||
/* Outputs built from TQ+TR, TX+TY, T1d, T1e: Rm[1], Rp[4], Rp[2], Rm[3]. */
T1f = VMUL(LDK(KP866025403), VSUB(T1d, T1e));
|
||||
T1j = VMUL(LDK(KP866025403), VADD(T1d, T1e));
|
||||
T1a = VADD(TX, TY);
|
||||
T1b = VADD(TQ, TR);
|
||||
T1c = VADD(T1a, T1b);
|
||||
T1i = VSUB(T1b, T1a);
|
||||
T1g = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1f, T1c)));
|
||||
ST(&(Rm[WS(rs, 1)]), T1g, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1l = VMUL(LDK(KP500000000), VFMAI(T1j, T1i));
|
||||
ST(&(Rp[WS(rs, 4)]), T1l, ms, &(Rp[0]));
|
||||
T1h = VMUL(LDK(KP500000000), VFMAI(T1f, T1c));
|
||||
ST(&(Rp[WS(rs, 2)]), T1h, ms, &(Rp[0]));
|
||||
T1k = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1j, T1i)));
|
||||
ST(&(Rm[WS(rs, 3)]), T1k, -ms, &(Rm[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by `desc` below: VTW(1, k) for
 * k = 1..11, closed by a { TW_NEXT, VL, 0 } record.
 * NOTE(review): presumably one VTW entry per twiddle index 1..n-1 for
 * n = 12 -- confirm against the VTW definition.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor handed to X(khc2c_register): size 12, the name string
 * "hc2cfdftv_12", the twinstr table, the GENUS object, and the record
 * { 41, 36, 30, 0 }, whose first three numbers equal the additions,
 * multiplications and fused multiply/add counts quoted in this variant's
 * generated header comment.
 */
static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, { 41, 36, 30, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_12 kernel and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 71 FP additions, 41 FP multiplications,
|
||||
* (or, 67 additions, 37 multiplications, 4 fused multiply/add),
|
||||
* 58 stack variables, 4 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_12 -- size-12 SIMD kernel, non-FMA variant (the generator
 * command line recorded above has no -fma flag); registered below as an
 * hc2c codelet.
 *
 * Each loop iteration loads six vectors from Rp and six from Rm
 * (WS(rs, 0..5)), conjugates the Rm values with VCONJ, forms VADD/VSUB
 * combinations of each pair, multiplies most of them by twiddle vectors
 * read from W (VZMULJ / VZMULIJ), and stores six results to Rp and six
 * VCONJ'ed results to Rm.
 *
 * Rp, Rm  arrays that are both read (LD) and overwritten (ST).
 * Ip, Im  only stepped by the loop header; the visible body never hands
 *         them to LD or ST.
 * W       twiddle table read through LDW; advanced by TWVL * 22 per
 *         iteration after an initial offset derived from mb.
 * rs      stride passed to WS() to address elements within Rp/Rm.
 * mb, me  loop bounds: m starts at mb and runs while m < me, step VL.
 * ms      pointer step per iteration (scaled by VL) and the stride
 *         argument given to LD/ST.
 */
static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants declared with DVK; used below through LDK(). */
DVK(KP433012701, +0.433012701892219323381861585376468091735701313);
|
||||
DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* Rp/Ip walk forward and Rm/Im walk backward by VL * ms per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
V TX, T13, T4, Tf, TZ, TD, TF, T17, TW, T14, Tw, Tl, T10, TL, TN;
|
||||
V T16;
|
||||
/* Input stage: loads, conjugation of the Rm values, twiddle multiplies. */
{
|
||||
V T1, T3, TA, Tb, Td, Te, T9, TC, T2, Tz, Tc, Ta, T6, T8, T7;
|
||||
V T5, TB, TE, Ti, Tk, TI, Ts, Tu, Tv, Tq, TK, Tj, TH, Tt, Tr;
|
||||
V Tn, Tp, To, Tm, TJ, Th, TM;
|
||||
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VCONJ(T2);
|
||||
Tz = LDW(&(W[0]));
|
||||
TA = VZMULIJ(Tz, VSUB(T3, T1));
|
||||
Tb = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
Tc = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
Td = VCONJ(Tc);
|
||||
Ta = LDW(&(W[TWVL * 14]));
|
||||
Te = VZMULJ(Ta, VADD(Tb, Td));
|
||||
T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T8 = VCONJ(T7);
|
||||
T5 = LDW(&(W[TWVL * 6]));
|
||||
T9 = VZMULJ(T5, VADD(T6, T8));
|
||||
TB = LDW(&(W[TWVL * 8]));
|
||||
TC = VZMULIJ(TB, VSUB(T8, T6));
|
||||
TX = VSUB(TC, TA);
|
||||
T13 = VSUB(Te, T9);
|
||||
T4 = VADD(T1, T3);
|
||||
Tf = VADD(T9, Te);
|
||||
TZ = VFNMS(LDK(KP250000000), Tf, VMUL(LDK(KP500000000), T4));
|
||||
TD = VADD(TA, TC);
|
||||
TE = LDW(&(W[TWVL * 16]));
|
||||
TF = VZMULIJ(TE, VSUB(Td, Tb));
|
||||
T17 = VFNMS(LDK(KP500000000), TD, TF);
|
||||
Ti = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tk = VCONJ(Tj);
|
||||
TH = LDW(&(W[TWVL * 12]));
|
||||
TI = VZMULIJ(TH, VSUB(Tk, Ti));
|
||||
Ts = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tu = VCONJ(Tt);
|
||||
Tr = LDW(&(W[TWVL * 2]));
|
||||
Tv = VZMULJ(Tr, VADD(Ts, Tu));
|
||||
Tn = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
To = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tp = VCONJ(To);
|
||||
Tm = LDW(&(W[TWVL * 18]));
|
||||
Tq = VZMULJ(Tm, VADD(Tn, Tp));
|
||||
TJ = LDW(&(W[TWVL * 20]));
|
||||
TK = VZMULIJ(TJ, VSUB(Tp, Tn));
|
||||
TW = VSUB(TK, TI);
|
||||
T14 = VSUB(Tv, Tq);
|
||||
Tw = VADD(Tq, Tv);
|
||||
Th = LDW(&(W[TWVL * 10]));
|
||||
Tl = VZMULJ(Th, VADD(Ti, Tk));
|
||||
T10 = VFNMS(LDK(KP250000000), Tw, VMUL(LDK(KP500000000), Tl));
|
||||
TL = VADD(TI, TK);
|
||||
TM = LDW(&(W[TWVL * 4]));
|
||||
TN = VZMULIJ(TM, VSUB(Tu, Ts));
|
||||
T16 = VFNMS(LDK(KP500000000), TL, TN);
|
||||
}
|
||||
/* Output stage: combine the intermediates and store six values to Rp and six VCONJ'ed values to Rm. */
{
|
||||
V Ty, TS, TP, TT, Tg, Tx, TG, TO, TQ, TV, TR, TU, T1i, T1o, T1l;
|
||||
V T1p, T1g, T1h, T1j, T1k, T1m, T1r, T1n, T1q, T12, T1c, T19, T1d, TY, T11;
|
||||
V T15, T18, T1a, T1f, T1b, T1e;
|
||||
Tg = VADD(T4, Tf);
|
||||
Tx = VADD(Tl, Tw);
|
||||
Ty = VADD(Tg, Tx);
|
||||
TS = VSUB(Tg, Tx);
|
||||
TG = VADD(TD, TF);
|
||||
TO = VADD(TL, TN);
|
||||
TP = VADD(TG, TO);
|
||||
TT = VBYI(VSUB(TO, TG));
|
||||
TQ = VCONJ(VMUL(LDK(KP500000000), VSUB(Ty, TP)));
|
||||
ST(&(Rm[WS(rs, 5)]), TQ, -ms, &(Rm[WS(rs, 1)]));
|
||||
TV = VMUL(LDK(KP500000000), VADD(TS, TT));
|
||||
ST(&(Rp[WS(rs, 3)]), TV, ms, &(Rp[WS(rs, 1)]));
|
||||
TR = VMUL(LDK(KP500000000), VADD(Ty, TP));
|
||||
ST(&(Rp[0]), TR, ms, &(Rp[0]));
|
||||
TU = VCONJ(VMUL(LDK(KP500000000), VSUB(TS, TT)));
|
||||
ST(&(Rm[WS(rs, 2)]), TU, -ms, &(Rm[0]));
|
||||
T1g = VADD(TX, TW);
|
||||
T1h = VADD(T13, T14);
|
||||
T1i = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(T1g, T1h))));
|
||||
T1o = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VADD(T1g, T1h))));
|
||||
T1j = VADD(TZ, T10);
|
||||
T1k = VMUL(LDK(KP500000000), VADD(T17, T16));
|
||||
T1l = VSUB(T1j, T1k);
|
||||
T1p = VADD(T1j, T1k);
|
||||
T1m = VADD(T1i, T1l);
|
||||
ST(&(Rp[WS(rs, 2)]), T1m, ms, &(Rp[0]));
|
||||
T1r = VCONJ(VSUB(T1p, T1o));
|
||||
ST(&(Rm[WS(rs, 3)]), T1r, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1n = VCONJ(VSUB(T1l, T1i));
|
||||
ST(&(Rm[WS(rs, 1)]), T1n, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1q = VADD(T1o, T1p);
|
||||
ST(&(Rp[WS(rs, 4)]), T1q, ms, &(Rp[0]));
|
||||
TY = VMUL(LDK(KP433012701), VSUB(TW, TX));
|
||||
T11 = VSUB(TZ, T10);
|
||||
T12 = VADD(TY, T11);
|
||||
T1c = VSUB(T11, TY);
|
||||
T15 = VMUL(LDK(KP866025403), VSUB(T13, T14));
|
||||
T18 = VSUB(T16, T17);
|
||||
T19 = VMUL(LDK(KP500000000), VBYI(VSUB(T15, T18)));
|
||||
T1d = VMUL(LDK(KP500000000), VBYI(VADD(T15, T18)));
|
||||
T1a = VCONJ(VSUB(T12, T19));
|
||||
ST(&(Rm[0]), T1a, -ms, &(Rm[0]));
|
||||
T1f = VCONJ(VADD(T1c, T1d));
|
||||
ST(&(Rm[WS(rs, 4)]), T1f, -ms, &(Rm[0]));
|
||||
T1b = VADD(T12, T19);
|
||||
ST(&(Rp[WS(rs, 1)]), T1b, ms, &(Rp[WS(rs, 1)]));
|
||||
T1e = VSUB(T1c, T1d);
|
||||
ST(&(Rp[WS(rs, 5)]), T1e, ms, &(Rp[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by `desc` below: VTW(1, k) for
 * k = 1..11, closed by a { TW_NEXT, VL, 0 } record.
 * NOTE(review): presumably one VTW entry per twiddle index 1..n-1 for
 * n = 12 -- confirm against the VTW definition.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor handed to X(khc2c_register): size 12, the name string
 * "hc2cfdftv_12", the twinstr table, the GENUS object, and the record
 * { 67, 37, 4, 0 }, whose first three numbers equal the additions,
 * multiplications and fused multiply/add counts quoted in this variant's
 * generated header comment.
 */
static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, { 67, 37, 4, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_12 kernel and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
432
fftw-3.3.10/rdft/simd/common/hc2cfdftv_16.c
Normal file
432
fftw-3.3.10/rdft/simd/common/hc2cfdftv_16.c
Normal file
@@ -0,0 +1,432 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 103 FP additions, 96 FP multiplications,
|
||||
* (or, 53 additions, 46 multiplications, 50 fused multiply/add),
|
||||
* 92 stack variables, 4 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_16 -- size-16 SIMD kernel, FMA variant (compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined); registered
 * below as an hc2c codelet.
 *
 * Each loop iteration loads eight vectors from Rp and eight from Rm
 * (WS(rs, 0..7)), combines every Rp/Rm pair with VFMACONJ and VFNMSCONJ,
 * multiplies all of those except the VFMACONJ combination of Rp[0]/Rm[0]
 * (T3) by twiddle vectors read from W (VZMULJ / VZMULIJ), and stores
 * eight results to Rp and eight VCONJ'ed results to Rm. Every stored
 * value is scaled by KP500000000.
 *
 * Rp, Rm  arrays that are both read (LD) and overwritten (ST).
 * Ip, Im  only stepped by the loop header; the visible body never hands
 *         them to LD or ST.
 * W       twiddle table read through LDW; advanced by TWVL * 30 per
 *         iteration after an initial offset derived from mb.
 * rs      stride passed to WS() to address elements within Rp/Rm.
 * mb, me  loop bounds: m starts at mb and runs while m < me, step VL.
 * ms      pointer step per iteration (scaled by VL) and the stride
 *         argument given to LD/ST.
 */
static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants declared with DVK; used below through LDK(). */
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
{
|
||||
INT m;
|
||||
/* Rp/Ip walk forward and Rm/Im walk backward by VL * ms per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
V T8, TZ, TH, T12, T1q, T1I, T1x, T1J, Tr, T10, T1A, T1K, TS, T13, T1t;
|
||||
V T1N, T3, Tw, TF, TW, T7, Tu, TB, TY, T1, T2, Tv, TD, TE, TC;
|
||||
V TV, T5, T6, T4, Tt, Tz, TA, Ty, TX, Tx, TG, T1o, T1p, T1v, T1w;
|
||||
V T1C, T1D, T1u, T1B, T1G, T1H, T1E, T1F;
|
||||
/* Inputs at WS(rs, 0), 2, 4, 6: loads, pair combination, twiddle multiplies. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VFMACONJ(T2, T1);
|
||||
Tv = LDW(&(W[0]));
|
||||
Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
|
||||
TD = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
TC = LDW(&(W[TWVL * 8]));
|
||||
TF = VZMULIJ(TC, VFNMSCONJ(TE, TD));
|
||||
TV = LDW(&(W[TWVL * 6]));
|
||||
TW = VZMULJ(TV, VFMACONJ(TE, TD));
|
||||
T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
T4 = LDW(&(W[TWVL * 14]));
|
||||
T7 = VZMULJ(T4, VFMACONJ(T6, T5));
|
||||
Tt = LDW(&(W[TWVL * 16]));
|
||||
Tu = VZMULIJ(Tt, VFNMSCONJ(T6, T5));
|
||||
Tz = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
Ty = LDW(&(W[TWVL * 24]));
|
||||
TB = VZMULIJ(Ty, VFNMSCONJ(TA, Tz));
|
||||
TX = LDW(&(W[TWVL * 22]));
|
||||
TY = VZMULJ(TX, VFMACONJ(TA, Tz));
|
||||
T8 = VSUB(T3, T7);
|
||||
TZ = VSUB(TW, TY);
|
||||
Tx = VSUB(Tu, Tw);
|
||||
TG = VSUB(TB, TF);
|
||||
TH = VFNMS(LDK(KP414213562), TG, Tx);
|
||||
T12 = VFMA(LDK(KP414213562), Tx, TG);
|
||||
T1o = VADD(T3, T7);
|
||||
T1p = VADD(TW, TY);
|
||||
T1q = VADD(T1o, T1p);
|
||||
T1I = VSUB(T1o, T1p);
|
||||
T1v = VADD(Tw, Tu);
|
||||
T1w = VADD(TF, TB);
|
||||
T1x = VADD(T1v, T1w);
|
||||
T1J = VSUB(T1w, T1v);
|
||||
/* Inputs at WS(rs, 1), 3, 5, 7: loads, pair combination, twiddle multiplies. */
{
|
||||
V Tc, TQ, Tp, TJ, Tg, TO, Tl, TL, Ta, Tb, T9, TP, Tn, To, Tm;
|
||||
V TI, Te, Tf, Td, TN, Tj, Tk, Ti, TK, Th, Tq, T1y, T1z, TM, TR;
|
||||
V T1r, T1s;
|
||||
Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T9 = LDW(&(W[TWVL * 2]));
|
||||
Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
|
||||
TP = LDW(&(W[TWVL * 4]));
|
||||
TQ = VZMULIJ(TP, VFNMSCONJ(Tb, Ta));
|
||||
Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
To = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tm = LDW(&(W[TWVL * 10]));
|
||||
Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
|
||||
TI = LDW(&(W[TWVL * 12]));
|
||||
TJ = VZMULIJ(TI, VFNMSCONJ(To, Tn));
|
||||
Te = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tf = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Td = LDW(&(W[TWVL * 18]));
|
||||
Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
|
||||
TN = LDW(&(W[TWVL * 20]));
|
||||
TO = VZMULIJ(TN, VFNMSCONJ(Tf, Te));
|
||||
Tj = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tk = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ti = LDW(&(W[TWVL * 26]));
|
||||
Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
|
||||
TK = LDW(&(W[TWVL * 28]));
|
||||
TL = VZMULIJ(TK, VFNMSCONJ(Tk, Tj));
|
||||
Th = VSUB(Tc, Tg);
|
||||
Tq = VSUB(Tl, Tp);
|
||||
Tr = VADD(Th, Tq);
|
||||
T10 = VSUB(Tq, Th);
|
||||
T1y = VADD(TQ, TO);
|
||||
T1z = VADD(TL, TJ);
|
||||
T1A = VADD(T1y, T1z);
|
||||
T1K = VSUB(T1y, T1z);
|
||||
TM = VSUB(TJ, TL);
|
||||
TR = VSUB(TO, TQ);
|
||||
TS = VFMA(LDK(KP414213562), TR, TM);
|
||||
T13 = VFNMS(LDK(KP414213562), TM, TR);
|
||||
T1r = VADD(Tc, Tg);
|
||||
T1s = VADD(Tl, Tp);
|
||||
T1t = VADD(T1r, T1s);
|
||||
T1N = VSUB(T1s, T1r);
|
||||
}
|
||||
/* Outputs built from T1q, T1t, T1x, T1A: Rp[4], Rm[3], Rp[0], Rm[7]. */
T1u = VSUB(T1q, T1t);
|
||||
T1B = VSUB(T1x, T1A);
|
||||
T1C = VMUL(LDK(KP500000000), VFMAI(T1B, T1u));
|
||||
T1D = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1B, T1u)));
|
||||
ST(&(Rp[WS(rs, 4)]), T1C, ms, &(Rp[0]));
|
||||
ST(&(Rm[WS(rs, 3)]), T1D, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1E = VADD(T1q, T1t);
|
||||
T1F = VADD(T1x, T1A);
|
||||
T1G = VMUL(LDK(KP500000000), VSUB(T1E, T1F));
|
||||
T1H = VCONJ(VMUL(LDK(KP500000000), VADD(T1F, T1E)));
|
||||
ST(&(Rp[0]), T1G, ms, &(Rp[0]));
|
||||
ST(&(Rm[WS(rs, 7)]), T1H, -ms, &(Rm[WS(rs, 1)]));
|
||||
/* Remaining twelve outputs: Rp[1], Rp[2], Rp[3], Rp[5], Rp[6], Rp[7] and Rm[0], Rm[1], Rm[2], Rm[4], Rm[5], Rm[6]. */
{
|
||||
V T1M, T1S, T1P, T1T, T1L, T1O, T1Q, T1V, T1R, T1U, TU, T18, T15, T19, Ts;
|
||||
V TT, T11, T14, T16, T1b, T17, T1a, T1e, T1k, T1h, T1l, T1c, T1d, T1f, T1g;
|
||||
V T1i, T1n, T1j, T1m;
|
||||
T1L = VADD(T1J, T1K);
|
||||
T1M = VFMA(LDK(KP707106781), T1L, T1I);
|
||||
T1S = VFNMS(LDK(KP707106781), T1L, T1I);
|
||||
T1O = VSUB(T1K, T1J);
|
||||
T1P = VFMA(LDK(KP707106781), T1O, T1N);
|
||||
T1T = VFNMS(LDK(KP707106781), T1O, T1N);
|
||||
T1Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1P, T1M)));
|
||||
ST(&(Rm[WS(rs, 1)]), T1Q, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1T, T1S)));
|
||||
ST(&(Rm[WS(rs, 5)]), T1V, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1R = VMUL(LDK(KP500000000), VFMAI(T1P, T1M));
|
||||
ST(&(Rp[WS(rs, 2)]), T1R, ms, &(Rp[0]));
|
||||
T1U = VMUL(LDK(KP500000000), VFNMSI(T1T, T1S));
|
||||
ST(&(Rp[WS(rs, 6)]), T1U, ms, &(Rp[0]));
|
||||
Ts = VFMA(LDK(KP707106781), Tr, T8);
|
||||
TT = VADD(TH, TS);
|
||||
TU = VFMA(LDK(KP923879532), TT, Ts);
|
||||
T18 = VFNMS(LDK(KP923879532), TT, Ts);
|
||||
T11 = VFNMS(LDK(KP707106781), T10, TZ);
|
||||
T14 = VADD(T12, T13);
|
||||
T15 = VFMA(LDK(KP923879532), T14, T11);
|
||||
T19 = VFNMS(LDK(KP923879532), T14, T11);
|
||||
T16 = VMUL(LDK(KP500000000), VFNMSI(T15, TU));
|
||||
ST(&(Rp[WS(rs, 1)]), T16, ms, &(Rp[WS(rs, 1)]));
|
||||
T1b = VMUL(LDK(KP500000000), VFMAI(T19, T18));
|
||||
ST(&(Rp[WS(rs, 7)]), T1b, ms, &(Rp[WS(rs, 1)]));
|
||||
T17 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T15, TU)));
|
||||
ST(&(Rm[0]), T17, -ms, &(Rm[0]));
|
||||
T1a = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T19, T18)));
|
||||
ST(&(Rm[WS(rs, 6)]), T1a, -ms, &(Rm[0]));
|
||||
T1c = VFNMS(LDK(KP707106781), Tr, T8);
|
||||
T1d = VSUB(T12, T13);
|
||||
T1e = VFMA(LDK(KP923879532), T1d, T1c);
|
||||
T1k = VFNMS(LDK(KP923879532), T1d, T1c);
|
||||
T1f = VFMA(LDK(KP707106781), T10, TZ);
|
||||
T1g = VSUB(TS, TH);
|
||||
T1h = VFMA(LDK(KP923879532), T1g, T1f);
|
||||
T1l = VFNMS(LDK(KP923879532), T1g, T1f);
|
||||
T1i = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1h, T1e)));
|
||||
ST(&(Rm[WS(rs, 2)]), T1i, -ms, &(Rm[0]));
|
||||
T1n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1l, T1k)));
|
||||
ST(&(Rm[WS(rs, 4)]), T1n, -ms, &(Rm[0]));
|
||||
T1j = VMUL(LDK(KP500000000), VFMAI(T1h, T1e));
|
||||
ST(&(Rp[WS(rs, 3)]), T1j, ms, &(Rp[WS(rs, 1)]));
|
||||
T1m = VMUL(LDK(KP500000000), VFNMSI(T1l, T1k));
|
||||
ST(&(Rp[WS(rs, 5)]), T1m, ms, &(Rp[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by `desc` below: VTW(1, k) for
 * k = 1..15, closed by a { TW_NEXT, VL, 0 } record.
 * NOTE(review): presumably one VTW entry per twiddle index 1..n-1 for
 * n = 16 -- confirm against the VTW definition.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor handed to X(khc2c_register): size 16, the name string
 * "hc2cfdftv_16", the twinstr table, the GENUS object, and the record
 * { 53, 46, 50, 0 }, whose first three numbers equal the additions,
 * multiplications and fused multiply/add counts quoted in this variant's
 * generated header comment.
 */
static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, { 53, 46, 50, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_16 kernel and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 103 FP additions, 56 FP multiplications,
|
||||
* (or, 99 additions, 52 multiplications, 4 fused multiply/add),
|
||||
* 101 stack variables, 5 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_16 -- definition found in the #else branch; its generator command
 * line (see the "Generated by" comment) has no -fma flag.
 *
 * Every pass of the loop loads one vector from each of Rp[WS(rs, 0..7)] and
 * Rm[WS(rs, 0..7)], conjugates the Rm values, multiplies sums and differences
 * of each Rp/Rm pair by twiddles read from W, and stores eight results back
 * to Rp and eight conjugated results back to Rm (the same slots it read).
 *
 * Rp, Rm : arrays read and overwritten; Rp steps forward and Rm backward by
 *          VL * ms per pass.
 * Ip, Im : stepped in the loop header but never dereferenced in this body.
 * W      : twiddle array; offset once from mb, then advanced TWVL * 30 per pass.
 * rs     : stride passed to WS() when indexing Rp/Rm.
 * mb, me : loop bounds for m (m runs from mb while m < me, in steps of VL).
 * ms     : stride passed to LD/ST and used to step the four pointers.
 */
static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants: sqrt(2)/2, sqrt(2)/4, 1/2, sin(pi/8), cos(pi/8). */
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
{
|
||||
INT m;
|
||||
/* The initialiser offsets W by (mb - 1) * ((TWVL / VL) * 30); the increment steps all pointers. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
V T1D, T1E, T1R, TP, T1b, Ta, T1w, T18, T1x, T1z, T1A, T1G, T1H, T1S, Tx;
|
||||
V T13, T10, T1a, T1, T3, TA, TM, TL, TN, T6, T8, TC, TH, TG, TI;
|
||||
V T2, Tz, TK, TJ, T7, TB, TF, TE, TD, TO, T4, T9, T5, T15, T17;
|
||||
V T14, T16;
|
||||
/* Slots 0, 6, 4, 2: load Rp and conj(Rm); differences get VZMULIJ twiddles, sums get VZMULJ twiddles. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VCONJ(T2);
|
||||
Tz = LDW(&(W[0]));
|
||||
TA = VZMULIJ(Tz, VSUB(T3, T1));
|
||||
TM = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
TL = VCONJ(TK);
|
||||
TJ = LDW(&(W[TWVL * 24]));
|
||||
TN = VZMULIJ(TJ, VSUB(TL, TM));
|
||||
T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T7 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
T8 = VCONJ(T7);
|
||||
TB = LDW(&(W[TWVL * 16]));
|
||||
TC = VZMULIJ(TB, VSUB(T8, T6));
|
||||
TH = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
TG = VCONJ(TF);
|
||||
TE = LDW(&(W[TWVL * 8]));
|
||||
TI = VZMULIJ(TE, VSUB(TG, TH));
|
||||
T1D = VADD(TA, TC);
|
||||
T1E = VADD(TI, TN);
|
||||
T1R = VSUB(T1D, T1E);
|
||||
TD = VSUB(TA, TC);
|
||||
TO = VSUB(TI, TN);
|
||||
TP = VFNMS(LDK(KP382683432), TO, VMUL(LDK(KP923879532), TD));
|
||||
T1b = VFMA(LDK(KP382683432), TD, VMUL(LDK(KP923879532), TO));
|
||||
/* The slot-0 sum T4 is used without a twiddle. */
T4 = VADD(T1, T3);
|
||||
T5 = LDW(&(W[TWVL * 14]));
|
||||
T9 = VZMULJ(T5, VADD(T6, T8));
|
||||
Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
|
||||
T1w = VADD(T4, T9);
|
||||
T14 = LDW(&(W[TWVL * 6]));
|
||||
T15 = VZMULJ(T14, VADD(TH, TG));
|
||||
T16 = LDW(&(W[TWVL * 22]));
|
||||
T17 = VZMULJ(T16, VADD(TM, TL));
|
||||
T18 = VSUB(T15, T17);
|
||||
T1x = VADD(T15, T17);
|
||||
{
|
||||
V Tf, TR, Tv, TY, Tk, TT, Tq, TW, Tc, Te, Td, Tb, TQ, Ts, Tu;
|
||||
V Tt, Tr, TX, Th, Tj, Ti, Tg, TS, Tn, Tp, To, Tm, TV, Tl, Tw;
|
||||
V TU, TZ;
|
||||
/* Slots 1, 3, 5, 7: same load / conjugate / twiddle pattern as above. */
Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Te = VCONJ(Td);
|
||||
Tb = LDW(&(W[TWVL * 2]));
|
||||
Tf = VZMULJ(Tb, VADD(Tc, Te));
|
||||
TQ = LDW(&(W[TWVL * 4]));
|
||||
TR = VZMULIJ(TQ, VSUB(Te, Tc));
|
||||
Ts = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tt = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tu = VCONJ(Tt);
|
||||
Tr = LDW(&(W[TWVL * 10]));
|
||||
Tv = VZMULJ(Tr, VADD(Ts, Tu));
|
||||
TX = LDW(&(W[TWVL * 12]));
|
||||
TY = VZMULIJ(TX, VSUB(Tu, Ts));
|
||||
Th = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Ti = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tj = VCONJ(Ti);
|
||||
Tg = LDW(&(W[TWVL * 18]));
|
||||
Tk = VZMULJ(Tg, VADD(Th, Tj));
|
||||
TS = LDW(&(W[TWVL * 20]));
|
||||
TT = VZMULIJ(TS, VSUB(Tj, Th));
|
||||
Tn = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
To = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tp = VCONJ(To);
|
||||
Tm = LDW(&(W[TWVL * 26]));
|
||||
Tq = VZMULJ(Tm, VADD(Tn, Tp));
|
||||
TV = LDW(&(W[TWVL * 28]));
|
||||
TW = VZMULIJ(TV, VSUB(Tp, Tn));
|
||||
T1z = VADD(Tf, Tk);
|
||||
T1A = VADD(Tq, Tv);
|
||||
T1G = VADD(TR, TT);
|
||||
T1H = VADD(TW, TY);
|
||||
T1S = VSUB(T1H, T1G);
|
||||
Tl = VSUB(Tf, Tk);
|
||||
Tw = VSUB(Tq, Tv);
|
||||
Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
|
||||
T13 = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
|
||||
TU = VSUB(TR, TT);
|
||||
TZ = VSUB(TW, TY);
|
||||
T10 = VFMA(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TZ));
|
||||
T1a = VFNMS(LDK(KP923879532), TU, VMUL(LDK(KP382683432), TZ));
|
||||
}
|
||||
{
|
||||
V T1U, T20, T1X, T21, T1Q, T1T, T1V, T1W, T1Y, T23, T1Z, T22, T1C, T1M, T1J;
|
||||
V T1N, T1y, T1B, T1F, T1I, T1K, T1P, T1L, T1O, T12, T1g, T1d, T1h, Ty, T11;
|
||||
V T19, T1c, T1e, T1j, T1f, T1i, T1m, T1s, T1p, T1t, T1k, T1l, T1n, T1o, T1q;
|
||||
V T1v, T1r, T1u;
|
||||
/* Output stage: every Rp and Rm slot 0..7 is written once; all Rm values pass through VCONJ first. */
T1Q = VMUL(LDK(KP500000000), VSUB(T1w, T1x));
|
||||
T1T = VMUL(LDK(KP353553390), VADD(T1R, T1S));
|
||||
T1U = VADD(T1Q, T1T);
|
||||
T20 = VSUB(T1Q, T1T);
|
||||
T1V = VSUB(T1A, T1z);
|
||||
T1W = VMUL(LDK(KP707106781), VSUB(T1S, T1R));
|
||||
T1X = VMUL(LDK(KP500000000), VBYI(VADD(T1V, T1W)));
|
||||
T21 = VMUL(LDK(KP500000000), VBYI(VSUB(T1W, T1V)));
|
||||
T1Y = VCONJ(VSUB(T1U, T1X));
|
||||
ST(&(Rm[WS(rs, 1)]), T1Y, -ms, &(Rm[WS(rs, 1)]));
|
||||
T23 = VADD(T20, T21);
|
||||
ST(&(Rp[WS(rs, 6)]), T23, ms, &(Rp[0]));
|
||||
T1Z = VADD(T1U, T1X);
|
||||
ST(&(Rp[WS(rs, 2)]), T1Z, ms, &(Rp[0]));
|
||||
T22 = VCONJ(VSUB(T20, T21));
|
||||
ST(&(Rm[WS(rs, 5)]), T22, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1y = VADD(T1w, T1x);
|
||||
T1B = VADD(T1z, T1A);
|
||||
T1C = VADD(T1y, T1B);
|
||||
T1M = VSUB(T1y, T1B);
|
||||
T1F = VADD(T1D, T1E);
|
||||
T1I = VADD(T1G, T1H);
|
||||
T1J = VADD(T1F, T1I);
|
||||
T1N = VBYI(VSUB(T1I, T1F));
|
||||
T1K = VCONJ(VMUL(LDK(KP500000000), VSUB(T1C, T1J)));
|
||||
ST(&(Rm[WS(rs, 7)]), T1K, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1P = VMUL(LDK(KP500000000), VADD(T1M, T1N));
|
||||
ST(&(Rp[WS(rs, 4)]), T1P, ms, &(Rp[0]));
|
||||
T1L = VMUL(LDK(KP500000000), VADD(T1C, T1J));
|
||||
ST(&(Rp[0]), T1L, ms, &(Rp[0]));
|
||||
T1O = VCONJ(VMUL(LDK(KP500000000), VSUB(T1M, T1N)));
|
||||
ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
|
||||
Ty = VADD(Ta, Tx);
|
||||
T11 = VMUL(LDK(KP500000000), VADD(TP, T10));
|
||||
T12 = VADD(Ty, T11);
|
||||
T1g = VSUB(Ty, T11);
|
||||
T19 = VSUB(T13, T18);
|
||||
T1c = VSUB(T1a, T1b);
|
||||
T1d = VMUL(LDK(KP500000000), VBYI(VADD(T19, T1c)));
|
||||
T1h = VMUL(LDK(KP500000000), VBYI(VSUB(T1c, T19)));
|
||||
T1e = VCONJ(VSUB(T12, T1d));
|
||||
ST(&(Rm[0]), T1e, -ms, &(Rm[0]));
|
||||
T1j = VADD(T1g, T1h);
|
||||
ST(&(Rp[WS(rs, 7)]), T1j, ms, &(Rp[WS(rs, 1)]));
|
||||
T1f = VADD(T12, T1d);
|
||||
ST(&(Rp[WS(rs, 1)]), T1f, ms, &(Rp[WS(rs, 1)]));
|
||||
T1i = VCONJ(VSUB(T1g, T1h));
|
||||
ST(&(Rm[WS(rs, 6)]), T1i, -ms, &(Rm[0]));
|
||||
T1k = VSUB(T10, TP);
|
||||
T1l = VADD(T18, T13);
|
||||
T1m = VMUL(LDK(KP500000000), VBYI(VSUB(T1k, T1l)));
|
||||
T1s = VMUL(LDK(KP500000000), VBYI(VADD(T1l, T1k)));
|
||||
T1n = VSUB(Ta, Tx);
|
||||
T1o = VMUL(LDK(KP500000000), VADD(T1b, T1a));
|
||||
T1p = VSUB(T1n, T1o);
|
||||
T1t = VADD(T1n, T1o);
|
||||
T1q = VADD(T1m, T1p);
|
||||
ST(&(Rp[WS(rs, 5)]), T1q, ms, &(Rp[WS(rs, 1)]));
|
||||
T1v = VCONJ(VSUB(T1t, T1s));
|
||||
ST(&(Rm[WS(rs, 2)]), T1v, -ms, &(Rm[0]));
|
||||
T1r = VCONJ(VSUB(T1p, T1m));
|
||||
ST(&(Rm[WS(rs, 4)]), T1r, -ms, &(Rm[0]));
|
||||
T1u = VADD(T1s, T1t);
|
||||
ST(&(Rp[WS(rs, 3)]), T1u, ms, &(Rp[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, { 99, 52, 4, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands hc2cfdftv_16 and its descriptor to the
 * planner p through khc2c_register, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_16)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
111
fftw-3.3.10/rdft/simd/common/hc2cfdftv_2.c
Normal file
111
fftw-3.3.10/rdft/simd/common/hc2cfdftv_2.c
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 2 -dit -name hc2cfdftv_2 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 5 FP additions, 6 FP multiplications,
|
||||
* (or, 3 additions, 4 multiplications, 2 fused multiply/add),
|
||||
* 9 stack variables, 1 constants, and 4 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_2, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For m running from mb while m < me (step VL) it loads one vector from
 * Rp[0] and one from Rm[0], combines them with the twiddle at W[0], and
 * writes one result back to Rp[0] and one conjugated result back to Rm[0].
 * Rp/Ip step forward and Rm/Im backward by VL * ms per pass; Ip and Im are
 * never dereferenced in this body.
 */
static void hc2cfdftv_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* One-time twiddle offset derived from the starting index mb. */
          W = W + ((mb - 1) * ((TWVL / VL) * 2));

          for (m = mb; m < me;
               m = m + VL,
               Rp = Rp + (VL * ms), Ip = Ip + (VL * ms),
               Rm = Rm - (VL * ms), Im = Im - (VL * ms),
               W = W + (TWVL * 2),
               MAKE_VOLATILE_STRIDE(8, rs)) {
               V in_p, in_m, tw, plain, twisted, out_p, out_m;

               /* Inputs and twiddle. */
               in_p = LD(&(Rp[0]), ms, &(Rp[0]));
               in_m = LD(&(Rm[0]), -ms, &(Rm[0]));
               tw = LDW(&(W[0]));

               /* Untwiddled and twiddled combinations of the pair. */
               plain = VFMACONJ(in_m, in_p);
               twisted = VZMULIJ(tw, VFNMSCONJ(in_m, in_p));

               /* Half the difference goes to Rp, the conjugated half-sum to Rm. */
               out_p = VMUL(LDK(KP500000000), VSUB(plain, twisted));
               ST(&(Rp[0]), out_p, ms, &(Rp[0]));
               out_m = VCONJ(VMUL(LDK(KP500000000), VADD(plain, twisted)));
               ST(&(Rm[0]), out_m, -ms, &(Rm[0]));
          }
     }
     VLEAVE();
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 2, XSIMD_STRING("hc2cfdftv_2"), twinstr, &GENUS, { 3, 4, 2, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands hc2cfdftv_2 and its descriptor to the
 * planner p through khc2c_register, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_2)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_2, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 2 -dit -name hc2cfdftv_2 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 5 FP additions, 4 FP multiplications,
|
||||
* (or, 5 additions, 4 multiplications, 0 fused multiply/add),
|
||||
* 10 stack variables, 1 constants, and 4 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_2, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For m running from mb while m < me (step VL) it loads one vector from
 * Rp[0] and one from Rm[0], conjugates the Rm value, forms the sum and the
 * twiddled difference, and writes the conjugated half-difference to Rm[0]
 * and the half-sum to Rp[0].  Rp/Ip step forward and Rm/Im backward by
 * VL * ms per pass; Ip and Im are never dereferenced in this body.
 */
static void hc2cfdftv_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* One-time twiddle offset derived from the starting index mb. */
          W = W + ((mb - 1) * ((TWVL / VL) * 2));

          for (m = mb; m < me;
               m = m + VL,
               Rp = Rp + (VL * ms), Ip = Ip + (VL * ms),
               Rm = Rm - (VL * ms), Im = Im - (VL * ms),
               W = W + (TWVL * 2),
               MAKE_VOLATILE_STRIDE(8, rs)) {
               V in_p, in_m, in_m_conj, tw, plain, twisted, out_m, out_p;

               /* Inputs, with the Rm side conjugated, and the twiddle. */
               in_p = LD(&(Rp[0]), ms, &(Rp[0]));
               in_m = LD(&(Rm[0]), -ms, &(Rm[0]));
               in_m_conj = VCONJ(in_m);
               tw = LDW(&(W[0]));

               /* Plain sum and twiddled difference of the pair. */
               plain = VADD(in_p, in_m_conj);
               twisted = VZMULIJ(tw, VSUB(in_m_conj, in_p));

               /* Rm is written before Rp, as in the generated order. */
               out_m = VCONJ(VMUL(LDK(KP500000000), VSUB(plain, twisted)));
               ST(&(Rm[0]), out_m, -ms, &(Rm[0]));
               out_p = VMUL(LDK(KP500000000), VADD(plain, twisted));
               ST(&(Rp[0]), out_p, ms, &(Rp[0]));
          }
     }
     VLEAVE();
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 2, XSIMD_STRING("hc2cfdftv_2"), twinstr, &GENUS, { 5, 4, 0, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands hc2cfdftv_2 and its descriptor to the
 * planner p through khc2c_register, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_2)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_2, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
551
fftw-3.3.10/rdft/simd/common/hc2cfdftv_20.c
Normal file
551
fftw-3.3.10/rdft/simd/common/hc2cfdftv_20.c
Normal file
@@ -0,0 +1,551 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 143 FP additions, 128 FP multiplications,
|
||||
* (or, 77 additions, 62 multiplications, 66 fused multiply/add),
|
||||
* 129 stack variables, 5 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_20 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generator run with -fma).
 *
 * Every pass of the loop loads one vector from each of Rp[WS(rs, 0..9)] and
 * Rm[WS(rs, 0..9)], combines each Rp/Rm pair through VFMACONJ / VFNMSCONJ,
 * multiplies by twiddles read from W (the slot-0 VFMACONJ result T3 is used
 * untwiddled), and stores ten results back to Rp and ten conjugated results
 * back to Rm (the same slots it read).
 *
 * Rp, Rm : arrays read and overwritten; Rp steps forward and Rm backward by
 *          VL * ms per pass.
 * Ip, Im : stepped in the loop header but never dereferenced in this body.
 * W      : twiddle array; offset once from mb, then advanced TWVL * 38 per pass.
 * rs     : stride passed to WS() when indexing Rp/Rm.
 * mb, me : loop bounds for m (m runs from mb while m < me, in steps of VL).
 * ms     : stride passed to LD/ST and used to step the four pointers.
 */
static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants: sqrt(5)/4, 1/4, (sqrt(5)-1)/2, sin(2*pi/5), 1/2. */
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* The initialiser offsets W by (mb - 1) * ((TWVL / VL) * 38); the increment steps all pointers. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
|
||||
V T1O, T2j, T2c, T2b, T2i, T1X, Tx, TM, TN, T1x, T1y, T1z, T1u, T1v, T1w;
|
||||
V T12, T1d, T1e, T24, T2g, Ti, T1t, T1V, T29, T26, T27, T1W, T25, T1H, T1L;
|
||||
V T1B, T1K, T1E, T1F, T1G, T1D, T1A, T1C, T1N, T1I, T1J, T1M;
|
||||
{
|
||||
V T3, T1Y, TC, T7, Tn, T1P, Tc, Tg, Tw, T1Z, TS, T1S, TL, T21, T17;
|
||||
V T1Q, T11, T22, T1c, T1T, T1, T2, Tz, T5, T6, TB, Ty, TA, T4, Ta;
|
||||
V Tb, Tk, Te, Tf, Tm, Tj, Tl, T9, Td, T20, T23, T8, Th, T1R, T1U;
|
||||
/* Slots 0, 5, 2, 7: VFNMSCONJ results get VZMULIJ twiddles, VFMACONJ results get VZMULJ twiddles. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
Ty = LDW(&(W[0]));
|
||||
Tz = VZMULIJ(Ty, VFNMSCONJ(T2, T1));
|
||||
T5 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TA = LDW(&(W[TWVL * 20]));
|
||||
TB = VZMULIJ(TA, VFNMSCONJ(T6, T5));
|
||||
T3 = VFMACONJ(T2, T1);
|
||||
T1Y = VSUB(TB, Tz);
|
||||
TC = VADD(Tz, TB);
|
||||
T4 = LDW(&(W[TWVL * 18]));
|
||||
T7 = VZMULJ(T4, VFMACONJ(T6, T5));
|
||||
Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
Tj = LDW(&(W[TWVL * 6]));
|
||||
Tk = VZMULJ(Tj, VFMACONJ(Tb, Ta));
|
||||
Te = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tf = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tl = LDW(&(W[TWVL * 26]));
|
||||
Tm = VZMULJ(Tl, VFMACONJ(Tf, Te));
|
||||
Tn = VADD(Tk, Tm);
|
||||
T1P = VSUB(Tk, Tm);
|
||||
T9 = LDW(&(W[TWVL * 8]));
|
||||
Tc = VZMULIJ(T9, VFNMSCONJ(Tb, Ta));
|
||||
Td = LDW(&(W[TWVL * 28]));
|
||||
Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
|
||||
{
|
||||
V Tr, TP, Tv, TR, Tp, Tq, To, TO, Tt, Tu, Ts, TQ, TG, T14, TK;
|
||||
V T16, TE, TF, TD, T13, TI, TJ, TH, T15, TW, T19, T10, T1b, TU, TV;
|
||||
V TT, T18, TY, TZ, TX, T1a;
|
||||
/* Slots 4, 9, 8, 3, 6, 1: same load / twiddle pattern as above. */
Tp = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
Tq = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
To = LDW(&(W[TWVL * 16]));
|
||||
Tr = VZMULIJ(To, VFNMSCONJ(Tq, Tp));
|
||||
TO = LDW(&(W[TWVL * 14]));
|
||||
TP = VZMULJ(TO, VFMACONJ(Tq, Tp));
|
||||
Tt = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tu = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ts = LDW(&(W[TWVL * 36]));
|
||||
Tv = VZMULIJ(Ts, VFNMSCONJ(Tu, Tt));
|
||||
TQ = LDW(&(W[TWVL * 34]));
|
||||
TR = VZMULJ(TQ, VFMACONJ(Tu, Tt));
|
||||
Tw = VADD(Tr, Tv);
|
||||
T1Z = VSUB(Tv, Tr);
|
||||
TS = VADD(TP, TR);
|
||||
T1S = VSUB(TP, TR);
|
||||
TE = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
|
||||
TF = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
|
||||
TD = LDW(&(W[TWVL * 30]));
|
||||
TG = VZMULJ(TD, VFMACONJ(TF, TE));
|
||||
T13 = LDW(&(W[TWVL * 32]));
|
||||
T14 = VZMULIJ(T13, VFNMSCONJ(TF, TE));
|
||||
TI = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TJ = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TH = LDW(&(W[TWVL * 10]));
|
||||
TK = VZMULJ(TH, VFMACONJ(TJ, TI));
|
||||
T15 = LDW(&(W[TWVL * 12]));
|
||||
T16 = VZMULIJ(T15, VFNMSCONJ(TJ, TI));
|
||||
TL = VADD(TG, TK);
|
||||
T21 = VSUB(T16, T14);
|
||||
T17 = VADD(T14, T16);
|
||||
T1Q = VSUB(TK, TG);
|
||||
TU = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
TV = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
TT = LDW(&(W[TWVL * 24]));
|
||||
TW = VZMULIJ(TT, VFNMSCONJ(TV, TU));
|
||||
T18 = LDW(&(W[TWVL * 22]));
|
||||
T19 = VZMULJ(T18, VFMACONJ(TV, TU));
|
||||
TY = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TZ = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TX = LDW(&(W[TWVL * 4]));
|
||||
T10 = VZMULIJ(TX, VFNMSCONJ(TZ, TY));
|
||||
T1a = LDW(&(W[TWVL * 2]));
|
||||
T1b = VZMULJ(T1a, VFMACONJ(TZ, TY));
|
||||
T11 = VADD(TW, T10);
|
||||
T22 = VSUB(T10, TW);
|
||||
T1c = VADD(T19, T1b);
|
||||
T1T = VSUB(T1b, T19);
|
||||
}
|
||||
/* Sums and differences of the twiddled terms, consumed by the output stages below. */
T1O = VSUB(T3, T7);
|
||||
T2j = VADD(T1S, T1T);
|
||||
T2c = VSUB(T21, T22);
|
||||
T2b = VSUB(T1Y, T1Z);
|
||||
T2i = VADD(T1P, T1Q);
|
||||
T1X = VSUB(Tg, Tc);
|
||||
Tx = VSUB(Tn, Tw);
|
||||
TM = VSUB(TC, TL);
|
||||
TN = VSUB(Tx, TM);
|
||||
T1x = VADD(TS, T11);
|
||||
T1y = VADD(T17, T1c);
|
||||
T1z = VADD(T1x, T1y);
|
||||
T1u = VADD(Tn, Tw);
|
||||
T1v = VADD(TC, TL);
|
||||
T1w = VADD(T1u, T1v);
|
||||
T12 = VSUB(TS, T11);
|
||||
T1d = VSUB(T17, T1c);
|
||||
T1e = VSUB(T12, T1d);
|
||||
T20 = VADD(T1Y, T1Z);
|
||||
T23 = VADD(T21, T22);
|
||||
T24 = VADD(T20, T23);
|
||||
T2g = VSUB(T23, T20);
|
||||
T8 = VADD(T3, T7);
|
||||
Th = VADD(Tc, Tg);
|
||||
Ti = VSUB(T8, Th);
|
||||
T1t = VADD(T8, Th);
|
||||
T1R = VSUB(T1P, T1Q);
|
||||
T1U = VSUB(T1S, T1T);
|
||||
T1V = VADD(T1R, T1U);
|
||||
T29 = VSUB(T1R, T1U);
|
||||
}
|
||||
/* Output stage: every Rp and Rm slot 0..9 is written once; all Rm values pass through VCONJ first. */
T1W = VADD(T1O, T1V);
|
||||
T25 = VADD(T1X, T24);
|
||||
T26 = VMUL(LDK(KP500000000), VFNMSI(T25, T1W));
|
||||
T27 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T25, T1W)));
|
||||
ST(&(Rp[WS(rs, 5)]), T26, ms, &(Rp[WS(rs, 1)]));
|
||||
ST(&(Rm[WS(rs, 4)]), T27, -ms, &(Rm[0]));
|
||||
T1F = VSUB(T1x, T1y);
|
||||
T1G = VSUB(T1u, T1v);
|
||||
T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
|
||||
T1L = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
|
||||
T1D = VSUB(T1w, T1z);
|
||||
T1A = VADD(T1w, T1z);
|
||||
T1C = VFNMS(LDK(KP250000000), T1A, T1t);
|
||||
T1B = VCONJ(VMUL(LDK(KP500000000), VADD(T1t, T1A)));
|
||||
T1K = VFMA(LDK(KP559016994), T1D, T1C);
|
||||
T1E = VFNMS(LDK(KP559016994), T1D, T1C);
|
||||
ST(&(Rm[WS(rs, 9)]), T1B, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1N = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1L, T1K)));
|
||||
ST(&(Rm[WS(rs, 5)]), T1N, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1I = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1H, T1E)));
|
||||
ST(&(Rm[WS(rs, 1)]), T1I, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1J = VMUL(LDK(KP500000000), VFMAI(T1H, T1E));
|
||||
ST(&(Rp[WS(rs, 2)]), T1J, ms, &(Rp[0]));
|
||||
T1M = VMUL(LDK(KP500000000), VFNMSI(T1L, T1K));
|
||||
ST(&(Rp[WS(rs, 6)]), T1M, ms, &(Rp[0]));
|
||||
{
|
||||
V T1m, T1q, T1g, T1p, T1j, T1k, T1l, T1i, T1f, T1h, T1s, T1n, T1o, T1r, T2e;
|
||||
V T2A, T2o, T2u, T2l, T2B, T2p, T2x, T2d, T2t, T2a, T2s, T28, T2k, T2w, T2h;
|
||||
V T2v, T2f, T2m, T2C, T2D, T2n, T2q, T2y, T2z, T2r;
|
||||
T1k = VADD(Tx, TM);
|
||||
T1l = VADD(T12, T1d);
|
||||
T1m = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1l, T1k));
|
||||
T1q = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1k, T1l));
|
||||
T1i = VSUB(TN, T1e);
|
||||
T1f = VADD(TN, T1e);
|
||||
T1h = VFNMS(LDK(KP250000000), T1f, Ti);
|
||||
T1g = VMUL(LDK(KP500000000), VADD(Ti, T1f));
|
||||
T1p = VFNMS(LDK(KP559016994), T1i, T1h);
|
||||
T1j = VFMA(LDK(KP559016994), T1i, T1h);
|
||||
ST(&(Rp[0]), T1g, ms, &(Rp[0]));
|
||||
T1s = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1q, T1p)));
|
||||
ST(&(Rm[WS(rs, 7)]), T1s, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1n = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1m, T1j)));
|
||||
ST(&(Rm[WS(rs, 3)]), T1n, -ms, &(Rm[WS(rs, 1)]));
|
||||
T1o = VMUL(LDK(KP500000000), VFMAI(T1m, T1j));
|
||||
ST(&(Rp[WS(rs, 4)]), T1o, ms, &(Rp[0]));
|
||||
T1r = VMUL(LDK(KP500000000), VFNMSI(T1q, T1p));
|
||||
ST(&(Rp[WS(rs, 8)]), T1r, ms, &(Rp[0]));
|
||||
T2d = VFMA(LDK(KP618033988), T2c, T2b);
|
||||
T2t = VFNMS(LDK(KP618033988), T2b, T2c);
|
||||
T28 = VFNMS(LDK(KP250000000), T1V, T1O);
|
||||
T2a = VFMA(LDK(KP559016994), T29, T28);
|
||||
T2s = VFNMS(LDK(KP559016994), T29, T28);
|
||||
T2e = VFNMS(LDK(KP951056516), T2d, T2a);
|
||||
T2A = VFMA(LDK(KP951056516), T2t, T2s);
|
||||
T2o = VFMA(LDK(KP951056516), T2d, T2a);
|
||||
T2u = VFNMS(LDK(KP951056516), T2t, T2s);
|
||||
T2k = VFMA(LDK(KP618033988), T2j, T2i);
|
||||
T2w = VFNMS(LDK(KP618033988), T2i, T2j);
|
||||
T2f = VFNMS(LDK(KP250000000), T24, T1X);
|
||||
T2h = VFNMS(LDK(KP559016994), T2g, T2f);
|
||||
T2v = VFMA(LDK(KP559016994), T2g, T2f);
|
||||
T2l = VFNMS(LDK(KP951056516), T2k, T2h);
|
||||
T2B = VFMA(LDK(KP951056516), T2w, T2v);
|
||||
T2p = VFMA(LDK(KP951056516), T2k, T2h);
|
||||
T2x = VFNMS(LDK(KP951056516), T2w, T2v);
|
||||
T2m = VMUL(LDK(KP500000000), VFNMSI(T2l, T2e));
|
||||
ST(&(Rp[WS(rs, 9)]), T2m, ms, &(Rp[WS(rs, 1)]));
|
||||
T2C = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2B, T2A)));
|
||||
ST(&(Rm[WS(rs, 6)]), T2C, -ms, &(Rm[0]));
|
||||
T2D = VMUL(LDK(KP500000000), VFMAI(T2B, T2A));
|
||||
ST(&(Rp[WS(rs, 7)]), T2D, ms, &(Rp[WS(rs, 1)]));
|
||||
T2n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2l, T2e)));
|
||||
ST(&(Rm[WS(rs, 8)]), T2n, -ms, &(Rm[0]));
|
||||
T2q = VMUL(LDK(KP500000000), VFNMSI(T2p, T2o));
|
||||
ST(&(Rp[WS(rs, 1)]), T2q, ms, &(Rp[WS(rs, 1)]));
|
||||
T2y = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2x, T2u)));
|
||||
ST(&(Rm[WS(rs, 2)]), T2y, -ms, &(Rm[0]));
|
||||
T2z = VMUL(LDK(KP500000000), VFMAI(T2x, T2u));
|
||||
ST(&(Rp[WS(rs, 3)]), T2z, ms, &(Rp[WS(rs, 1)]));
|
||||
T2r = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2p, T2o)));
|
||||
ST(&(Rm[0]), T2r, -ms, &(Rm[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
VTW(1, 16),
|
||||
VTW(1, 17),
|
||||
VTW(1, 18),
|
||||
VTW(1, 19),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, { 77, 62, 66, 0 } };
|
||||
|
||||
/*
 * Registration entry point: hands hc2cfdftv_20 and its descriptor to the
 * planner p through khc2c_register, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_20)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 143 FP additions, 77 FP multiplications,
|
||||
* (or, 131 additions, 65 multiplications, 12 fused multiply/add),
|
||||
* 141 stack variables, 9 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_20 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (generator run without -fma).
 *
 * Every pass of the loop loads one vector from each of Rp[WS(rs, 0..9)] and
 * Rm[WS(rs, 0..9)], conjugates the Rm values, multiplies sums and differences
 * of each Rp/Rm pair by twiddles read from W, and stores ten results back to
 * Rp and ten conjugated results back to Rm (the same slots it read).
 *
 * Rp, Rm : arrays read and overwritten; Rp steps forward and Rm backward by
 *          VL * ms per pass.
 * Ip, Im : stepped in the loop header but never dereferenced in this body.
 * W      : twiddle array; offset once from mb, then advanced TWVL * 38 per pass.
 * rs     : stride passed to WS() when indexing Rp/Rm.
 * mb, me : loop bounds for m (m runs from mb while m < me, in steps of VL).
 * ms     : stride passed to LD/ST and used to step the four pointers.
 */
static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants: sin(pi/5)/2, sin(2*pi/5)/2, sqrt(5)/4, 1/4, 1/8, sqrt(5)/8, sin(pi/5), sin(2*pi/5), 1/2. */
DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
|
||||
DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
|
||||
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
|
||||
DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
|
||||
DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* The initialiser offsets W by (mb - 1) * ((TWVL / VL) * 38); the increment steps all pointers. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
|
||||
V TW, T1x, T2i, T2A, T1r, T1s, T1a, T1y, T1l, Tn, TK, TL, T1p, T1o, T27;
|
||||
V T2t, T2a, T2u, T2e, T2C, T20, T2w, T23, T2x, T2d, T2B, T1W, T1X, T1U, T1V;
|
||||
V T2z, T2K, T2G, T2N, T2J, T2v, T2y, T2F, T2D, T2E, T2M, T2H, T2I, T2L;
|
||||
{
|
||||
V T1u, T5, Tg, T1c, TV, T13, Ta, T1w, TQ, T11, TI, T1j, Tx, T18, Tl;
|
||||
V T1e, TD, T1h, Ts, T16, T2g, T2h, T14, T19, T1f, T1k, Tb, Tm, Ty, TJ;
|
||||
V T25, T26, T28, T29, T1Y, T1Z, T21, T22;
|
||||
{
|
||||
V T4, T3, T2, T1, Tf, Te, Td, Tc, T1b, TU, TT, TS, TR, T12, T9;
|
||||
V T8, T7, T6, T1v, TP, TO, TN, TM, T10, TH, TG, TF, TE, T1i, Tw;
|
||||
V Tv, Tu, Tt, T17, Tk, Tj, Ti, Th, T1d, TC, TB, TA, Tz, T1g, Tr;
|
||||
V Tq, Tp, To, T15;
|
||||
/* Load all ten Rp/Rm slot pairs; differences get VZMULIJ twiddles, sums get VZMULJ twiddles (the slot-0 sum T1u is untwiddled). */
T4 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VCONJ(T2);
|
||||
T1u = VADD(T4, T3);
|
||||
T1 = LDW(&(W[0]));
|
||||
T5 = VZMULIJ(T1, VSUB(T3, T4));
|
||||
Tf = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
Td = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
Te = VCONJ(Td);
|
||||
Tc = LDW(&(W[TWVL * 16]));
|
||||
Tg = VZMULIJ(Tc, VSUB(Te, Tf));
|
||||
T1b = LDW(&(W[TWVL * 14]));
|
||||
T1c = VZMULJ(T1b, VADD(Te, Tf));
|
||||
TU = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TS = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TT = VCONJ(TS);
|
||||
TR = LDW(&(W[TWVL * 28]));
|
||||
TV = VZMULIJ(TR, VSUB(TT, TU));
|
||||
T12 = LDW(&(W[TWVL * 26]));
|
||||
T13 = VZMULJ(T12, VADD(TT, TU));
|
||||
T9 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T8 = VCONJ(T7);
|
||||
T6 = LDW(&(W[TWVL * 20]));
|
||||
Ta = VZMULIJ(T6, VSUB(T8, T9));
|
||||
T1v = LDW(&(W[TWVL * 18]));
|
||||
T1w = VZMULJ(T1v, VADD(T9, T8));
|
||||
TP = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
TN = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
TO = VCONJ(TN);
|
||||
TM = LDW(&(W[TWVL * 8]));
|
||||
TQ = VZMULIJ(TM, VSUB(TO, TP));
|
||||
T10 = LDW(&(W[TWVL * 6]));
|
||||
T11 = VZMULJ(T10, VADD(TO, TP));
|
||||
TH = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TF = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TG = VCONJ(TF);
|
||||
TE = LDW(&(W[TWVL * 4]));
|
||||
TI = VZMULIJ(TE, VSUB(TG, TH));
|
||||
T1i = LDW(&(W[TWVL * 2]));
|
||||
T1j = VZMULJ(T1i, VADD(TG, TH));
|
||||
Tw = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tu = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tv = VCONJ(Tu);
|
||||
Tt = LDW(&(W[TWVL * 12]));
|
||||
Tx = VZMULIJ(Tt, VSUB(Tv, Tw));
|
||||
T17 = LDW(&(W[TWVL * 10]));
|
||||
T18 = VZMULJ(T17, VADD(Tw, Tv));
|
||||
Tk = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Ti = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tj = VCONJ(Ti);
|
||||
Th = LDW(&(W[TWVL * 36]));
|
||||
Tl = VZMULIJ(Th, VSUB(Tj, Tk));
|
||||
T1d = LDW(&(W[TWVL * 34]));
|
||||
T1e = VZMULJ(T1d, VADD(Tj, Tk));
|
||||
TC = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
TB = VCONJ(TA);
|
||||
Tz = LDW(&(W[TWVL * 24]));
|
||||
TD = VZMULIJ(Tz, VSUB(TB, TC));
|
||||
T1g = LDW(&(W[TWVL * 22]));
|
||||
T1h = VZMULJ(T1g, VADD(TB, TC));
|
||||
Tr = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
|
||||
Tp = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
|
||||
Tq = VCONJ(Tp);
|
||||
To = LDW(&(W[TWVL * 32]));
|
||||
Ts = VZMULIJ(To, VSUB(Tq, Tr));
|
||||
T15 = LDW(&(W[TWVL * 30]));
|
||||
T16 = VZMULJ(T15, VADD(Tr, Tq));
|
||||
}
|
||||
/* Sums and differences of the twiddled terms, consumed by the output stages below. */
TW = VSUB(TQ, TV);
|
||||
T1x = VSUB(T1u, T1w);
|
||||
T2g = VADD(T1u, T1w);
|
||||
T2h = VADD(TQ, TV);
|
||||
T2i = VADD(T2g, T2h);
|
||||
T2A = VSUB(T2g, T2h);
|
||||
T14 = VSUB(T11, T13);
|
||||
T19 = VSUB(T16, T18);
|
||||
T1r = VADD(T14, T19);
|
||||
T1f = VSUB(T1c, T1e);
|
||||
T1k = VSUB(T1h, T1j);
|
||||
T1s = VADD(T1f, T1k);
|
||||
T1a = VSUB(T14, T19);
|
||||
T1y = VADD(T1r, T1s);
|
||||
T1l = VSUB(T1f, T1k);
|
||||
Tb = VSUB(T5, Ta);
|
||||
Tm = VSUB(Tg, Tl);
|
||||
Tn = VADD(Tb, Tm);
|
||||
Ty = VSUB(Ts, Tx);
|
||||
TJ = VSUB(TD, TI);
|
||||
TK = VADD(Ty, TJ);
|
||||
TL = VADD(Tn, TK);
|
||||
T1p = VSUB(Ty, TJ);
|
||||
T1o = VSUB(Tb, Tm);
|
||||
T25 = VADD(T1c, T1e);
|
||||
T26 = VADD(TD, TI);
|
||||
T27 = VADD(T25, T26);
|
||||
T2t = VSUB(T25, T26);
|
||||
T28 = VADD(Ts, Tx);
|
||||
T29 = VADD(T1h, T1j);
|
||||
T2a = VADD(T28, T29);
|
||||
T2u = VSUB(T29, T28);
|
||||
T2e = VADD(T27, T2a);
|
||||
T2C = VADD(T2t, T2u);
|
||||
T1Y = VADD(T11, T13);
|
||||
T1Z = VADD(Tg, Tl);
|
||||
T20 = VADD(T1Y, T1Z);
|
||||
T2w = VSUB(T1Y, T1Z);
|
||||
T21 = VADD(T5, Ta);
|
||||
T22 = VADD(T16, T18);
|
||||
T23 = VADD(T21, T22);
|
||||
T2x = VSUB(T22, T21);
|
||||
T2d = VADD(T20, T23);
|
||||
T2B = VADD(T2w, T2x);
|
||||
}
|
||||
/* Output stage: every Rp and Rm slot 0..9 is written once; all Rm values pass through VCONJ first. */
T1U = VADD(T1x, T1y);
|
||||
T1V = VBYI(VADD(TW, TL));
|
||||
T1W = VMUL(LDK(KP500000000), VSUB(T1U, T1V));
|
||||
T1X = VCONJ(VMUL(LDK(KP500000000), VADD(T1V, T1U)));
|
||||
ST(&(Rp[WS(rs, 5)]), T1W, ms, &(Rp[WS(rs, 1)]));
|
||||
ST(&(Rm[WS(rs, 4)]), T1X, -ms, &(Rm[0]));
|
||||
T2v = VSUB(T2t, T2u);
|
||||
T2y = VSUB(T2w, T2x);
|
||||
T2z = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T2y, VMUL(LDK(KP951056516), T2v))));
|
||||
T2K = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T2y, VMUL(LDK(KP587785252), T2v))));
|
||||
T2F = VMUL(LDK(KP279508497), VSUB(T2B, T2C));
|
||||
T2D = VADD(T2B, T2C);
|
||||
T2E = VFNMS(LDK(KP125000000), T2D, VMUL(LDK(KP500000000), T2A));
|
||||
T2G = VSUB(T2E, T2F);
|
||||
T2N = VCONJ(VMUL(LDK(KP500000000), VADD(T2A, T2D)));
|
||||
T2J = VADD(T2F, T2E);
|
||||
ST(&(Rm[WS(rs, 9)]), T2N, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2M = VCONJ(VADD(T2K, T2J));
|
||||
ST(&(Rm[WS(rs, 5)]), T2M, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2H = VADD(T2z, T2G);
|
||||
ST(&(Rp[WS(rs, 2)]), T2H, ms, &(Rp[0]));
|
||||
T2I = VCONJ(VSUB(T2G, T2z));
|
||||
ST(&(Rm[WS(rs, 1)]), T2I, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2L = VSUB(T2J, T2K);
|
||||
ST(&(Rp[WS(rs, 6)]), T2L, ms, &(Rp[0]));
|
||||
{
|
||||
V T2c, T2p, T2l, T2s, T2o, T24, T2b, T2f, T2j, T2k, T2r, T2m, T2n, T2q, T1n;
|
||||
V T1Q, T1E, T1K, T1B, T1R, T1F, T1N, T1m, T1J, TZ, T1I, TX, TY, T1q, T1M;
|
||||
V T1A, T1L, T1t, T1z, T1C, T1S, T1T, T1D, T1G, T1O, T1P, T1H;
|
||||
T24 = VSUB(T20, T23);
|
||||
T2b = VSUB(T27, T2a);
|
||||
T2c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T24, VMUL(LDK(KP587785252), T2b))));
|
||||
T2p = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T24, VMUL(LDK(KP951056516), T2b))));
|
||||
T2f = VMUL(LDK(KP279508497), VSUB(T2d, T2e));
|
||||
T2j = VADD(T2d, T2e);
|
||||
T2k = VFNMS(LDK(KP125000000), T2j, VMUL(LDK(KP500000000), T2i));
|
||||
T2l = VADD(T2f, T2k);
|
||||
T2s = VMUL(LDK(KP500000000), VADD(T2i, T2j));
|
||||
T2o = VSUB(T2k, T2f);
|
||||
ST(&(Rp[0]), T2s, ms, &(Rp[0]));
|
||||
T2r = VCONJ(VADD(T2p, T2o));
|
||||
ST(&(Rm[WS(rs, 7)]), T2r, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2m = VADD(T2c, T2l);
|
||||
ST(&(Rp[WS(rs, 4)]), T2m, ms, &(Rp[0]));
|
||||
T2n = VCONJ(VSUB(T2l, T2c));
|
||||
ST(&(Rm[WS(rs, 3)]), T2n, -ms, &(Rm[WS(rs, 1)]));
|
||||
T2q = VSUB(T2o, T2p);
|
||||
ST(&(Rp[WS(rs, 8)]), T2q, ms, &(Rp[0]));
|
||||
T1m = VFMA(LDK(KP951056516), T1a, VMUL(LDK(KP587785252), T1l));
|
||||
T1J = VFNMS(LDK(KP587785252), T1a, VMUL(LDK(KP951056516), T1l));
|
||||
TX = VFMS(LDK(KP250000000), TL, TW);
|
||||
TY = VMUL(LDK(KP559016994), VSUB(TK, Tn));
|
||||
TZ = VADD(TX, TY);
|
||||
T1I = VSUB(TY, TX);
|
||||
T1n = VMUL(LDK(KP500000000), VBYI(VSUB(TZ, T1m)));
|
||||
T1Q = VMUL(LDK(KP500000000), VBYI(VADD(T1I, T1J)));
|
||||
T1E = VMUL(LDK(KP500000000), VBYI(VADD(TZ, T1m)));
|
||||
T1K = VMUL(LDK(KP500000000), VBYI(VSUB(T1I, T1J)));
|
||||
T1q = VFMA(LDK(KP475528258), T1o, VMUL(LDK(KP293892626), T1p));
|
||||
T1M = VFNMS(LDK(KP293892626), T1o, VMUL(LDK(KP475528258), T1p));
|
||||
T1t = VMUL(LDK(KP279508497), VSUB(T1r, T1s));
|
||||
T1z = VFNMS(LDK(KP125000000), T1y, VMUL(LDK(KP500000000), T1x));
|
||||
T1A = VADD(T1t, T1z);
|
||||
T1L = VSUB(T1z, T1t);
|
||||
T1B = VADD(T1q, T1A);
|
||||
T1R = VADD(T1M, T1L);
|
||||
T1F = VSUB(T1A, T1q);
|
||||
T1N = VSUB(T1L, T1M);
|
||||
T1C = VADD(T1n, T1B);
|
||||
ST(&(Rp[WS(rs, 1)]), T1C, ms, &(Rp[WS(rs, 1)]));
|
||||
T1S = VADD(T1Q, T1R);
|
||||
ST(&(Rp[WS(rs, 7)]), T1S, ms, &(Rp[WS(rs, 1)]));
|
||||
T1T = VCONJ(VSUB(T1R, T1Q));
|
||||
ST(&(Rm[WS(rs, 6)]), T1T, -ms, &(Rm[0]));
|
||||
T1D = VCONJ(VSUB(T1B, T1n));
|
||||
ST(&(Rm[0]), T1D, -ms, &(Rm[0]));
|
||||
T1G = VADD(T1E, T1F);
|
||||
ST(&(Rp[WS(rs, 9)]), T1G, ms, &(Rp[WS(rs, 1)]));
|
||||
T1O = VADD(T1K, T1N);
|
||||
ST(&(Rp[WS(rs, 3)]), T1O, ms, &(Rp[WS(rs, 1)]));
|
||||
T1P = VCONJ(VSUB(T1N, T1K));
|
||||
ST(&(Rm[WS(rs, 2)]), T1P, -ms, &(Rm[0]));
|
||||
T1H = VCONJ(VSUB(T1F, T1E));
|
||||
ST(&(Rm[WS(rs, 8)]), T1H, -ms, &(Rm[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by the hc2cfdftv_20 descriptor below:
 * one VTW(1, k) entry for each k = 1 .. 19, closed by a { TW_NEXT, VL, 0 }
 * entry.
 * NOTE(review): the exact meaning of VTW's arguments is defined by the SIMD
 * headers, not here -- confirm there before changing any entry.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
VTW(1, 16),
|
||||
VTW(1, 17),
|
||||
VTW(1, 18),
|
||||
VTW(1, 19),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(khc2c_register) for hc2cfdftv_20: the value 20, the
 * codelet name string, the twinstr table, a pointer to GENUS, and a
 * four-number initializer { 131, 65, 12, 0 }.
 * NOTE(review): the four numbers are presumably the operation counts
 * (additions, multiplications, fused multiply/adds, other) -- confirm against
 * this codelet's generated header comment.
 */
static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, { 131, 65, 12, 0 } };
|
||||
|
||||
/*
 * Registration entry point for the size-20 codelet: passes the planner p, the
 * hc2cfdftv_20 function, its descriptor desc and the HC2C_VIA_DFT kind to
 * X(khc2c_register).
 */
void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
878
fftw-3.3.10/rdft/simd/common/hc2cfdftv_32.c
Normal file
878
fftw-3.3.10/rdft/simd/common/hc2cfdftv_32.c
Normal file
@@ -0,0 +1,878 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 249 FP additions, 224 FP multiplications,
|
||||
* (or, 119 additions, 94 multiplications, 130 fused multiply/add),
|
||||
* 154 stack variables, 8 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_32, FMA variant: machine-generated SIMD codelet (generator flags
 * "-n 32 -dit", see the "Generated by" comment earlier in this file).
 *
 * Rp, Rm - arrays that are read (LD) and then overwritten in place (ST) at
 *          element offsets WS(rs, 0) .. WS(rs, 15); every load of an
 *          iteration is issued before its first store.
 * Ip, Im - only advanced by the loop header; never loaded or stored here.
 * W      - twiddle table, read with LDW at offsets TWVL * {0, 2, 4, ..., 60}.
 * rs     - stride argument forwarded to WS() for indexing within Rp / Rm.
 * mb, me - half-open range [mb, me) covered by the loop variable m.
 * ms     - per-step distance: each iteration moves Rp / Ip forward and
 *          Rm / Im backward by VL * ms.
 */
static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants, identified by numeric value: cos(3*pi/16), tan(3*pi/16), cos(pi/16), tan(pi/16), cos(pi/8), 1/2, 1/sqrt(2), tan(pi/8). */
DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
|
||||
DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
|
||||
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * ((TWVL / VL) * 62); then m advances by VL and W by TWVL * 62 per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
|
||||
V T47, T48, T4l, T3w, T3F, T3B, T41, Ts, T2y, T1Q, T2B, T27, T2J, T3a, T40;
|
||||
V T1X, T2C, T43, T44, T4a, T4b, T4m, T3p, T3E, T15, T2K, T1u, T2F, T3h, T3C;
|
||||
V T1n, T2E, T2a, T2z, T1a, T18, TU, T3m, T3f, T1r, T1p, T13, T3n, T3e, TB;
|
||||
V T3k, T1l, T3c, TK, T3j, T1g, T3b, T3l, T3o, TL, T14, T1s, T1t, T3d, T3g;
|
||||
V T1b, T1m, T28, T29, T3Q, T3W, T3T, T3X, T3O, T3P, T3R, T3S, T3U, T3Z, T3V;
|
||||
V T3Y;
|
||||
{
|
||||
V T1U, T1S, T3, T3u, T7, T1z, T1D, T3t, T24, T22, Tc, Tg, Th, T3q, T1J;
|
||||
V Tl, Tp, Tq, T3r, T1O, T3s, T3v, T3z, T3A, T8, Tr, T1E, T1P, T25, T26;
|
||||
V T38, T39, T1V, T1W;
|
||||
/*
 * Even-indexed inputs (offsets 0, 8, 12, 4, 2, 10, 14, 6): each Rp/Rm pair is
 * combined with VFMACONJ / VFNMSCONJ and multiplied by a twiddle from W via
 * VZMULJ / VZMULIJ. T3 (offset 0, VFMACONJ form) is the only one that takes
 * no twiddle multiply.
 */
{
|
||||
V T1, T2, T5, T6, T1T, T1R, T4, T1x, T1y, T1B, T1C, T1w, T1A, T23, T21;
|
||||
V T1I, T1G, Ta, Tb, T9, T1H, Te, Tf, Td, T1F, T1N, T1L, Tj, Tk, Ti;
|
||||
V T1M, Tn, To, Tm, T1K;
|
||||
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T1T = LDW(&(W[0]));
|
||||
T1U = VZMULIJ(T1T, VFNMSCONJ(T2, T1));
|
||||
T5 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
|
||||
T1R = LDW(&(W[TWVL * 32]));
|
||||
T1S = VZMULIJ(T1R, VFNMSCONJ(T6, T5));
|
||||
T3 = VFMACONJ(T2, T1);
|
||||
T3u = VADD(T1U, T1S);
|
||||
T4 = LDW(&(W[TWVL * 30]));
|
||||
T7 = VZMULJ(T4, VFMACONJ(T6, T5));
|
||||
T1x = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
|
||||
T1y = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
|
||||
T1w = LDW(&(W[TWVL * 48]));
|
||||
T1z = VZMULIJ(T1w, VFNMSCONJ(T1y, T1x));
|
||||
T1B = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T1C = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
T1A = LDW(&(W[TWVL * 16]));
|
||||
T1D = VZMULIJ(T1A, VFNMSCONJ(T1C, T1B));
|
||||
T3t = VADD(T1D, T1z);
|
||||
T23 = LDW(&(W[TWVL * 46]));
|
||||
T24 = VZMULJ(T23, VFMACONJ(T1y, T1x));
|
||||
T21 = LDW(&(W[TWVL * 14]));
|
||||
T22 = VZMULJ(T21, VFMACONJ(T1C, T1B));
|
||||
Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T9 = LDW(&(W[TWVL * 6]));
|
||||
Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
|
||||
T1H = LDW(&(W[TWVL * 8]));
|
||||
T1I = VZMULIJ(T1H, VFNMSCONJ(Tb, Ta));
|
||||
Te = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
|
||||
Tf = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
|
||||
Td = LDW(&(W[TWVL * 38]));
|
||||
Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
|
||||
T1F = LDW(&(W[TWVL * 40]));
|
||||
T1G = VZMULIJ(T1F, VFNMSCONJ(Tf, Te));
|
||||
Th = VSUB(Tc, Tg);
|
||||
T3q = VADD(T1I, T1G);
|
||||
T1J = VSUB(T1G, T1I);
|
||||
Tj = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
|
||||
Tk = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
|
||||
Ti = LDW(&(W[TWVL * 54]));
|
||||
Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
|
||||
T1M = LDW(&(W[TWVL * 56]));
|
||||
T1N = VZMULIJ(T1M, VFNMSCONJ(Tk, Tj));
|
||||
Tn = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
Tm = LDW(&(W[TWVL * 22]));
|
||||
Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
|
||||
T1K = LDW(&(W[TWVL * 24]));
|
||||
T1L = VZMULIJ(T1K, VFNMSCONJ(To, Tn));
|
||||
Tq = VSUB(Tl, Tp);
|
||||
T3r = VADD(T1N, T1L);
|
||||
T1O = VSUB(T1L, T1N);
|
||||
}
|
||||
/* Combine the even-offset intermediates into the values kept in the loop-level variables. */
T47 = VADD(T3u, T3t);
|
||||
T48 = VADD(T3q, T3r);
|
||||
T4l = VSUB(T48, T47);
|
||||
T3s = VSUB(T3q, T3r);
|
||||
T3v = VSUB(T3t, T3u);
|
||||
T3w = VFNMS(LDK(KP414213562), T3v, T3s);
|
||||
T3F = VFMA(LDK(KP414213562), T3s, T3v);
|
||||
T3z = VADD(Tl, Tp);
|
||||
T3A = VADD(Tc, Tg);
|
||||
T3B = VSUB(T3z, T3A);
|
||||
T41 = VADD(T3A, T3z);
|
||||
T8 = VSUB(T3, T7);
|
||||
Tr = VADD(Th, Tq);
|
||||
Ts = VFNMS(LDK(KP707106781), Tr, T8);
|
||||
T2y = VFMA(LDK(KP707106781), Tr, T8);
|
||||
T1E = VSUB(T1z, T1D);
|
||||
T1P = VSUB(T1J, T1O);
|
||||
T1Q = VFNMS(LDK(KP707106781), T1P, T1E);
|
||||
T2B = VFMA(LDK(KP707106781), T1P, T1E);
|
||||
T25 = VSUB(T22, T24);
|
||||
T26 = VSUB(Tq, Th);
|
||||
T27 = VFMA(LDK(KP707106781), T26, T25);
|
||||
T2J = VFNMS(LDK(KP707106781), T26, T25);
|
||||
T38 = VADD(T3, T7);
|
||||
T39 = VADD(T22, T24);
|
||||
T3a = VSUB(T38, T39);
|
||||
T40 = VADD(T38, T39);
|
||||
T1V = VSUB(T1S, T1U);
|
||||
T1W = VADD(T1J, T1O);
|
||||
T1X = VFNMS(LDK(KP707106781), T1W, T1V);
|
||||
T2C = VFMA(LDK(KP707106781), T1W, T1V);
|
||||
}
|
||||
/* Odd-indexed inputs (offsets 3, 11, 15, 7, 5, 13, 1, 9): same load / twiddle-multiply pattern as above. */
{
|
||||
V TP, TT, TN, TO, TM, T19, TR, TS, TQ, T17, TY, T12, TW, TX, TV;
|
||||
V T1q, T10, T11, TZ, T1o, Tw, T1i, TA, T1k, Tu, Tv, Tt, T1h, Ty, Tz;
|
||||
V Tx, T1j, TF, T1f, TJ, T1d, TD, TE, TC, T1e, TH, TI, TG, T1c;
|
||||
TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TO = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TM = LDW(&(W[TWVL * 10]));
|
||||
TP = VZMULJ(TM, VFMACONJ(TO, TN));
|
||||
T19 = LDW(&(W[TWVL * 12]));
|
||||
T1a = VZMULIJ(T19, VFNMSCONJ(TO, TN));
|
||||
TR = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TS = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TQ = LDW(&(W[TWVL * 42]));
|
||||
TT = VZMULJ(TQ, VFMACONJ(TS, TR));
|
||||
T17 = LDW(&(W[TWVL * 44]));
|
||||
T18 = VZMULIJ(T17, VFNMSCONJ(TS, TR));
|
||||
TU = VSUB(TP, TT);
|
||||
T3m = VADD(T1a, T18);
|
||||
T3f = VADD(TP, TT);
|
||||
TW = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TX = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TV = LDW(&(W[TWVL * 58]));
|
||||
TY = VZMULJ(TV, VFMACONJ(TX, TW));
|
||||
T1q = LDW(&(W[TWVL * 60]));
|
||||
T1r = VZMULIJ(T1q, VFNMSCONJ(TX, TW));
|
||||
T10 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T11 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TZ = LDW(&(W[TWVL * 26]));
|
||||
T12 = VZMULJ(TZ, VFMACONJ(T11, T10));
|
||||
T1o = LDW(&(W[TWVL * 28]));
|
||||
T1p = VZMULIJ(T1o, VFNMSCONJ(T11, T10));
|
||||
T13 = VSUB(TY, T12);
|
||||
T3n = VADD(T1r, T1p);
|
||||
T3e = VADD(TY, T12);
|
||||
Tu = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tv = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tt = LDW(&(W[TWVL * 18]));
|
||||
Tw = VZMULJ(Tt, VFMACONJ(Tv, Tu));
|
||||
T1h = LDW(&(W[TWVL * 20]));
|
||||
T1i = VZMULIJ(T1h, VFNMSCONJ(Tv, Tu));
|
||||
Ty = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tz = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tx = LDW(&(W[TWVL * 50]));
|
||||
TA = VZMULJ(Tx, VFMACONJ(Tz, Ty));
|
||||
T1j = LDW(&(W[TWVL * 52]));
|
||||
T1k = VZMULIJ(T1j, VFNMSCONJ(Tz, Ty));
|
||||
TB = VSUB(Tw, TA);
|
||||
T3k = VADD(T1i, T1k);
|
||||
T1l = VSUB(T1i, T1k);
|
||||
T3c = VADD(Tw, TA);
|
||||
TD = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TE = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TC = LDW(&(W[TWVL * 2]));
|
||||
TF = VZMULJ(TC, VFMACONJ(TE, TD));
|
||||
T1e = LDW(&(W[TWVL * 4]));
|
||||
T1f = VZMULIJ(T1e, VFNMSCONJ(TE, TD));
|
||||
TH = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TI = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TG = LDW(&(W[TWVL * 34]));
|
||||
TJ = VZMULJ(TG, VFMACONJ(TI, TH));
|
||||
T1c = LDW(&(W[TWVL * 36]));
|
||||
T1d = VZMULIJ(T1c, VFNMSCONJ(TI, TH));
|
||||
TK = VSUB(TF, TJ);
|
||||
T3j = VADD(T1f, T1d);
|
||||
T1g = VSUB(T1d, T1f);
|
||||
T3b = VADD(TF, TJ);
|
||||
}
|
||||
/* Combine the odd-offset intermediates; all inputs have been loaded by this point. */
T43 = VADD(T3b, T3c);
|
||||
T44 = VADD(T3e, T3f);
|
||||
T4a = VADD(T3j, T3k);
|
||||
T4b = VADD(T3n, T3m);
|
||||
T4m = VSUB(T4a, T4b);
|
||||
T3l = VSUB(T3j, T3k);
|
||||
T3o = VSUB(T3m, T3n);
|
||||
T3p = VFMA(LDK(KP414213562), T3o, T3l);
|
||||
T3E = VFNMS(LDK(KP414213562), T3l, T3o);
|
||||
TL = VFMA(LDK(KP414213562), TK, TB);
|
||||
T14 = VFNMS(LDK(KP414213562), T13, TU);
|
||||
T15 = VSUB(TL, T14);
|
||||
T2K = VADD(TL, T14);
|
||||
T1s = VSUB(T1p, T1r);
|
||||
T1t = VADD(T1g, T1l);
|
||||
T1u = VFNMS(LDK(KP707106781), T1t, T1s);
|
||||
T2F = VFMA(LDK(KP707106781), T1t, T1s);
|
||||
T3d = VSUB(T3b, T3c);
|
||||
T3g = VSUB(T3e, T3f);
|
||||
T3h = VADD(T3d, T3g);
|
||||
T3C = VSUB(T3g, T3d);
|
||||
T1b = VSUB(T18, T1a);
|
||||
T1m = VSUB(T1g, T1l);
|
||||
T1n = VFNMS(LDK(KP707106781), T1m, T1b);
|
||||
T2E = VFMA(LDK(KP707106781), T1m, T1b);
|
||||
T28 = VFMA(LDK(KP414213562), TU, T13);
|
||||
T29 = VFNMS(LDK(KP414213562), TB, TK);
|
||||
T2a = VSUB(T28, T29);
|
||||
T2z = VADD(T29, T28);
|
||||
/*
 * First group of outputs: Rp offsets 0, 4, 6, 8, 10, 12 and Rm offsets
 * 3, 5, 7, 9, 11, 15. Every stored value is scaled by KP500000000, and every
 * value stored to Rm is additionally wrapped in VCONJ.
 */
{
|
||||
V T4o, T4u, T4r, T4v, T4k, T4n, T4p, T4q, T4s, T4x, T4t, T4w, T3y, T3K, T3H;
|
||||
V T3L, T3i, T3x, T3D, T3G, T3I, T3N, T3J, T3M, T46, T4g, T4d, T4h, T42, T45;
|
||||
V T49, T4c, T4e, T4j, T4f, T4i;
|
||||
T4k = VSUB(T40, T41);
|
||||
T4n = VADD(T4l, T4m);
|
||||
T4o = VFMA(LDK(KP707106781), T4n, T4k);
|
||||
T4u = VFNMS(LDK(KP707106781), T4n, T4k);
|
||||
T4p = VSUB(T44, T43);
|
||||
T4q = VSUB(T4m, T4l);
|
||||
T4r = VFMA(LDK(KP707106781), T4q, T4p);
|
||||
T4v = VFNMS(LDK(KP707106781), T4q, T4p);
|
||||
T4s = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4r, T4o)));
|
||||
ST(&(Rm[WS(rs, 3)]), T4s, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T4v, T4u)));
|
||||
ST(&(Rm[WS(rs, 11)]), T4x, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4t = VMUL(LDK(KP500000000), VFMAI(T4r, T4o));
|
||||
ST(&(Rp[WS(rs, 4)]), T4t, ms, &(Rp[0]));
|
||||
T4w = VMUL(LDK(KP500000000), VFNMSI(T4v, T4u));
|
||||
ST(&(Rp[WS(rs, 12)]), T4w, ms, &(Rp[0]));
|
||||
T3i = VFNMS(LDK(KP707106781), T3h, T3a);
|
||||
T3x = VSUB(T3p, T3w);
|
||||
T3y = VFMA(LDK(KP923879532), T3x, T3i);
|
||||
T3K = VFNMS(LDK(KP923879532), T3x, T3i);
|
||||
T3D = VFNMS(LDK(KP707106781), T3C, T3B);
|
||||
T3G = VSUB(T3E, T3F);
|
||||
T3H = VFNMS(LDK(KP923879532), T3G, T3D);
|
||||
T3L = VFMA(LDK(KP923879532), T3G, T3D);
|
||||
T3I = VMUL(LDK(KP500000000), VFNMSI(T3H, T3y));
|
||||
ST(&(Rp[WS(rs, 6)]), T3I, ms, &(Rp[0]));
|
||||
T3N = VMUL(LDK(KP500000000), VFMAI(T3L, T3K));
|
||||
ST(&(Rp[WS(rs, 10)]), T3N, ms, &(Rp[0]));
|
||||
T3J = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3H, T3y)));
|
||||
ST(&(Rm[WS(rs, 5)]), T3J, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3M = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3L, T3K)));
|
||||
ST(&(Rm[WS(rs, 9)]), T3M, -ms, &(Rm[WS(rs, 1)]));
|
||||
T42 = VADD(T40, T41);
|
||||
T45 = VADD(T43, T44);
|
||||
T46 = VSUB(T42, T45);
|
||||
T4g = VADD(T42, T45);
|
||||
T49 = VADD(T47, T48);
|
||||
T4c = VADD(T4a, T4b);
|
||||
T4d = VSUB(T49, T4c);
|
||||
T4h = VADD(T49, T4c);
|
||||
T4e = VMUL(LDK(KP500000000), VFMAI(T4d, T46));
|
||||
ST(&(Rp[WS(rs, 8)]), T4e, ms, &(Rp[0]));
|
||||
T4j = VCONJ(VMUL(LDK(KP500000000), VADD(T4h, T4g)));
|
||||
ST(&(Rm[WS(rs, 15)]), T4j, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4f = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4d, T46)));
|
||||
ST(&(Rm[WS(rs, 7)]), T4f, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4i = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
|
||||
ST(&(Rp[0]), T4i, ms, &(Rp[0]));
|
||||
}
|
||||
/* Outputs Rp offsets 2, 14 and Rm offsets 1, 13. */
T3O = VFMA(LDK(KP707106781), T3h, T3a);
|
||||
T3P = VADD(T3F, T3E);
|
||||
T3Q = VFMA(LDK(KP923879532), T3P, T3O);
|
||||
T3W = VFNMS(LDK(KP923879532), T3P, T3O);
|
||||
T3R = VFMA(LDK(KP707106781), T3C, T3B);
|
||||
T3S = VADD(T3w, T3p);
|
||||
T3T = VFMA(LDK(KP923879532), T3S, T3R);
|
||||
T3X = VFNMS(LDK(KP923879532), T3S, T3R);
|
||||
T3U = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3T, T3Q)));
|
||||
ST(&(Rm[WS(rs, 1)]), T3U, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3Z = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3X, T3W)));
|
||||
ST(&(Rm[WS(rs, 13)]), T3Z, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3V = VMUL(LDK(KP500000000), VFMAI(T3T, T3Q));
|
||||
ST(&(Rp[WS(rs, 2)]), T3V, ms, &(Rp[0]));
|
||||
T3Y = VMUL(LDK(KP500000000), VFNMSI(T3X, T3W));
|
||||
ST(&(Rp[WS(rs, 14)]), T3Y, ms, &(Rp[0]));
|
||||
/* Remaining outputs: the odd Rp offsets (1, 3, ..., 15) and the even Rm offsets (0, 2, ..., 14). */
{
|
||||
V T2I, T35, T2S, T31, T2P, T34, T2T, T2Y, T2A, T2Z, T2H, T30, T2D, T2G, T2L;
|
||||
V T2W, T2O, T2X, T2M, T2N, T2Q, T36, T37, T2R, T2U, T32, T33, T2V, T20, T2v;
|
||||
V T2i, T2r, T2f, T2u, T2j, T2o, T16, T2p, T1Z, T2q, T1v, T1Y, T2b, T2m, T2e;
|
||||
V T2n, T2c, T2d, T2g, T2w, T2x, T2h, T2k, T2s, T2t, T2l;
|
||||
T2A = VFNMS(LDK(KP923879532), T2z, T2y);
|
||||
T2Z = VFMA(LDK(KP923879532), T2K, T2J);
|
||||
T2D = VFMA(LDK(KP198912367), T2C, T2B);
|
||||
T2G = VFNMS(LDK(KP198912367), T2F, T2E);
|
||||
T2H = VSUB(T2D, T2G);
|
||||
T30 = VADD(T2D, T2G);
|
||||
T2I = VFMA(LDK(KP980785280), T2H, T2A);
|
||||
T35 = VFNMS(LDK(KP980785280), T30, T2Z);
|
||||
T2S = VFNMS(LDK(KP980785280), T2H, T2A);
|
||||
T31 = VFMA(LDK(KP980785280), T30, T2Z);
|
||||
T2L = VFNMS(LDK(KP923879532), T2K, T2J);
|
||||
T2W = VFMA(LDK(KP923879532), T2z, T2y);
|
||||
T2M = VFMA(LDK(KP198912367), T2E, T2F);
|
||||
T2N = VFNMS(LDK(KP198912367), T2B, T2C);
|
||||
T2O = VSUB(T2M, T2N);
|
||||
T2X = VADD(T2N, T2M);
|
||||
T2P = VFMA(LDK(KP980785280), T2O, T2L);
|
||||
T34 = VFNMS(LDK(KP980785280), T2X, T2W);
|
||||
T2T = VFNMS(LDK(KP980785280), T2O, T2L);
|
||||
T2Y = VFMA(LDK(KP980785280), T2X, T2W);
|
||||
T2Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2P, T2I)));
|
||||
ST(&(Rm[WS(rs, 6)]), T2Q, -ms, &(Rm[0]));
|
||||
T36 = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T35, T34)));
|
||||
ST(&(Rm[WS(rs, 14)]), T36, -ms, &(Rm[0]));
|
||||
T37 = VMUL(LDK(KP500000000), VFMAI(T35, T34));
|
||||
ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
|
||||
T2R = VMUL(LDK(KP500000000), VFMAI(T2P, T2I));
|
||||
ST(&(Rp[WS(rs, 7)]), T2R, ms, &(Rp[WS(rs, 1)]));
|
||||
T2U = VMUL(LDK(KP500000000), VFNMSI(T2T, T2S));
|
||||
ST(&(Rp[WS(rs, 9)]), T2U, ms, &(Rp[WS(rs, 1)]));
|
||||
T32 = VMUL(LDK(KP500000000), VFNMSI(T31, T2Y));
|
||||
ST(&(Rp[WS(rs, 1)]), T32, ms, &(Rp[WS(rs, 1)]));
|
||||
T33 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T31, T2Y)));
|
||||
ST(&(Rm[0]), T33, -ms, &(Rm[0]));
|
||||
T2V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2T, T2S)));
|
||||
ST(&(Rm[WS(rs, 8)]), T2V, -ms, &(Rm[0]));
|
||||
T16 = VFNMS(LDK(KP923879532), T15, Ts);
|
||||
T2p = VFMA(LDK(KP923879532), T2a, T27);
|
||||
T1v = VFMA(LDK(KP668178637), T1u, T1n);
|
||||
T1Y = VFNMS(LDK(KP668178637), T1X, T1Q);
|
||||
T1Z = VSUB(T1v, T1Y);
|
||||
T2q = VADD(T1Y, T1v);
|
||||
T20 = VFMA(LDK(KP831469612), T1Z, T16);
|
||||
T2v = VFNMS(LDK(KP831469612), T2q, T2p);
|
||||
T2i = VFNMS(LDK(KP831469612), T1Z, T16);
|
||||
T2r = VFMA(LDK(KP831469612), T2q, T2p);
|
||||
T2b = VFNMS(LDK(KP923879532), T2a, T27);
|
||||
T2m = VFMA(LDK(KP923879532), T15, Ts);
|
||||
T2c = VFNMS(LDK(KP668178637), T1n, T1u);
|
||||
T2d = VFMA(LDK(KP668178637), T1Q, T1X);
|
||||
T2e = VSUB(T2c, T2d);
|
||||
T2n = VADD(T2d, T2c);
|
||||
T2f = VFNMS(LDK(KP831469612), T2e, T2b);
|
||||
T2u = VFNMS(LDK(KP831469612), T2n, T2m);
|
||||
T2j = VFMA(LDK(KP831469612), T2e, T2b);
|
||||
T2o = VFMA(LDK(KP831469612), T2n, T2m);
|
||||
T2g = VMUL(LDK(KP500000000), VFNMSI(T2f, T20));
|
||||
ST(&(Rp[WS(rs, 5)]), T2g, ms, &(Rp[WS(rs, 1)]));
|
||||
T2w = VMUL(LDK(KP500000000), VFNMSI(T2v, T2u));
|
||||
ST(&(Rp[WS(rs, 13)]), T2w, ms, &(Rp[WS(rs, 1)]));
|
||||
T2x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2v, T2u)));
|
||||
ST(&(Rm[WS(rs, 12)]), T2x, -ms, &(Rm[0]));
|
||||
T2h = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2f, T20)));
|
||||
ST(&(Rm[WS(rs, 4)]), T2h, -ms, &(Rm[0]));
|
||||
T2k = VMUL(LDK(KP500000000), VFMAI(T2j, T2i));
|
||||
ST(&(Rp[WS(rs, 11)]), T2k, ms, &(Rp[WS(rs, 1)]));
|
||||
T2s = VMUL(LDK(KP500000000), VFMAI(T2r, T2o));
|
||||
ST(&(Rp[WS(rs, 3)]), T2s, ms, &(Rp[WS(rs, 1)]));
|
||||
T2t = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2r, T2o)));
|
||||
ST(&(Rm[WS(rs, 2)]), T2t, -ms, &(Rm[0]));
|
||||
T2l = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2j, T2i)));
|
||||
ST(&(Rm[WS(rs, 10)]), T2l, -ms, &(Rm[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
/* NOTE(review): VLEAVE() is a macro from the SIMD headers, presumably end-of-kernel cleanup -- confirm there. */
VLEAVE();
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by the hc2cfdftv_32 descriptor below:
 * one VTW(1, k) entry for each k = 1 .. 31, closed by a { TW_NEXT, VL, 0 }
 * entry.
 * NOTE(review): the exact meaning of VTW's arguments is defined by the SIMD
 * headers, not here -- confirm there before changing any entry.
 */
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
VTW(1, 16),
|
||||
VTW(1, 17),
|
||||
VTW(1, 18),
|
||||
VTW(1, 19),
|
||||
VTW(1, 20),
|
||||
VTW(1, 21),
|
||||
VTW(1, 22),
|
||||
VTW(1, 23),
|
||||
VTW(1, 24),
|
||||
VTW(1, 25),
|
||||
VTW(1, 26),
|
||||
VTW(1, 27),
|
||||
VTW(1, 28),
|
||||
VTW(1, 29),
|
||||
VTW(1, 30),
|
||||
VTW(1, 31),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(khc2c_register) for the FMA variant of hc2cfdftv_32:
 * the value 32, the codelet name string, the twinstr table, a pointer to
 * GENUS, and { 119, 94, 130, 0 }. The first three numbers equal the
 * "119 additions, 94 multiplications, 130 fused multiply/add" counts quoted
 * in this variant's generated header comment.
 */
static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, { 119, 94, 130, 0 } };
|
||||
|
||||
/*
 * Registration entry point for the size-32 codelet (FMA build): passes the
 * planner p, the hc2cfdftv_32 function, its descriptor desc and the
 * HC2C_VIA_DFT kind to X(khc2c_register).
 */
void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 249 FP additions, 133 FP multiplications,
|
||||
* (or, 233 additions, 117 multiplications, 16 fused multiply/add),
|
||||
* 130 stack variables, 9 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
|
||||
DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
|
||||
DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
|
||||
V Ta, T2m, Tx, T2h, T3R, T4h, T3q, T4g, T3B, T4n, T3E, T4o, T1B, T2S, T1O;
|
||||
V T2R, TV, T2p, T1i, T2o, T3L, T4q, T3I, T4r, T3w, T4k, T3t, T4j, T26, T2V;
|
||||
V T2d, T2U;
|
||||
{
|
||||
V T4, T1m, T1H, T2j, T1M, T2l, T9, T1o, Tf, T1r, Tq, T1w, Tv, T1y, Tk;
|
||||
V T1t, Tl, Tw, T3P, T3Q, T3o, T3p, T3z, T3A, T3C, T3D, T1p, T1N, T1A, T1C;
|
||||
V T1u, T1z;
|
||||
{
|
||||
V T1, T3, T2, T1l, T1G, T1F, T1E, T1D, T2i, T1L, T1K, T1J, T1I, T2k, T6;
|
||||
V T8, T7, T5, T1n, Tc, Te, Td, Tb, T1q, Tn, Tp, To, Tm, T1v, Ts;
|
||||
V Tu, Tt, Tr, T1x, Th, Tj, Ti, Tg, T1s;
|
||||
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VCONJ(T2);
|
||||
T4 = VADD(T1, T3);
|
||||
T1l = LDW(&(W[0]));
|
||||
T1m = VZMULIJ(T1l, VSUB(T3, T1));
|
||||
T1G = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
|
||||
T1E = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
|
||||
T1F = VCONJ(T1E);
|
||||
T1D = LDW(&(W[TWVL * 16]));
|
||||
T1H = VZMULIJ(T1D, VSUB(T1F, T1G));
|
||||
T2i = LDW(&(W[TWVL * 14]));
|
||||
T2j = VZMULJ(T2i, VADD(T1G, T1F));
|
||||
T1L = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
|
||||
T1J = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
|
||||
T1K = VCONJ(T1J);
|
||||
T1I = LDW(&(W[TWVL * 48]));
|
||||
T1M = VZMULIJ(T1I, VSUB(T1K, T1L));
|
||||
T2k = LDW(&(W[TWVL * 46]));
|
||||
T2l = VZMULJ(T2k, VADD(T1L, T1K));
|
||||
T6 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
|
||||
T7 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
|
||||
T8 = VCONJ(T7);
|
||||
T5 = LDW(&(W[TWVL * 30]));
|
||||
T9 = VZMULJ(T5, VADD(T6, T8));
|
||||
T1n = LDW(&(W[TWVL * 32]));
|
||||
T1o = VZMULIJ(T1n, VSUB(T8, T6));
|
||||
Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
Te = VCONJ(Td);
|
||||
Tb = LDW(&(W[TWVL * 6]));
|
||||
Tf = VZMULJ(Tb, VADD(Tc, Te));
|
||||
T1q = LDW(&(W[TWVL * 8]));
|
||||
T1r = VZMULIJ(T1q, VSUB(Te, Tc));
|
||||
Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
|
||||
To = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
|
||||
Tp = VCONJ(To);
|
||||
Tm = LDW(&(W[TWVL * 54]));
|
||||
Tq = VZMULJ(Tm, VADD(Tn, Tp));
|
||||
T1v = LDW(&(W[TWVL * 56]));
|
||||
T1w = VZMULIJ(T1v, VSUB(Tp, Tn));
|
||||
Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
|
||||
Tt = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
|
||||
Tu = VCONJ(Tt);
|
||||
Tr = LDW(&(W[TWVL * 22]));
|
||||
Tv = VZMULJ(Tr, VADD(Ts, Tu));
|
||||
T1x = LDW(&(W[TWVL * 24]));
|
||||
T1y = VZMULIJ(T1x, VSUB(Tu, Ts));
|
||||
Th = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
|
||||
Ti = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
|
||||
Tj = VCONJ(Ti);
|
||||
Tg = LDW(&(W[TWVL * 38]));
|
||||
Tk = VZMULJ(Tg, VADD(Th, Tj));
|
||||
T1s = LDW(&(W[TWVL * 40]));
|
||||
T1t = VZMULIJ(T1s, VSUB(Tj, Th));
|
||||
}
|
||||
Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
|
||||
T2m = VSUB(T2j, T2l);
|
||||
Tl = VSUB(Tf, Tk);
|
||||
Tw = VSUB(Tq, Tv);
|
||||
Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
|
||||
T2h = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
|
||||
T3P = VADD(Tq, Tv);
|
||||
T3Q = VADD(Tf, Tk);
|
||||
T3R = VSUB(T3P, T3Q);
|
||||
T4h = VADD(T3Q, T3P);
|
||||
T3o = VADD(T4, T9);
|
||||
T3p = VADD(T2j, T2l);
|
||||
T3q = VMUL(LDK(KP500000000), VSUB(T3o, T3p));
|
||||
T4g = VADD(T3o, T3p);
|
||||
T3z = VADD(T1m, T1o);
|
||||
T3A = VADD(T1H, T1M);
|
||||
T3B = VSUB(T3z, T3A);
|
||||
T4n = VADD(T3z, T3A);
|
||||
T3C = VADD(T1w, T1y);
|
||||
T3D = VADD(T1r, T1t);
|
||||
T3E = VSUB(T3C, T3D);
|
||||
T4o = VADD(T3D, T3C);
|
||||
T1p = VSUB(T1m, T1o);
|
||||
T1N = VSUB(T1H, T1M);
|
||||
T1u = VSUB(T1r, T1t);
|
||||
T1z = VSUB(T1w, T1y);
|
||||
T1A = VMUL(LDK(KP707106781), VADD(T1u, T1z));
|
||||
T1C = VMUL(LDK(KP707106781), VSUB(T1z, T1u));
|
||||
T1B = VADD(T1p, T1A);
|
||||
T2S = VADD(T1N, T1C);
|
||||
T1O = VSUB(T1C, T1N);
|
||||
T2R = VSUB(T1p, T1A);
|
||||
}
|
||||
{
|
||||
V TD, T1R, T1b, T29, T1g, T2b, TI, T1T, TO, T1Y, T10, T22, T15, T24, TT;
|
||||
V T1W, TJ, TU, T16, T1h, T3J, T3K, T3G, T3H, T3u, T3v, T3r, T3s, T25, T2c;
|
||||
V T20, T27, T1U, T1Z;
|
||||
{
|
||||
V TA, TC, TB, Tz, T1Q, T18, T1a, T19, T17, T28, T1d, T1f, T1e, T1c, T2a;
|
||||
V TF, TH, TG, TE, T1S, TL, TN, TM, TK, T1X, TX, TZ, TY, TW, T21;
|
||||
V T12, T14, T13, T11, T23, TQ, TS, TR, TP, T1V;
|
||||
TA = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TB = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TC = VCONJ(TB);
|
||||
Tz = LDW(&(W[TWVL * 2]));
|
||||
TD = VZMULJ(Tz, VADD(TA, TC));
|
||||
T1Q = LDW(&(W[TWVL * 4]));
|
||||
T1R = VZMULIJ(T1Q, VSUB(TC, TA));
|
||||
T18 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T19 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T1a = VCONJ(T19);
|
||||
T17 = LDW(&(W[TWVL * 10]));
|
||||
T1b = VZMULJ(T17, VADD(T18, T1a));
|
||||
T28 = LDW(&(W[TWVL * 12]));
|
||||
T29 = VZMULIJ(T28, VSUB(T1a, T18));
|
||||
T1d = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T1e = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T1f = VCONJ(T1e);
|
||||
T1c = LDW(&(W[TWVL * 42]));
|
||||
T1g = VZMULJ(T1c, VADD(T1d, T1f));
|
||||
T2a = LDW(&(W[TWVL * 44]));
|
||||
T2b = VZMULIJ(T2a, VSUB(T1f, T1d));
|
||||
TF = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TG = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TH = VCONJ(TG);
|
||||
TE = LDW(&(W[TWVL * 34]));
|
||||
TI = VZMULJ(TE, VADD(TF, TH));
|
||||
T1S = LDW(&(W[TWVL * 36]));
|
||||
T1T = VZMULIJ(T1S, VSUB(TH, TF));
|
||||
TL = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TM = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TN = VCONJ(TM);
|
||||
TK = LDW(&(W[TWVL * 18]));
|
||||
TO = VZMULJ(TK, VADD(TL, TN));
|
||||
T1X = LDW(&(W[TWVL * 20]));
|
||||
T1Y = VZMULIJ(T1X, VSUB(TN, TL));
|
||||
TX = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TY = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TZ = VCONJ(TY);
|
||||
TW = LDW(&(W[TWVL * 58]));
|
||||
T10 = VZMULJ(TW, VADD(TX, TZ));
|
||||
T21 = LDW(&(W[TWVL * 60]));
|
||||
T22 = VZMULIJ(T21, VSUB(TZ, TX));
|
||||
T12 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T13 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T14 = VCONJ(T13);
|
||||
T11 = LDW(&(W[TWVL * 26]));
|
||||
T15 = VZMULJ(T11, VADD(T12, T14));
|
||||
T23 = LDW(&(W[TWVL * 28]));
|
||||
T24 = VZMULIJ(T23, VSUB(T14, T12));
|
||||
TQ = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
|
||||
TR = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
TS = VCONJ(TR);
|
||||
TP = LDW(&(W[TWVL * 50]));
|
||||
TT = VZMULJ(TP, VADD(TQ, TS));
|
||||
T1V = LDW(&(W[TWVL * 52]));
|
||||
T1W = VZMULIJ(T1V, VSUB(TS, TQ));
|
||||
}
|
||||
TJ = VSUB(TD, TI);
|
||||
TU = VSUB(TO, TT);
|
||||
TV = VFNMS(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TJ));
|
||||
T2p = VFMA(LDK(KP382683432), TJ, VMUL(LDK(KP923879532), TU));
|
||||
T16 = VSUB(T10, T15);
|
||||
T1h = VSUB(T1b, T1g);
|
||||
T1i = VFMA(LDK(KP923879532), T16, VMUL(LDK(KP382683432), T1h));
|
||||
T2o = VFNMS(LDK(KP923879532), T1h, VMUL(LDK(KP382683432), T16));
|
||||
T3J = VADD(T1Y, T1W);
|
||||
T3K = VADD(T1R, T1T);
|
||||
T3L = VSUB(T3J, T3K);
|
||||
T4q = VADD(T3K, T3J);
|
||||
T3G = VADD(T22, T24);
|
||||
T3H = VADD(T29, T2b);
|
||||
T3I = VSUB(T3G, T3H);
|
||||
T4r = VADD(T3G, T3H);
|
||||
T3u = VADD(T10, T15);
|
||||
T3v = VADD(T1b, T1g);
|
||||
T3w = VSUB(T3u, T3v);
|
||||
T4k = VADD(T3u, T3v);
|
||||
T3r = VADD(TD, TI);
|
||||
T3s = VADD(TO, TT);
|
||||
T3t = VSUB(T3r, T3s);
|
||||
T4j = VADD(T3r, T3s);
|
||||
T25 = VSUB(T22, T24);
|
||||
T2c = VSUB(T29, T2b);
|
||||
T1U = VSUB(T1R, T1T);
|
||||
T1Z = VSUB(T1W, T1Y);
|
||||
T20 = VMUL(LDK(KP707106781), VADD(T1U, T1Z));
|
||||
T27 = VMUL(LDK(KP707106781), VSUB(T1Z, T1U));
|
||||
T26 = VADD(T20, T25);
|
||||
T2V = VADD(T27, T2c);
|
||||
T2d = VSUB(T27, T2c);
|
||||
T2U = VSUB(T25, T20);
|
||||
}
|
||||
{
|
||||
V T4m, T4w, T4t, T4x, T4i, T4l, T4p, T4s, T4u, T4z, T4v, T4y, T4E, T4L, T4H;
|
||||
V T4K, T4A, T4F, T4D, T4G, T4B, T4C, T4I, T4N, T4J, T4M, T3O, T4c, T4d, T3X;
|
||||
V T40, T46, T49, T41, T3y, T47, T3T, T45, T3N, T44, T3W, T48, T3x, T3S, T3F;
|
||||
V T3M, T3U, T3V, T3Y, T4e, T4f, T3Z, T42, T4a, T4b, T43;
|
||||
T4i = VADD(T4g, T4h);
|
||||
T4l = VADD(T4j, T4k);
|
||||
T4m = VADD(T4i, T4l);
|
||||
T4w = VSUB(T4i, T4l);
|
||||
T4p = VADD(T4n, T4o);
|
||||
T4s = VADD(T4q, T4r);
|
||||
T4t = VADD(T4p, T4s);
|
||||
T4x = VBYI(VSUB(T4s, T4p));
|
||||
T4u = VCONJ(VMUL(LDK(KP500000000), VSUB(T4m, T4t)));
|
||||
ST(&(Rm[WS(rs, 15)]), T4u, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4z = VMUL(LDK(KP500000000), VADD(T4w, T4x));
|
||||
ST(&(Rp[WS(rs, 8)]), T4z, ms, &(Rp[0]));
|
||||
T4v = VMUL(LDK(KP500000000), VADD(T4m, T4t));
|
||||
ST(&(Rp[0]), T4v, ms, &(Rp[0]));
|
||||
T4y = VCONJ(VMUL(LDK(KP500000000), VSUB(T4w, T4x)));
|
||||
ST(&(Rm[WS(rs, 7)]), T4y, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4A = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
|
||||
T4F = VSUB(T4k, T4j);
|
||||
T4B = VSUB(T4n, T4o);
|
||||
T4C = VSUB(T4r, T4q);
|
||||
T4D = VMUL(LDK(KP353553390), VADD(T4B, T4C));
|
||||
T4G = VMUL(LDK(KP707106781), VSUB(T4C, T4B));
|
||||
T4E = VADD(T4A, T4D);
|
||||
T4L = VMUL(LDK(KP500000000), VBYI(VSUB(T4G, T4F)));
|
||||
T4H = VMUL(LDK(KP500000000), VBYI(VADD(T4F, T4G)));
|
||||
T4K = VSUB(T4A, T4D);
|
||||
T4I = VCONJ(VSUB(T4E, T4H));
|
||||
ST(&(Rm[WS(rs, 3)]), T4I, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4N = VADD(T4K, T4L);
|
||||
ST(&(Rp[WS(rs, 12)]), T4N, ms, &(Rp[0]));
|
||||
T4J = VADD(T4E, T4H);
|
||||
ST(&(Rp[WS(rs, 4)]), T4J, ms, &(Rp[0]));
|
||||
T4M = VCONJ(VSUB(T4K, T4L));
|
||||
ST(&(Rm[WS(rs, 11)]), T4M, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3x = VMUL(LDK(KP353553390), VADD(T3t, T3w));
|
||||
T3y = VADD(T3q, T3x);
|
||||
T47 = VSUB(T3q, T3x);
|
||||
T3S = VMUL(LDK(KP707106781), VSUB(T3w, T3t));
|
||||
T3T = VADD(T3R, T3S);
|
||||
T45 = VSUB(T3S, T3R);
|
||||
T3F = VFMA(LDK(KP923879532), T3B, VMUL(LDK(KP382683432), T3E));
|
||||
T3M = VFNMS(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3I));
|
||||
T3N = VMUL(LDK(KP500000000), VADD(T3F, T3M));
|
||||
T44 = VSUB(T3M, T3F);
|
||||
T3U = VFNMS(LDK(KP382683432), T3B, VMUL(LDK(KP923879532), T3E));
|
||||
T3V = VFMA(LDK(KP923879532), T3L, VMUL(LDK(KP382683432), T3I));
|
||||
T3W = VADD(T3U, T3V);
|
||||
T48 = VMUL(LDK(KP500000000), VSUB(T3V, T3U));
|
||||
T3O = VADD(T3y, T3N);
|
||||
T4c = VMUL(LDK(KP500000000), VBYI(VADD(T45, T44)));
|
||||
T4d = VADD(T47, T48);
|
||||
T3X = VMUL(LDK(KP500000000), VBYI(VADD(T3T, T3W)));
|
||||
T40 = VSUB(T3y, T3N);
|
||||
T46 = VMUL(LDK(KP500000000), VBYI(VSUB(T44, T45)));
|
||||
T49 = VSUB(T47, T48);
|
||||
T41 = VMUL(LDK(KP500000000), VBYI(VSUB(T3W, T3T)));
|
||||
T3Y = VCONJ(VSUB(T3O, T3X));
|
||||
ST(&(Rm[WS(rs, 1)]), T3Y, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4e = VADD(T4c, T4d);
|
||||
ST(&(Rp[WS(rs, 6)]), T4e, ms, &(Rp[0]));
|
||||
T4f = VCONJ(VSUB(T4d, T4c));
|
||||
ST(&(Rm[WS(rs, 5)]), T4f, -ms, &(Rm[WS(rs, 1)]));
|
||||
T3Z = VADD(T3O, T3X);
|
||||
ST(&(Rp[WS(rs, 2)]), T3Z, ms, &(Rp[0]));
|
||||
T42 = VCONJ(VSUB(T40, T41));
|
||||
ST(&(Rm[WS(rs, 13)]), T42, -ms, &(Rm[WS(rs, 1)]));
|
||||
T4a = VADD(T46, T49);
|
||||
ST(&(Rp[WS(rs, 10)]), T4a, ms, &(Rp[0]));
|
||||
T4b = VCONJ(VSUB(T49, T46));
|
||||
ST(&(Rm[WS(rs, 9)]), T4b, -ms, &(Rm[WS(rs, 1)]));
|
||||
T43 = VADD(T40, T41);
|
||||
ST(&(Rp[WS(rs, 14)]), T43, ms, &(Rp[0]));
|
||||
{
|
||||
V T2g, T2K, T2L, T2v, T2y, T2E, T2H, T2z, T1k, T2F, T2u, T2G, T2f, T2C, T2r;
|
||||
V T2D, Ty, T1j, T2s, T2t, T1P, T2e, T2n, T2q, T2w, T2M, T2N, T2x, T2A, T2I;
|
||||
V T2J, T2B;
|
||||
Ty = VADD(Ta, Tx);
|
||||
T1j = VMUL(LDK(KP500000000), VADD(TV, T1i));
|
||||
T1k = VADD(Ty, T1j);
|
||||
T2F = VSUB(Ty, T1j);
|
||||
T2s = VFNMS(LDK(KP195090322), T1B, VMUL(LDK(KP980785280), T1O));
|
||||
T2t = VFMA(LDK(KP195090322), T26, VMUL(LDK(KP980785280), T2d));
|
||||
T2u = VADD(T2s, T2t);
|
||||
T2G = VMUL(LDK(KP500000000), VSUB(T2t, T2s));
|
||||
T1P = VFMA(LDK(KP980785280), T1B, VMUL(LDK(KP195090322), T1O));
|
||||
T2e = VFNMS(LDK(KP195090322), T2d, VMUL(LDK(KP980785280), T26));
|
||||
T2f = VMUL(LDK(KP500000000), VADD(T1P, T2e));
|
||||
T2C = VSUB(T2e, T1P);
|
||||
T2n = VSUB(T2h, T2m);
|
||||
T2q = VSUB(T2o, T2p);
|
||||
T2r = VADD(T2n, T2q);
|
||||
T2D = VSUB(T2q, T2n);
|
||||
T2g = VADD(T1k, T2f);
|
||||
T2K = VMUL(LDK(KP500000000), VBYI(VADD(T2D, T2C)));
|
||||
T2L = VADD(T2F, T2G);
|
||||
T2v = VMUL(LDK(KP500000000), VBYI(VADD(T2r, T2u)));
|
||||
T2y = VSUB(T1k, T2f);
|
||||
T2E = VMUL(LDK(KP500000000), VBYI(VSUB(T2C, T2D)));
|
||||
T2H = VSUB(T2F, T2G);
|
||||
T2z = VMUL(LDK(KP500000000), VBYI(VSUB(T2u, T2r)));
|
||||
T2w = VCONJ(VSUB(T2g, T2v));
|
||||
ST(&(Rm[0]), T2w, -ms, &(Rm[0]));
|
||||
T2M = VADD(T2K, T2L);
|
||||
ST(&(Rp[WS(rs, 7)]), T2M, ms, &(Rp[WS(rs, 1)]));
|
||||
T2N = VCONJ(VSUB(T2L, T2K));
|
||||
ST(&(Rm[WS(rs, 6)]), T2N, -ms, &(Rm[0]));
|
||||
T2x = VADD(T2g, T2v);
|
||||
ST(&(Rp[WS(rs, 1)]), T2x, ms, &(Rp[WS(rs, 1)]));
|
||||
T2A = VCONJ(VSUB(T2y, T2z));
|
||||
ST(&(Rm[WS(rs, 14)]), T2A, -ms, &(Rm[0]));
|
||||
T2I = VADD(T2E, T2H);
|
||||
ST(&(Rp[WS(rs, 9)]), T2I, ms, &(Rp[WS(rs, 1)]));
|
||||
T2J = VCONJ(VSUB(T2H, T2E));
|
||||
ST(&(Rm[WS(rs, 8)]), T2J, -ms, &(Rm[0]));
|
||||
T2B = VADD(T2y, T2z);
|
||||
ST(&(Rp[WS(rs, 15)]), T2B, ms, &(Rp[WS(rs, 1)]));
|
||||
}
|
||||
{
|
||||
V T2Y, T3k, T3l, T35, T38, T3e, T3h, T39, T2Q, T3f, T34, T3g, T2X, T3c, T31;
|
||||
V T3d, T2O, T2P, T32, T33, T2T, T2W, T2Z, T30, T36, T3m, T3n, T37, T3a, T3i;
|
||||
V T3j, T3b;
|
||||
T2O = VSUB(Ta, Tx);
|
||||
T2P = VMUL(LDK(KP500000000), VADD(T2p, T2o));
|
||||
T2Q = VADD(T2O, T2P);
|
||||
T3f = VSUB(T2O, T2P);
|
||||
T32 = VFNMS(LDK(KP555570233), T2R, VMUL(LDK(KP831469612), T2S));
|
||||
T33 = VFMA(LDK(KP555570233), T2U, VMUL(LDK(KP831469612), T2V));
|
||||
T34 = VADD(T32, T33);
|
||||
T3g = VMUL(LDK(KP500000000), VSUB(T33, T32));
|
||||
T2T = VFMA(LDK(KP831469612), T2R, VMUL(LDK(KP555570233), T2S));
|
||||
T2W = VFNMS(LDK(KP555570233), T2V, VMUL(LDK(KP831469612), T2U));
|
||||
T2X = VMUL(LDK(KP500000000), VADD(T2T, T2W));
|
||||
T3c = VSUB(T2W, T2T);
|
||||
T2Z = VADD(T2m, T2h);
|
||||
T30 = VSUB(T1i, TV);
|
||||
T31 = VADD(T2Z, T30);
|
||||
T3d = VSUB(T30, T2Z);
|
||||
T2Y = VADD(T2Q, T2X);
|
||||
T3k = VMUL(LDK(KP500000000), VBYI(VADD(T3d, T3c)));
|
||||
T3l = VADD(T3f, T3g);
|
||||
T35 = VMUL(LDK(KP500000000), VBYI(VADD(T31, T34)));
|
||||
T38 = VSUB(T2Q, T2X);
|
||||
T3e = VMUL(LDK(KP500000000), VBYI(VSUB(T3c, T3d)));
|
||||
T3h = VSUB(T3f, T3g);
|
||||
T39 = VMUL(LDK(KP500000000), VBYI(VSUB(T34, T31)));
|
||||
T36 = VCONJ(VSUB(T2Y, T35));
|
||||
ST(&(Rm[WS(rs, 2)]), T36, -ms, &(Rm[0]));
|
||||
T3m = VADD(T3k, T3l);
|
||||
ST(&(Rp[WS(rs, 5)]), T3m, ms, &(Rp[WS(rs, 1)]));
|
||||
T3n = VCONJ(VSUB(T3l, T3k));
|
||||
ST(&(Rm[WS(rs, 4)]), T3n, -ms, &(Rm[0]));
|
||||
T37 = VADD(T2Y, T35);
|
||||
ST(&(Rp[WS(rs, 3)]), T37, ms, &(Rp[WS(rs, 1)]));
|
||||
T3a = VCONJ(VSUB(T38, T39));
|
||||
ST(&(Rm[WS(rs, 12)]), T3a, -ms, &(Rm[0]));
|
||||
T3i = VADD(T3e, T3h);
|
||||
ST(&(Rp[WS(rs, 11)]), T3i, ms, &(Rp[WS(rs, 1)]));
|
||||
T3j = VCONJ(VSUB(T3h, T3e));
|
||||
ST(&(Rm[WS(rs, 10)]), T3j, -ms, &(Rm[0]));
|
||||
T3b = VADD(T38, T39);
|
||||
ST(&(Rp[WS(rs, 13)]), T3b, ms, &(Rp[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
VTW(1, 8),
|
||||
VTW(1, 9),
|
||||
VTW(1, 10),
|
||||
VTW(1, 11),
|
||||
VTW(1, 12),
|
||||
VTW(1, 13),
|
||||
VTW(1, 14),
|
||||
VTW(1, 15),
|
||||
VTW(1, 16),
|
||||
VTW(1, 17),
|
||||
VTW(1, 18),
|
||||
VTW(1, 19),
|
||||
VTW(1, 20),
|
||||
VTW(1, 21),
|
||||
VTW(1, 22),
|
||||
VTW(1, 23),
|
||||
VTW(1, 24),
|
||||
VTW(1, 25),
|
||||
VTW(1, 26),
|
||||
VTW(1, 27),
|
||||
VTW(1, 28),
|
||||
VTW(1, 29),
|
||||
VTW(1, 30),
|
||||
VTW(1, 31),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor passed to X(khc2c_register) for hc2cfdftv_32.  The leading 32
 * matches the codelet name; the exact meaning of each field is fixed by the
 * hc2c_desc declaration, which lives outside this file.
 */
static const hc2c_desc desc = {
     32,
     XSIMD_STRING("hc2cfdftv_32"),
     twinstr,
     &GENUS,
     { 233, 117, 16, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_32 kernel and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_32)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
146
fftw-3.3.10/rdft/simd/common/hc2cfdftv_4.c
Normal file
146
fftw-3.3.10/rdft/simd/common/hc2cfdftv_4.c
Normal file
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 4 -dit -name hc2cfdftv_4 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 15 FP additions, 16 FP multiplications,
|
||||
* (or, 9 additions, 10 multiplications, 6 fused multiply/add),
|
||||
* 21 stack variables, 1 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_4, FMA build: SIMD kernel emitted by gen_hc2cdft_c for n = 4.
 *
 *   Rp, Rm  arrays that are read with LD and overwritten in place with ST
 *   Ip, Im  stepped in the loop header but never dereferenced in this body
 *   W       twiddle table, read through LDW
 *   rs      stride passed to WS() to address row 1
 *   mb, me  half-open range of the loop counter m (m advances by VL)
 *   ms      per-m stride: Rp/Ip move forward and Rm/Im move backward by VL * ms
 *
 * NOTE(review): the V* helpers used below (VFMACONJ, VFNMSCONJ, VZMULJ,
 * VZMULIJ, VFMAI, VFNMSI, VCONJ) are defined outside this file; their exact
 * arithmetic is not restated here.
 */
static void hc2cfdftv_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* KP500000000 = 0.5; every value stored below is multiplied by it. */
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * ((TWVL / VL) * 6), then advanced by TWVL * 6 per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 6)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
V T8, Th, Td, Tg, T3, Tc, T7, Ta, T1, T2, Tb, T5, T6, T4, T9;
|
||||
V Te, Tj, Tf, Ti;
|
||||
/* Row 0: load Rp[0] and Rm[0]; the second combination is multiplied by the twiddle at W[0]. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VFMACONJ(T2, T1);
|
||||
Tb = LDW(&(W[0]));
|
||||
Tc = VZMULIJ(Tb, VFNMSCONJ(T2, T1));
|
||||
/* Row 1: load Rp/Rm at WS(rs, 1); both combinations are twiddled (W[TWVL * 2] and W[TWVL * 4]). */
T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T6 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = LDW(&(W[TWVL * 2]));
|
||||
T7 = VZMULJ(T4, VFMACONJ(T6, T5));
|
||||
T9 = LDW(&(W[TWVL * 4]));
|
||||
Ta = VZMULIJ(T9, VFNMSCONJ(T6, T5));
|
||||
/* Sums and differences of the four intermediate values. */
T8 = VSUB(T3, T7);
|
||||
Th = VADD(Tc, Ta);
|
||||
Td = VSUB(Ta, Tc);
|
||||
Tg = VADD(T3, T7);
|
||||
/* Scale by 0.5 and store; every value written to Rm is wrapped in VCONJ, values written to Rp are not. */
Te = VMUL(LDK(KP500000000), VFNMSI(Td, T8));
|
||||
ST(&(Rp[WS(rs, 1)]), Te, ms, &(Rp[WS(rs, 1)]));
|
||||
Tj = VCONJ(VMUL(LDK(KP500000000), VADD(Th, Tg)));
|
||||
ST(&(Rm[WS(rs, 1)]), Tj, -ms, &(Rm[WS(rs, 1)]));
|
||||
Tf = VCONJ(VMUL(LDK(KP500000000), VFMAI(Td, T8)));
|
||||
ST(&(Rm[0]), Tf, -ms, &(Rm[0]));
|
||||
Ti = VMUL(LDK(KP500000000), VSUB(Tg, Th));
|
||||
ST(&(Rp[0]), Ti, ms, &(Rp[0]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the FMA build of hc2cfdftv_4.  The leading 4 matches the
 * generator's "-n 4"; the trailing counts match the 9 additions,
 * 10 multiplications and 6 fused multiply/adds quoted in the header comment.
 * Field meanings are fixed by the hc2c_desc declaration outside this file.
 */
static const hc2c_desc desc = {
     4,
     XSIMD_STRING("hc2cfdftv_4"),
     twinstr,
     &GENUS,
     { 9, 10, 6, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_4 kernel and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_4)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_4, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 4 -dit -name hc2cfdftv_4 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 15 FP additions, 10 FP multiplications,
|
||||
* (or, 15 additions, 10 multiplications, 0 fused multiply/add),
|
||||
* 23 stack variables, 1 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_4, non-FMA build: SIMD kernel emitted by gen_hc2cdft_c for n = 4
 * (same generator flags as the FMA build, minus -fma).
 *
 *   Rp, Rm  arrays that are read with LD and overwritten in place with ST
 *   Ip, Im  stepped in the loop header but never dereferenced in this body
 *   W       twiddle table, read through LDW
 *   rs      stride passed to WS() to address row 1
 *   mb, me  half-open range of the loop counter m (m advances by VL)
 *   ms      per-m stride: Rp/Ip move forward and Rm/Im move backward by VL * ms
 *
 * NOTE(review): VZMULJ, VZMULIJ, VBYI and VCONJ are defined outside this
 * file; their exact arithmetic is not restated here.
 */
static void hc2cfdftv_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* KP500000000 = 0.5; every value stored below is multiplied by it. */
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * ((TWVL / VL) * 6), then advanced by TWVL * 6 per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 6)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
V T4, Tc, T9, Te, T1, T3, T2, Tb, T6, T8, T7, T5, Td, Tg, Th;
|
||||
V Ta, Tf, Tk, Tl, Ti, Tj;
|
||||
/* Row 0: the Rm[0] load is passed through VCONJ before being added to / subtracted from Rp[0]. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VCONJ(T2);
|
||||
T4 = VADD(T1, T3);
|
||||
Tb = LDW(&(W[0]));
|
||||
Tc = VZMULIJ(Tb, VSUB(T3, T1));
|
||||
/* Row 1: same pattern at WS(rs, 1); the sum is twiddled by W[TWVL * 2], the difference by W[TWVL * 4]. */
T6 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T7 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T8 = VCONJ(T7);
|
||||
T5 = LDW(&(W[TWVL * 2]));
|
||||
T9 = VZMULJ(T5, VADD(T6, T8));
|
||||
Td = LDW(&(W[TWVL * 4]));
|
||||
Te = VZMULIJ(Td, VSUB(T8, T6));
|
||||
/* First output pair (Rp row 1, Rm row 0): Ta -/+ Tf, scaled by 0.5; the Rm value is conjugated. */
Ta = VSUB(T4, T9);
|
||||
Tf = VBYI(VSUB(Tc, Te));
|
||||
Tg = VMUL(LDK(KP500000000), VSUB(Ta, Tf));
|
||||
Th = VCONJ(VMUL(LDK(KP500000000), VADD(Ta, Tf)));
|
||||
ST(&(Rp[WS(rs, 1)]), Tg, ms, &(Rp[WS(rs, 1)]));
|
||||
ST(&(Rm[0]), Th, -ms, &(Rm[0]));
|
||||
/* Second output pair (Rm row 1, Rp row 0): Ti -/+ Tj, scaled by 0.5; the Rm value is conjugated. */
Ti = VADD(T4, T9);
|
||||
Tj = VADD(Tc, Te);
|
||||
Tk = VCONJ(VMUL(LDK(KP500000000), VSUB(Ti, Tj)));
|
||||
Tl = VMUL(LDK(KP500000000), VADD(Ti, Tj));
|
||||
ST(&(Rm[WS(rs, 1)]), Tk, -ms, &(Rm[WS(rs, 1)]));
|
||||
ST(&(Rp[0]), Tl, ms, &(Rp[0]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the non-FMA build of hc2cfdftv_4.  The leading 4 matches the
 * generator's "-n 4"; the trailing counts match the 15 additions,
 * 10 multiplications and 0 fused multiply/adds quoted in the header comment.
 * Field meanings are fixed by the hc2c_desc declaration outside this file.
 */
static const hc2c_desc desc = {
     4,
     XSIMD_STRING("hc2cfdftv_4"),
     twinstr,
     &GENUS,
     { 15, 10, 0, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_4 kernel and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_4)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_4, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
192
fftw-3.3.10/rdft/simd/common/hc2cfdftv_6.c
Normal file
192
fftw-3.3.10/rdft/simd/common/hc2cfdftv_6.c
Normal file
@@ -0,0 +1,192 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dit -name hc2cfdftv_6 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 29 FP additions, 30 FP multiplications,
|
||||
* (or, 17 additions, 18 multiplications, 12 fused multiply/add),
|
||||
* 38 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_6, FMA build: SIMD kernel emitted by gen_hc2cdft_c for n = 6.
 *
 *   Rp, Rm  arrays that are read with LD and overwritten in place with ST
 *   Ip, Im  stepped in the loop header but never dereferenced in this body
 *   W       twiddle table, read through LDW
 *   rs      stride passed to WS() to address rows 1 and 2
 *   mb, me  half-open range of the loop counter m (m advances by VL)
 *   ms      per-m stride: Rp/Ip move forward and Rm/Im move backward by VL * ms
 *
 * NOTE(review): the V* helpers used below (VFMACONJ, VFNMSCONJ, VZMULJ,
 * VZMULIJ, VFMAI, VFNMSI, VCONJ) are defined outside this file; their exact
 * arithmetic is not restated here.
 */
static void hc2cfdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * ((TWVL / VL) * 10), then advanced by TWVL * 10 per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
V T8, Tr, Tf, Tk, Tl, Ts, Tt, Tu, T3, Tj, Te, Th, T7, Ta, T1;
|
||||
V T2, Ti, Tc, Td, Tb, Tg, T5, T6, T4, T9, Tm, Tv, Tp, Tq, Tn;
|
||||
V To, Ty, Tz, Tw, Tx;
|
||||
/* Row 0: load Rp[0] and Rm[0]; the second combination is multiplied by the twiddle at W[0]. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VFMACONJ(T2, T1);
|
||||
Ti = LDW(&(W[0]));
|
||||
Tj = VZMULIJ(Ti, VFNMSCONJ(T2, T1));
|
||||
/* Row 2: twiddles at W[TWVL * 8] and W[TWVL * 6]. */
Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
Tb = LDW(&(W[TWVL * 8]));
|
||||
Te = VZMULIJ(Tb, VFNMSCONJ(Td, Tc));
|
||||
Tg = LDW(&(W[TWVL * 6]));
|
||||
Th = VZMULJ(Tg, VFMACONJ(Td, Tc));
|
||||
/* Row 1: twiddles at W[TWVL * 4] and W[TWVL * 2]. */
T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
T6 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T4 = LDW(&(W[TWVL * 4]));
|
||||
T7 = VZMULIJ(T4, VFNMSCONJ(T6, T5));
|
||||
T9 = LDW(&(W[TWVL * 2]));
|
||||
Ta = VZMULJ(T9, VFMACONJ(T6, T5));
|
||||
/* Sums and differences feeding the two three-term groups (T8, Tf, Tk) and (Tr, Ts, Tt). */
T8 = VSUB(T3, T7);
|
||||
Tr = VADD(T3, T7);
|
||||
Tf = VSUB(Ta, Te);
|
||||
Tk = VSUB(Th, Tj);
|
||||
Tl = VADD(Tf, Tk);
|
||||
Ts = VADD(Ta, Te);
|
||||
Tt = VADD(Tj, Th);
|
||||
Tu = VADD(Ts, Tt);
|
||||
/* Plain sums of each group, scaled by 0.5; the Rm value is conjugated. */
Tm = VMUL(LDK(KP500000000), VADD(T8, Tl));
|
||||
ST(&(Rp[0]), Tm, ms, &(Rp[0]));
|
||||
Tv = VCONJ(VMUL(LDK(KP500000000), VADD(Tr, Tu)));
|
||||
ST(&(Rm[WS(rs, 2)]), Tv, -ms, &(Rm[0]));
|
||||
/* Remaining outputs of the first group: Tn combined with To via VFNMSI / VFMAI, then scaled by 0.5. */
Tn = VFNMS(LDK(KP500000000), Tl, T8);
|
||||
To = VMUL(LDK(KP866025403), VSUB(Tk, Tf));
|
||||
Tp = VMUL(LDK(KP500000000), VFNMSI(To, Tn));
|
||||
Tq = VCONJ(VMUL(LDK(KP500000000), VFMAI(To, Tn)));
|
||||
ST(&(Rp[WS(rs, 2)]), Tp, ms, &(Rp[0]));
|
||||
ST(&(Rm[WS(rs, 1)]), Tq, -ms, &(Rm[WS(rs, 1)]));
|
||||
/* Remaining outputs of the second group, same pattern with Tw and Tx. */
Tw = VFNMS(LDK(KP500000000), Tu, Tr);
|
||||
Tx = VMUL(LDK(KP866025403), VSUB(Tt, Ts));
|
||||
Ty = VCONJ(VMUL(LDK(KP500000000), VFNMSI(Tx, Tw)));
|
||||
Tz = VMUL(LDK(KP500000000), VFMAI(Tx, Tw));
|
||||
ST(&(Rm[0]), Ty, -ms, &(Rm[0]));
|
||||
ST(&(Rp[WS(rs, 1)]), Tz, ms, &(Rp[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the FMA build of hc2cfdftv_6.  The leading 6 matches the
 * generator's "-n 6"; the trailing counts match the 17 additions,
 * 18 multiplications and 12 fused multiply/adds quoted in the header comment.
 * Field meanings are fixed by the hc2c_desc declaration outside this file.
 */
static const hc2c_desc desc = {
     6,
     XSIMD_STRING("hc2cfdftv_6"),
     twinstr,
     &GENUS,
     { 17, 18, 12, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_6 kernel and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_6)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_6, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dit -name hc2cfdftv_6 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 29 FP additions, 20 FP multiplications,
|
||||
* (or, 27 additions, 18 multiplications, 2 fused multiply/add),
|
||||
* 42 stack variables, 3 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_6, non-FMA build: SIMD kernel emitted by gen_hc2cdft_c for n = 6
 * (same generator flags as the FMA build, minus -fma).
 *
 *   Rp, Rm  arrays that are read with LD and overwritten in place with ST
 *   Ip, Im  stepped in the loop header but never dereferenced in this body
 *   W       twiddle table, read through LDW
 *   rs      stride passed to WS() to address rows 1 and 2
 *   mb, me  half-open range of the loop counter m (m advances by VL)
 *   ms      per-m stride: Rp/Ip move forward and Rm/Im move backward by VL * ms
 *
 * NOTE(review): VZMULJ, VZMULIJ, VBYI, VFNMS and VCONJ are defined outside
 * this file; their exact arithmetic is not restated here.
 */
static void hc2cfdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants: 0.25 (= 0.5 * 0.5), 0.866025403... (numerically sqrt(3)/2) and 0.5. */
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * ((TWVL / VL) * 10), then advanced by TWVL * 10 per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
V Ta, Tu, Tn, Tw, Ti, Tv, T1, T8, Tg, Tf, T7, T3, Te, T6, T2;
|
||||
V T4, T9, T5, Tk, Tm, Tj, Tl, Tc, Th, Tb, Td, Tr, Tp, Tq, To;
|
||||
V Tt, Ts, TA, Ty, Tz, Tx, TC, TB;
|
||||
/* Load all three Rp rows, and the three Rm rows with each passed through VCONJ. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T8 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
Te = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
Tf = VCONJ(Te);
|
||||
T6 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
T7 = VCONJ(T6);
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VCONJ(T2);
|
||||
/* Untwiddled row-0 sum combined with the row-1 difference twiddled by W[TWVL * 4]. */
T4 = VADD(T1, T3);
|
||||
T5 = LDW(&(W[TWVL * 4]));
|
||||
T9 = VZMULIJ(T5, VSUB(T7, T8));
|
||||
Ta = VADD(T4, T9);
|
||||
Tu = VSUB(T4, T9);
|
||||
/* Row-0 difference (twiddle W[0]) combined with the row-2 sum (twiddle W[TWVL * 6]). */
Tj = LDW(&(W[0]));
|
||||
Tk = VZMULIJ(Tj, VSUB(T3, T1));
|
||||
Tl = LDW(&(W[TWVL * 6]));
|
||||
Tm = VZMULJ(Tl, VADD(Tf, Tg));
|
||||
Tn = VADD(Tk, Tm);
|
||||
Tw = VSUB(Tm, Tk);
|
||||
/* Row-1 sum (twiddle W[TWVL * 2]) combined with the row-2 difference (twiddle W[TWVL * 8]). */
Tb = LDW(&(W[TWVL * 2]));
|
||||
Tc = VZMULJ(Tb, VADD(T7, T8));
|
||||
Td = LDW(&(W[TWVL * 8]));
|
||||
Th = VZMULIJ(Td, VSUB(Tf, Tg));
|
||||
Ti = VADD(Tc, Th);
|
||||
Tv = VSUB(Tc, Th);
|
||||
/* First group (Ta, Ti, Tn): the 0.5 scale is folded into Tr, Tp and Tq; the Rm value is conjugated. */
Tr = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(Tn, Ti))));
|
||||
To = VADD(Ti, Tn);
|
||||
Tp = VMUL(LDK(KP500000000), VADD(Ta, To));
|
||||
Tq = VFNMS(LDK(KP250000000), To, VMUL(LDK(KP500000000), Ta));
|
||||
ST(&(Rp[0]), Tp, ms, &(Rp[0]));
|
||||
Tt = VCONJ(VADD(Tq, Tr));
|
||||
ST(&(Rm[WS(rs, 1)]), Tt, -ms, &(Rm[WS(rs, 1)]));
|
||||
Ts = VSUB(Tq, Tr);
|
||||
ST(&(Rp[WS(rs, 2)]), Ts, ms, &(Rp[0]));
|
||||
/* Second group (Tu, Tv, Tw): same pattern; values written to Rm are conjugated. */
TA = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(Tw, Tv))));
|
||||
Tx = VADD(Tv, Tw);
|
||||
Ty = VCONJ(VMUL(LDK(KP500000000), VADD(Tu, Tx)));
|
||||
Tz = VFNMS(LDK(KP250000000), Tx, VMUL(LDK(KP500000000), Tu));
|
||||
ST(&(Rm[WS(rs, 2)]), Ty, -ms, &(Rm[0]));
|
||||
TC = VADD(Tz, TA);
|
||||
ST(&(Rp[WS(rs, 1)]), TC, ms, &(Rp[WS(rs, 1)]));
|
||||
TB = VCONJ(VSUB(Tz, TA));
|
||||
ST(&(Rm[0]), TB, -ms, &(Rm[0]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the non-FMA build of hc2cfdftv_6.  The leading 6 matches the
 * generator's "-n 6"; the trailing counts match the 27 additions,
 * 18 multiplications and 2 fused multiply/adds quoted in the header comment.
 * Field meanings are fixed by the hc2c_desc declaration outside this file.
 */
static const hc2c_desc desc = {
     6,
     XSIMD_STRING("hc2cfdftv_6"),
     twinstr,
     &GENUS,
     { 27, 18, 2, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_6 kernel and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_6)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_6, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
231
fftw-3.3.10/rdft/simd/common/hc2cfdftv_8.c
Normal file
231
fftw-3.3.10/rdft/simd/common/hc2cfdftv_8.c
Normal file
@@ -0,0 +1,231 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:47:22 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dit -name hc2cfdftv_8 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 41 FP additions, 40 FP multiplications,
|
||||
* (or, 23 additions, 22 multiplications, 18 fused multiply/add),
|
||||
* 52 stack variables, 2 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_8, FMA build: SIMD kernel emitted by gen_hc2cdft_c for n = 8.
 *
 *   Rp, Rm  arrays that are read with LD and overwritten in place with ST
 *   Ip, Im  stepped in the loop header but never dereferenced in this body
 *   W       twiddle table, read through LDW
 *   rs      stride passed to WS() to address rows 1 .. 3
 *   mb, me  half-open range of the loop counter m (m advances by VL)
 *   ms      per-m stride: Rp/Ip move forward and Rm/Im move backward by VL * ms
 *
 * NOTE(review): the V* helpers used below (VFMACONJ, VFNMSCONJ, VZMULJ,
 * VZMULIJ, VFMA, VFNMS, VFMAI, VFNMSI, VCONJ) are defined outside this file;
 * their exact arithmetic is not restated here.
 */
static void hc2cfdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants: 0.5 and 0.707106781... (numerically 1/sqrt(2)). */
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * ((TWVL / VL) * 14), then advanced by TWVL * 14 per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
V T8, Tt, TG, TF, TD, TC, Tn, Tu, T3, Tc, Tl, Ts, T7, Ta, Th;
|
||||
V Tq, T1, T2, Tb, Tj, Tk, Ti, Tr, T5, T6, T4, T9, Tf, Tg, Te;
|
||||
V Tp, Td, Tm, Tw, Tx, To, Tv, TM, TN, TK, TL, TA, TB, Ty, Tz;
|
||||
V TI, TJ, TE, TH;
|
||||
/* Row 0: load Rp[0] and Rm[0]; the second combination is multiplied by the twiddle at W[0]. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VFMACONJ(T2, T1);
|
||||
Tb = LDW(&(W[0]));
|
||||
Tc = VZMULIJ(Tb, VFNMSCONJ(T2, T1));
|
||||
/* Row 3: twiddles at W[TWVL * 12] and W[TWVL * 10]. */
Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tk = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Ti = LDW(&(W[TWVL * 12]));
|
||||
Tl = VZMULIJ(Ti, VFNMSCONJ(Tk, Tj));
|
||||
Tr = LDW(&(W[TWVL * 10]));
|
||||
Ts = VZMULJ(Tr, VFMACONJ(Tk, Tj));
|
||||
/* Row 2: twiddles at W[TWVL * 6] and W[TWVL * 8]. */
T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T4 = LDW(&(W[TWVL * 6]));
|
||||
T7 = VZMULJ(T4, VFMACONJ(T6, T5));
|
||||
T9 = LDW(&(W[TWVL * 8]));
|
||||
Ta = VZMULIJ(T9, VFNMSCONJ(T6, T5));
|
||||
/* Row 1: twiddles at W[TWVL * 4] and W[TWVL * 2]. */
Tf = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Tg = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Te = LDW(&(W[TWVL * 4]));
|
||||
Th = VZMULIJ(Te, VFNMSCONJ(Tg, Tf));
|
||||
Tp = LDW(&(W[TWVL * 2]));
|
||||
Tq = VZMULJ(Tp, VFMACONJ(Tg, Tf));
|
||||
/* Sums and differences of the eight intermediate values. */
T8 = VSUB(T3, T7);
|
||||
Tt = VSUB(Tq, Ts);
|
||||
TG = VADD(Th, Tl);
|
||||
TF = VADD(Tc, Ta);
|
||||
TD = VADD(Tq, Ts);
|
||||
TC = VADD(T3, T7);
|
||||
Td = VSUB(Ta, Tc);
|
||||
Tm = VSUB(Th, Tl);
|
||||
Tn = VADD(Td, Tm);
|
||||
Tu = VSUB(Tm, Td);
|
||||
/* Output pair Rp row 1 / Rm row 0; every value written to Rm is wrapped in VCONJ. */
To = VFMA(LDK(KP707106781), Tn, T8);
|
||||
Tv = VFNMS(LDK(KP707106781), Tu, Tt);
|
||||
Tw = VMUL(LDK(KP500000000), VFNMSI(Tv, To));
|
||||
Tx = VCONJ(VMUL(LDK(KP500000000), VFMAI(Tv, To)));
|
||||
ST(&(Rp[WS(rs, 1)]), Tw, ms, &(Rp[WS(rs, 1)]));
|
||||
ST(&(Rm[0]), Tx, -ms, &(Rm[0]));
|
||||
/* Output pair Rp row 0 / Rm row 3. */
TK = VADD(TC, TD);
|
||||
TL = VADD(TF, TG);
|
||||
TM = VMUL(LDK(KP500000000), VSUB(TK, TL));
|
||||
TN = VCONJ(VMUL(LDK(KP500000000), VADD(TL, TK)));
|
||||
ST(&(Rp[0]), TM, ms, &(Rp[0]));
|
||||
ST(&(Rm[WS(rs, 3)]), TN, -ms, &(Rm[WS(rs, 1)]));
|
||||
/* Output pair Rm row 2 / Rp row 3. */
Ty = VFNMS(LDK(KP707106781), Tn, T8);
|
||||
Tz = VFMA(LDK(KP707106781), Tu, Tt);
|
||||
TA = VCONJ(VMUL(LDK(KP500000000), VFNMSI(Tz, Ty)));
|
||||
TB = VMUL(LDK(KP500000000), VFMAI(Tz, Ty));
|
||||
ST(&(Rm[WS(rs, 2)]), TA, -ms, &(Rm[0]));
|
||||
ST(&(Rp[WS(rs, 3)]), TB, ms, &(Rp[WS(rs, 1)]));
|
||||
/* Output pair Rp row 2 / Rm row 1. */
TE = VSUB(TC, TD);
|
||||
TH = VSUB(TF, TG);
|
||||
TI = VMUL(LDK(KP500000000), VFMAI(TH, TE));
|
||||
TJ = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TH, TE)));
|
||||
ST(&(Rp[WS(rs, 2)]), TI, ms, &(Rp[0]));
|
||||
ST(&(Rm[WS(rs, 1)]), TJ, -ms, &(Rm[WS(rs, 1)]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the FMA build of hc2cfdftv_8.  The leading 8 matches the
 * generator's "-n 8"; the trailing counts match the 23 additions,
 * 22 multiplications and 18 fused multiply/adds quoted in the header comment.
 * Field meanings are fixed by the hc2c_desc declaration outside this file.
 */
static const hc2c_desc desc = {
     8,
     XSIMD_STRING("hc2cfdftv_8"),
     twinstr,
     &GENUS,
     { 23, 22, 18, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_8 kernel and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_8)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_8, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dit -name hc2cfdftv_8 -include rdft/simd/hc2cfv.h */
|
||||
|
||||
/*
|
||||
* This function contains 41 FP additions, 23 FP multiplications,
|
||||
* (or, 41 additions, 23 multiplications, 0 fused multiply/add),
|
||||
* 57 stack variables, 3 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/simd/hc2cfv.h"
|
||||
|
||||
/*
 * hc2cfdftv_8, non-FMA build: SIMD kernel emitted by gen_hc2cdft_c for n = 8
 * (same generator flags as the FMA build, minus -fma).
 *
 *   Rp, Rm  arrays that are read with LD and overwritten in place with ST
 *   Ip, Im  stepped in the loop header but never dereferenced in this body
 *   W       twiddle table, read through LDW
 *   rs      stride passed to WS() to address rows 1 .. 3
 *   mb, me  half-open range of the loop counter m (m advances by VL)
 *   ms      per-m stride: Rp/Ip move forward and Rm/Im move backward by VL * ms
 *
 * NOTE(review): VZMULJ, VZMULIJ, VBYI and VCONJ are defined outside this
 * file; their exact arithmetic is not restated here.
 */
static void hc2cfdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Constants: 0.707106781... (numerically 1/sqrt(2)), 0.353553390... (= 0.5 * 0.707106781...) and 0.5. */
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
|
||||
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * ((TWVL / VL) * 14), then advanced by TWVL * 14 per iteration. */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
V Ta, TE, Tr, TF, Tl, TK, Tw, TG, T1, T6, T3, T8, T2, T7, T4;
|
||||
V T9, T5, To, Tq, Tn, Tp, Tc, Th, Te, Tj, Td, Ti, Tf, Tk, Tb;
|
||||
V Tg, Tt, Tv, Ts, Tu, Ty, Tz, Tm, Tx, TC, TD, TA, TB, TI, TO;
|
||||
V TL, TP, TH, TJ, TM, TR, TN, TQ;
|
||||
/* Rows 0 and 2: load Rp values, and Rm values passed through VCONJ. */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
|
||||
T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
|
||||
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
|
||||
T3 = VCONJ(T2);
|
||||
T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
|
||||
T8 = VCONJ(T7);
|
||||
/* Row-0 sum (untwiddled) with row-2 sum (twiddle W[TWVL * 6]); TE already carries the 0.5 scale. */
T4 = VADD(T1, T3);
|
||||
T5 = LDW(&(W[TWVL * 6]));
|
||||
T9 = VZMULJ(T5, VADD(T6, T8));
|
||||
Ta = VADD(T4, T9);
|
||||
TE = VMUL(LDK(KP500000000), VSUB(T4, T9));
|
||||
/* Row-0 difference (twiddle W[0]) with row-2 difference (twiddle W[TWVL * 8]). */
Tn = LDW(&(W[0]));
|
||||
To = VZMULIJ(Tn, VSUB(T3, T1));
|
||||
Tp = LDW(&(W[TWVL * 8]));
|
||||
Tq = VZMULIJ(Tp, VSUB(T8, T6));
|
||||
Tr = VADD(To, Tq);
|
||||
TF = VSUB(To, Tq);
|
||||
/* Rows 1 and 3: load Rp values, and Rm values passed through VCONJ. */
Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Th = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
|
||||
Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Te = VCONJ(Td);
|
||||
Ti = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
|
||||
Tj = VCONJ(Ti);
|
||||
/* Row-1 sum (twiddle W[TWVL * 2]) with row-3 sum (twiddle W[TWVL * 10]). */
Tb = LDW(&(W[TWVL * 2]));
|
||||
Tf = VZMULJ(Tb, VADD(Tc, Te));
|
||||
Tg = LDW(&(W[TWVL * 10]));
|
||||
Tk = VZMULJ(Tg, VADD(Th, Tj));
|
||||
Tl = VADD(Tf, Tk);
|
||||
TK = VSUB(Tf, Tk);
|
||||
/* Row-1 difference (twiddle W[TWVL * 4]) with row-3 difference (twiddle W[TWVL * 12]). */
Ts = LDW(&(W[TWVL * 4]));
|
||||
Tt = VZMULIJ(Ts, VSUB(Te, Tc));
|
||||
Tu = LDW(&(W[TWVL * 12]));
|
||||
Tv = VZMULIJ(Tu, VSUB(Tj, Th));
|
||||
Tw = VADD(Tt, Tv);
|
||||
TG = VSUB(Tv, Tt);
|
||||
/* Output pair Rm row 3 / Rp row 0; every value written to Rm is wrapped in VCONJ. */
Tm = VADD(Ta, Tl);
|
||||
Tx = VADD(Tr, Tw);
|
||||
Ty = VCONJ(VMUL(LDK(KP500000000), VSUB(Tm, Tx)));
|
||||
Tz = VMUL(LDK(KP500000000), VADD(Tm, Tx));
|
||||
ST(&(Rm[WS(rs, 3)]), Ty, -ms, &(Rm[WS(rs, 1)]));
|
||||
ST(&(Rp[0]), Tz, ms, &(Rp[0]));
|
||||
/* Output pair Rm row 1 / Rp row 2. */
TA = VSUB(Ta, Tl);
|
||||
TB = VBYI(VSUB(Tw, Tr));
|
||||
TC = VCONJ(VMUL(LDK(KP500000000), VSUB(TA, TB)));
|
||||
TD = VMUL(LDK(KP500000000), VADD(TA, TB));
|
||||
ST(&(Rm[WS(rs, 1)]), TC, -ms, &(Rm[WS(rs, 1)]));
|
||||
ST(&(Rp[WS(rs, 2)]), TD, ms, &(Rp[0]));
|
||||
/* Remaining four outputs: the 0.5 scale is already folded into TE, TH (via KP353553390), TL and TP. */
TH = VMUL(LDK(KP353553390), VADD(TF, TG));
|
||||
TI = VADD(TE, TH);
|
||||
TO = VSUB(TE, TH);
|
||||
TJ = VMUL(LDK(KP707106781), VSUB(TG, TF));
|
||||
TL = VMUL(LDK(KP500000000), VBYI(VSUB(TJ, TK)));
|
||||
TP = VMUL(LDK(KP500000000), VBYI(VADD(TK, TJ)));
|
||||
TM = VCONJ(VSUB(TI, TL));
|
||||
ST(&(Rm[0]), TM, -ms, &(Rm[0]));
|
||||
TR = VADD(TO, TP);
|
||||
ST(&(Rp[WS(rs, 3)]), TR, ms, &(Rp[WS(rs, 1)]));
|
||||
TN = VADD(TI, TL);
|
||||
ST(&(Rp[WS(rs, 1)]), TN, ms, &(Rp[WS(rs, 1)]));
|
||||
TQ = VCONJ(VSUB(TO, TP));
|
||||
ST(&(Rm[WS(rs, 2)]), TQ, -ms, &(Rm[0]));
|
||||
}
|
||||
}
|
||||
VLEAVE();
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
VTW(1, 1),
|
||||
VTW(1, 2),
|
||||
VTW(1, 3),
|
||||
VTW(1, 4),
|
||||
VTW(1, 5),
|
||||
VTW(1, 6),
|
||||
VTW(1, 7),
|
||||
{ TW_NEXT, VL, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the non-FMA build of hc2cfdftv_8.  The leading 8 matches the
 * generator's "-n 8"; the trailing counts match the 41 additions,
 * 23 multiplications and 0 fused multiply/adds quoted in the header comment.
 * Field meanings are fixed by the hc2c_desc declaration outside this file.
 */
static const hc2c_desc desc = {
     8,
     XSIMD_STRING("hc2cfdftv_8"),
     twinstr,
     &GENUS,
     { 41, 23, 0, 0 }
};
|
||||
|
||||
/*
 * Registration entry point: passes the hc2cfdftv_8 kernel and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void XSIMD(codelet_hc2cfdftv_8)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_8, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
Reference in New Issue
Block a user