Updates

fftw-3.3.10/simd-support/Makefile.am (new file, 15 lines)
@@ -0,0 +1,15 @@
AM_CPPFLAGS = -I $(top_srcdir)
noinst_LTLIBRARIES = libsimd_support.la

libsimd_support_la_SOURCES = taint.c simd-common.h \
	x86-cpuid.h amd64-cpuid.h \
	simd-sse2.h sse2.c \
	avx.c simd-avx.h \
	avx-128-fma.c simd-avx-128-fma.h \
	avx2.c simd-avx2.h simd-avx2-128.h \
	avx512.c simd-avx512.h \
	kcvi.c simd-kcvi.h \
	altivec.c simd-altivec.h vsx.c simd-vsx.h \
	neon.c simd-neon.h \
	simd-generic128.h simd-generic256.h

fftw-3.3.10/simd-support/Makefile.in (new file, 680 lines)
@@ -0,0 +1,680 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@

# Copyright (C) 1994-2020 Free Software Foundation, Inc.

# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.

@SET_MAKE@

VPATH = @srcdir@
am__is_gnu_make = { \
  if test -z '$(MAKELEVEL)'; then \
    false; \
  elif test -n '$(MAKE_HOST)'; then \
    true; \
  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
    true; \
  else \
    false; \
  fi; \
}
am__make_running_with_option = \
  case $${target_option-} in \
      ?) ;; \
      *) echo "am__make_running_with_option: internal error: invalid" \
              "target option '$${target_option-}' specified" >&2; \
         exit 1;; \
  esac; \
  has_opt=no; \
  sane_makeflags=$$MAKEFLAGS; \
  if $(am__is_gnu_make); then \
    sane_makeflags=$$MFLAGS; \
  else \
    case $$MAKEFLAGS in \
      *\\[\ \ ]*) \
        bs=\\; \
        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
          | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
    esac; \
  fi; \
  skip_next=no; \
  strip_trailopt () \
  { \
    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
  }; \
  for flg in $$sane_makeflags; do \
    test $$skip_next = yes && { skip_next=no; continue; }; \
    case $$flg in \
      *=*|--*) continue;; \
        -*I) strip_trailopt 'I'; skip_next=yes;; \
      -*I?*) strip_trailopt 'I';; \
        -*O) strip_trailopt 'O'; skip_next=yes;; \
      -*O?*) strip_trailopt 'O';; \
        -*l) strip_trailopt 'l'; skip_next=yes;; \
      -*l?*) strip_trailopt 'l';; \
      -[dEDm]) skip_next=yes;; \
      -[JT]) skip_next=yes;; \
    esac; \
    case $$flg in \
      *$$target_option*) has_opt=yes; break;; \
    esac; \
  done; \
  test $$has_opt = yes
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
subdir = simd-support
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
	$(top_srcdir)/m4/acx_pthread.m4 \
	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
	$(top_srcdir)/m4/ax_gcc_version.m4 \
	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
	$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
	$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
libsimd_support_la_LIBADD =
am_libsimd_support_la_OBJECTS = taint.lo sse2.lo avx.lo avx-128-fma.lo \
	avx2.lo avx512.lo kcvi.lo altivec.lo vsx.lo neon.lo
libsimd_support_la_OBJECTS = $(am_libsimd_support_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo "  GEN     " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__maybe_remake_depfiles = depfiles
am__depfiles_remade = ./$(DEPDIR)/altivec.Plo \
	./$(DEPDIR)/avx-128-fma.Plo ./$(DEPDIR)/avx.Plo \
	./$(DEPDIR)/avx2.Plo ./$(DEPDIR)/avx512.Plo \
	./$(DEPDIR)/kcvi.Plo ./$(DEPDIR)/neon.Plo ./$(DEPDIR)/sse2.Plo \
	./$(DEPDIR)/taint.Plo ./$(DEPDIR)/vsx.Plo
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
	$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo "  CC      " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
	$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo "  CCLD    " $@;
am__v_CCLD_1 =
SOURCES = $(libsimd_support_la_SOURCES)
DIST_SOURCES = $(libsimd_support_la_SOURCES)
am__can_run_installinfo = \
  case $$AM_UPDATE_INFO_DIR in \
    n|no|NO) false;; \
    *) (install-info --version) >/dev/null 2>&1;; \
  esac
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates.  Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
  BEGIN { nonempty = 0; } \
  { items[$$0] = 1; nonempty = 1; } \
  END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique.  This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
  list='$(am__tagged_files)'; \
  unique=`for i in $$list; do \
    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
  done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
AM_CPPFLAGS = -I $(top_srcdir)
noinst_LTLIBRARIES = libsimd_support.la
libsimd_support_la_SOURCES = taint.c simd-common.h \
	x86-cpuid.h amd64-cpuid.h \
	simd-sse2.h sse2.c \
	avx.c simd-avx.h \
	avx-128-fma.c simd-avx-128-fma.h \
	avx2.c simd-avx2.h simd-avx2-128.h \
	avx512.c simd-avx512.h \
	kcvi.c simd-kcvi.h \
	altivec.c simd-altivec.h vsx.c simd-vsx.h \
	neon.c simd-neon.h \
	simd-generic128.h simd-generic256.h

all: all-am

.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
	@for dep in $?; do \
	  case '$(am__configure_deps)' in \
	    *$$dep*) \
	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
	        && { if test -f $@; then exit 0; else break; fi; }; \
	      exit 1;; \
	  esac; \
	done; \
	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu simd-support/Makefile'; \
	$(am__cd) $(top_srcdir) && \
	  $(AUTOMAKE) --gnu simd-support/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
	@case '$?' in \
	  *config.status*) \
	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
	  *) \
	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
	esac;

$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh

$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):

clean-noinstLTLIBRARIES:
	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
	@list='$(noinst_LTLIBRARIES)'; \
	locs=`for p in $$list; do echo $$p; done | \
	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
	      sort -u`; \
	test -z "$$locs" || { \
	  echo rm -f $${locs}; \
	  rm -f $${locs}; \
	}

libsimd_support.la: $(libsimd_support_la_OBJECTS) $(libsimd_support_la_DEPENDENCIES) $(EXTRA_libsimd_support_la_DEPENDENCIES)
	$(AM_V_CCLD)$(LINK) $(libsimd_support_la_OBJECTS) $(libsimd_support_la_LIBADD) $(LIBS)

mostlyclean-compile:
	-rm -f *.$(OBJEXT)

distclean-compile:
	-rm -f *.tab.c

@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/altivec.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx-128-fma.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx512.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kcvi.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/neon.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sse2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/taint.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vsx.Plo@am__quote@ # am--include-marker

$(am__depfiles_remade):
	@$(MKDIR_P) $(@D)
	@echo '# dummy' >$@-t && $(am__mv) $@-t $@

am--depfiles: $(am__depfiles_remade)

.c.o:
@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<

.c.obj:
@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`

.c.lo:
@am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<

mostlyclean-libtool:
	-rm -f *.lo

clean-libtool:
	-rm -rf .libs _libs

ID: $(am__tagged_files)
	$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-am
TAGS: tags

tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
	set x; \
	here=`pwd`; \
	$(am__define_uniq_tagged_files); \
	shift; \
	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
	  test -n "$$unique" || unique=$$empty_fix; \
	  if test $$# -gt 0; then \
	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
	      "$$@" $$unique; \
	  else \
	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
	      $$unique; \
	  fi; \
	fi
ctags: ctags-am

CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
	$(am__define_uniq_tagged_files); \
	test -z "$(CTAGS_ARGS)$$unique" \
	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
	     $$unique

GTAGS:
	here=`$(am__cd) $(top_builddir) && pwd` \
	  && $(am__cd) $(top_srcdir) \
	  && gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-am

cscopelist-am: $(am__tagged_files)
	list='$(am__tagged_files)'; \
	case "$(srcdir)" in \
	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
	  *) sdir=$(subdir)/$(srcdir) ;; \
	esac; \
	for i in $$list; do \
	  if test -f "$$i"; then \
	    echo "$(subdir)/$$i"; \
	  else \
	    echo "$$sdir/$$i"; \
	  fi; \
	done >> $(top_builddir)/cscope.files

distclean-tags:
	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags

distdir: $(BUILT_SOURCES)
	$(MAKE) $(AM_MAKEFLAGS) distdir-am

distdir-am: $(DISTFILES)
	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
	list='$(DISTFILES)'; \
	  dist_files=`for file in $$list; do echo $$file; done | \
	  sed -e "s|^$$srcdirstrip/||;t" \
	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
	case $$dist_files in \
	  */*) $(MKDIR_P) `echo "$$dist_files" | \
	                   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
	                   sort -u` ;; \
	esac; \
	for file in $$dist_files; do \
	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
	  if test -d $$d/$$file; then \
	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
	    if test -d "$(distdir)/$$file"; then \
	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
	    fi; \
	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
	    fi; \
	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
	  else \
	    test -f "$(distdir)/$$file" \
	    || cp -p $$d/$$file "$(distdir)/$$file" \
	    || exit 1; \
	  fi; \
	done
check-am: all-am
check: check-am
all-am: Makefile $(LTLIBRARIES)
installdirs:
install: install-am
install-exec: install-exec-am
install-data: install-data-am
uninstall: uninstall-am

install-am: all-am
	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am

installcheck: installcheck-am
install-strip:
	if test -z '$(STRIP)'; then \
	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
	      install; \
	else \
	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
	fi
mostlyclean-generic:

clean-generic:

distclean-generic:
	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)

maintainer-clean-generic:
	@echo "This command is intended for maintainers to use"
	@echo "it deletes files that may require special tools to rebuild."
clean: clean-am

clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
	mostlyclean-am

distclean: distclean-am
	-rm -f ./$(DEPDIR)/altivec.Plo
	-rm -f ./$(DEPDIR)/avx-128-fma.Plo
	-rm -f ./$(DEPDIR)/avx.Plo
	-rm -f ./$(DEPDIR)/avx2.Plo
	-rm -f ./$(DEPDIR)/avx512.Plo
	-rm -f ./$(DEPDIR)/kcvi.Plo
	-rm -f ./$(DEPDIR)/neon.Plo
	-rm -f ./$(DEPDIR)/sse2.Plo
	-rm -f ./$(DEPDIR)/taint.Plo
	-rm -f ./$(DEPDIR)/vsx.Plo
	-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
	distclean-tags

dvi: dvi-am

dvi-am:

html: html-am

html-am:

info: info-am

info-am:

install-data-am:

install-dvi: install-dvi-am

install-dvi-am:

install-exec-am:

install-html: install-html-am

install-html-am:

install-info: install-info-am

install-info-am:

install-man:

install-pdf: install-pdf-am

install-pdf-am:

install-ps: install-ps-am

install-ps-am:

installcheck-am:

maintainer-clean: maintainer-clean-am
	-rm -f ./$(DEPDIR)/altivec.Plo
	-rm -f ./$(DEPDIR)/avx-128-fma.Plo
	-rm -f ./$(DEPDIR)/avx.Plo
	-rm -f ./$(DEPDIR)/avx2.Plo
	-rm -f ./$(DEPDIR)/avx512.Plo
	-rm -f ./$(DEPDIR)/kcvi.Plo
	-rm -f ./$(DEPDIR)/neon.Plo
	-rm -f ./$(DEPDIR)/sse2.Plo
	-rm -f ./$(DEPDIR)/taint.Plo
	-rm -f ./$(DEPDIR)/vsx.Plo
	-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic

mostlyclean: mostlyclean-am

mostlyclean-am: mostlyclean-compile mostlyclean-generic \
	mostlyclean-libtool

pdf: pdf-am

pdf-am:

ps: ps-am

ps-am:

uninstall-am:

.MAKE: install-am install-strip

.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
	clean-generic clean-libtool clean-noinstLTLIBRARIES \
	cscopelist-am ctags ctags-am distclean distclean-compile \
	distclean-generic distclean-libtool distclean-tags distdir dvi \
	dvi-am html html-am info info-am install install-am \
	install-data install-data-am install-dvi install-dvi-am \
	install-exec install-exec-am install-html install-html-am \
	install-info install-info-am install-man install-pdf \
	install-pdf-am install-ps install-ps-am install-strip \
	installcheck installcheck-am installdirs maintainer-clean \
	maintainer-clean-generic mostlyclean mostlyclean-compile \
	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
	tags tags-am uninstall uninstall-am

.PRECIOUS: Makefile


# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

fftw-3.3.10/simd-support/altivec.c (new file, 80 lines)
@@ -0,0 +1,80 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#include "kernel/ifftw.h"

#if HAVE_ALTIVEC

#if HAVE_SYS_SYSCTL_H
# include <sys/sysctl.h>
#endif

#if HAVE_SYS_SYSCTL_H && HAVE_SYSCTL && defined(CTL_HW) && defined(HW_VECTORUNIT)
/* code for darwin */
static int really_have_altivec(void)
{
     int mib[2], altivecp;
     size_t len;
     mib[0] = CTL_HW;
     mib[1] = HW_VECTORUNIT;
     len = sizeof(altivecp);
     sysctl(mib, 2, &altivecp, &len, NULL, 0);
     return altivecp;
}
#else /* GNU/Linux and other non-Darwin systems (!HAVE_SYS_SYSCTL_H etc.) */

#include <signal.h>
#include <setjmp.h>

static jmp_buf jb;

static void sighandler(int x)
{
     longjmp(jb, 1);
}

static int really_have_altivec(void)
{
     void (*oldsig)(int);
     oldsig = signal(SIGILL, sighandler);
     if (setjmp(jb)) {
	  signal(SIGILL, oldsig);
	  return 0;
     } else {
	  __asm__ __volatile__ (".long 0x10000484"); /* vor 0,0,0 */
	  signal(SIGILL, oldsig);
	  return 1;
     }
     return 0;
}
#endif

int X(have_simd_altivec)(void)
{
     static int init = 0, res;
     if (!init) {
	  res = really_have_altivec();
	  init = 1;
     }
     return res;
}

#endif
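
The non-Darwin branch above uses a SIGILL probe: execute one AltiVec instruction and treat an illegal-instruction trap as "not supported". The same pattern generalizes to any optional instruction set. A minimal standalone sketch (illustrative only; the names probe_sigill and can_execute are hypothetical, not from the FFTW sources):

    #include <setjmp.h>
    #include <signal.h>

    static jmp_buf probe_jb;

    static void probe_sigill(int sig)
    {
         (void)sig;
         longjmp(probe_jb, 1);   /* jump back out of the faulting instruction */
    }

    /* Returns 1 if probe() executes without raising SIGILL, else 0. */
    static int can_execute(void (*probe)(void))
    {
         void (*old)(int) = signal(SIGILL, probe_sigill);
         int ok = 0;
         if (setjmp(probe_jb) == 0) {
              probe();            /* may trap on CPUs lacking the instruction */
              ok = 1;
         }
         signal(SIGILL, old);     /* always restore the previous handler */
         return ok;
    }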

fftw-3.3.10/simd-support/amd64-cpuid.h (new file, 148 lines)
@@ -0,0 +1,148 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif

#ifdef _MSC_VER
#include <intrin.h>
#if (_MSC_VER >= 1600) && !defined(__INTEL_COMPILER)
#include <immintrin.h>
#endif
#endif

/* cpuid version to get all registers. Donated by Erik Lindahl from Gromacs. */
static inline void
cpuid_all(int level, int ecxval, int *eax, int *ebx, int *ecx, int *edx)
{
# ifdef _MSC_VER
     int CPUInfo[4];

#if (_MSC_VER > 1500) || (_MSC_VER == 1500 & _MSC_FULL_VER >= 150030729)
     /* MSVC 9.0 SP1 or later */
     __cpuidex(CPUInfo, level, ecxval);
#else
     __cpuid(CPUInfo, level);
#endif
     *eax = CPUInfo[0];
     *ebx = CPUInfo[1];
     *ecx = CPUInfo[2];
     *edx = CPUInfo[3];

# else
     /* Not MSVC */
     *eax = level;
     *ecx = ecxval;
     *ebx = 0;
     *edx = 0;
     /* No need to save ebx if we are not in pic mode */
     __asm__ ("cpuid \n\t"
              : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
# endif
}


static inline int cpuid_ecx(int op)
{
# ifdef _MSC_VER
#  ifdef __INTEL_COMPILER
     int result;
     _asm {
          push rbx
          mov eax,op
          cpuid
          mov result,ecx
          pop rbx
     }
     return result;
#  else
     int cpu_info[4];
     __cpuid(cpu_info,op);
     return cpu_info[2];
#  endif
# else
     int eax, ecx = 0, edx;

     __asm__("pushq %%rbx\n\tcpuid\n\tpopq %%rbx"
             : "=a" (eax), "+c" (ecx), "=d" (edx)
             : "a" (op));
     return ecx;
# endif
}

static inline int cpuid_ebx(int op)
{
# ifdef _MSC_VER
#  ifdef __INTEL_COMPILER
     int result;
     _asm {
          push rbx
          mov eax,op
          cpuid
          mov result,ebx
          pop rbx
     }
     return result;
#  else
     int cpu_info[4];
     __cpuid(cpu_info,op);
     return cpu_info[1];
#  endif
# else
     int eax, ecx = 0, edx;

     __asm__("pushq %%rbx\n\tcpuid\nmov %%ebx,%%ecx\n\tpopq %%rbx"
             : "=a" (eax), "+c" (ecx), "=d" (edx)
             : "a" (op));
     return ecx;
# endif
}

static inline int xgetbv_eax(int op)
{
# ifdef _MSC_VER
#  ifdef __INTEL_COMPILER
     int veax, vedx;
     _asm {
          mov ecx,op
          xgetbv
          mov veax,eax
          mov vedx,edx
     }
     return veax;
#  else
#  if defined(_MSC_VER) && (_MSC_VER >= 1600)
     unsigned __int64 result;
     result = _xgetbv(op);
     return (int)result;
#  else
#   error "Need at least Visual Studio 10 SP1 for AVX support"
#  endif
#  endif
# else
     int eax, edx;
     __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (op));
     return eax;
#endif
}
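
For context, the register layout cpuid_all() returns is the standard CPUID one: leaf 0 reports the maximum standard leaf in EAX and the vendor string across EBX, EDX, ECX in that order. A self-contained sketch for GCC/Clang on x86-64 (illustrative only; it re-declares a minimal wrapper rather than including the header above):

    #include <stdio.h>
    #include <string.h>

    static void cpuid_all(int level, int ecxval,
                          int *eax, int *ebx, int *ecx, int *edx)
    {
         *eax = level; *ecx = ecxval; *ebx = 0; *edx = 0;
         __asm__("cpuid" : "+a"(*eax), "+b"(*ebx), "+c"(*ecx), "+d"(*edx));
    }

    int main(void)
    {
         int eax, ebx, ecx, edx;
         char vendor[13];
         cpuid_all(0, 0, &eax, &ebx, &ecx, &edx);
         /* vendor string is laid out EBX:EDX:ECX, e.g. "AuthenticAMD" */
         memcpy(vendor + 0, &ebx, 4);
         memcpy(vendor + 4, &edx, 4);
         memcpy(vendor + 8, &ecx, 4);
         vendor[12] = '\0';
         printf("%s\n", vendor);
         return 0;
    }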

fftw-3.3.10/simd-support/avx-128-fma.c (new file, 57 lines)
@@ -0,0 +1,57 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#include "kernel/ifftw.h"

#if HAVE_AVX_128_FMA

#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif

int X(have_simd_avx_128_fma)(void)
{
     static int init = 0, res = 0;
     int eax,ebx,ecx,edx;

     if (!init)
     {
         /* Check if this is an AMD CPU */
         cpuid_all(0,0,&eax,&ebx,&ecx,&edx);

         /* "AuthenticAMD": ebx=0x68747541 "Auth", edx=0x69746e65 "enti", ecx=0x444d4163 "cAMD" */
         if (ebx==0x68747541 && ecx==0x444d4163 && edx==0x69746e65)
         {
             /* OK, this is an AMD CPU. Check if we support FMA4 */
             cpuid_all(0x80000001,0,&eax,&ebx,&ecx,&edx);
             if(ecx & (1<<16))
             {
                 res = 1;
             }
         }
         init = 1;
     }
     return res;
}

#endif

fftw-3.3.10/simd-support/avx.c (new file, 54 lines)
@@ -0,0 +1,54 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#include "kernel/ifftw.h"

#if HAVE_AVX

#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif

int X(have_simd_avx)(void)
{
     static int init = 0, res = 0;
     int max_stdfn, eax, ebx, ecx, edx;

     if (!init) {
	  cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
	  max_stdfn = eax;
	  if (max_stdfn >= 0x1) {
	       /* have AVX and OSXSAVE? (implies XGETBV exists) */
	       cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
	       if ((ecx & 0x18000000) == 0x18000000) {
		    /* have OS support for XMM, YMM? */
		    res = ((xgetbv_eax(0) & 0x6) == 0x6);
	       }
	  }
	  init = 1;
     }
     return res;
}

#endif

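
The magic constants above decode as follows, per the standard CPUID/XGETBV documentation: 0x18000000 is bits 27 (OSXSAVE) and 28 (AVX) of CPUID.1:ECX, and 0x6 is bits 1 (SSE/XMM state) and 2 (AVX/YMM state) of the XCR0 register returned by XGETBV. Spelled out with illustrative names (not identifiers from the FFTW sources):

    #define CPUID1_ECX_OSXSAVE (1 << 27)  /* OS has enabled XSAVE/XGETBV  */
    #define CPUID1_ECX_AVX     (1 << 28)  /* CPU implements AVX           */
    /* CPUID1_ECX_OSXSAVE | CPUID1_ECX_AVX == 0x18000000                  */

    #define XCR0_XMM_STATE     (1 << 1)   /* OS saves/restores XMM state  */
    #define XCR0_YMM_STATE     (1 << 2)   /* OS saves/restores YMM state  */
    /* XCR0_XMM_STATE | XCR0_YMM_STATE == 0x6                             */

Both halves matter: a CPU can implement AVX while the operating system never saves the YMM registers on context switch, in which case using AVX would silently corrupt data.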

fftw-3.3.10/simd-support/avx2.c (new file, 68 lines)
@@ -0,0 +1,68 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#include "kernel/ifftw.h"

#if HAVE_AVX2

#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif

int X(have_simd_avx2_128)(void)
{
     static int init = 0, res;
     int max_stdfn, eax, ebx, ecx, edx;

     if (!init) {
	  cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
	  max_stdfn = eax;
	  if (max_stdfn >= 0x1) {
	       /* have AVX and OSXSAVE? (implies XGETBV exists) */
	       cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
	       if ((ecx & 0x18000000) == 0x18000000) {
		    /* have AVX2? */
		    cpuid_all(7,0,&eax,&ebx,&ecx,&edx);
		    if (ebx & (1 << 5)) {
			 /* have OS support for XMM, YMM? */
			 res = ((xgetbv_eax(0) & 0x6) == 0x6);
		    }
	       }
	  }
	  init = 1;
     }
     return res;
}

int X(have_simd_avx2)(void)
{
     /*
      * For now 256-bit AVX2 support is identical to 128-bit.
      * This might change in the future if AMD releases AVX2-capable
      * chips that work better with the 128-bit flavor, but since AMD
      * might implement 256-bit AVX2 efficiently by then, we don't
      * want to disable it before we know.
      */
     return X(have_simd_avx2_128)();
}
#endif
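
The extra step relative to plain AVX is a query of CPUID leaf 7 (subleaf 0), where EBX bit 5 is the AVX2 feature flag; the OS-support test is then the same XCR0 check as for AVX. As an illustrative one-liner (name not from the sources):

    #define CPUID7_EBX_AVX2 (1 << 5)  /* CPUID.(EAX=7,ECX=0):EBX bit 5 */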

fftw-3.3.10/simd-support/avx512.c (new file, 70 lines)
@@ -0,0 +1,70 @@
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 * Copyright (c) 2012-2013 Romain Dolbeau
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

#include "kernel/ifftw.h"

#if HAVE_AVX512

#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)

#include "amd64-cpuid.h"

int X(have_simd_avx512)(void)
{
     static int init = 0, res;
     int max_stdfn, eax, ebx, ecx, edx;

     /* NOTE: this code is a total guess. I don't have an avx512
	machine available. The code contributed by Erik Lindahl would
	crash on a machine without XGETBV, so I had to guess a fix. */
     if (!init) {
	  cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
	  max_stdfn = eax;
	  if (max_stdfn >= 0x1) {
	       /* have OSXSAVE? (implies XGETBV exists) */
	       cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
	       if ((ecx & 0x08000000) == 0x08000000) {
		    /* have AVX512? */
		    cpuid_all(7,0,&eax,&ebx,&ecx,&edx);
		    if (ebx & (1 << 16)) {
			 /* have OS support for XMM, YMM, ZMM */
			 int zmm_ymm_xmm = (7 << 5) | (1 << 2) | (1 << 1);
			 res = ((xgetbv_eax(0) & zmm_ymm_xmm) == zmm_ymm_xmm);
		    }
	       }
	  }
	  init = 1;
     }

     return res;
}

#else /* 32-bit code */

#error "Avx512 is 64 bits only"

#endif

#endif
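
The zmm_ymm_xmm mask packs the XCR0 state components AVX-512 needs: bits 5-7 (the k0-k7 opmask registers, the upper 256 bits of ZMM0-ZMM15, and ZMM16-ZMM31) plus the YMM (bit 2) and XMM (bit 1) bits, so (7 << 5) | (1 << 2) | (1 << 1) == 0xE6. The CPUID leaf 7 flag tested is EBX bit 16, AVX-512 Foundation. With illustrative names (not from the sources):

    #define CPUID7_EBX_AVX512F (1 << 16)  /* AVX-512 Foundation          */
    #define XCR0_OPMASK        (1 << 5)   /* k0-k7 opmask registers      */
    #define XCR0_ZMM_HI256     (1 << 6)   /* upper halves of ZMM0-ZMM15  */
    #define XCR0_HI16_ZMM      (1 << 7)   /* ZMM16-ZMM31                 */
    /* XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM
       | XCR0_YMM_STATE | XCR0_XMM_STATE == 0xE6                          */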

fftw-3.3.10/simd-support/kcvi.c (new file, 50 lines)
@@ -0,0 +1,50 @@
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 * Copyright (c) 2012-2013 Romain Dolbeau
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

#include "kernel/ifftw.h"

#if HAVE_KCVI

#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)

#include "amd64-cpuid.h"

int X(have_simd_kcvi)(void)
{
     static int init = 0, res;
     if (!init) {
	  res = 1;
	  init = 1;
     }
     return res;
}

#else /* 32-bit code */

#error "KCvi is 64 bits only"

#endif

#endif

fftw-3.3.10/simd-support/neon.c (new file, 76 lines)
@@ -0,0 +1,76 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#include "kernel/ifftw.h"

#if HAVE_NEON

/* check for an environment where signals are known to work */
#if defined(unix) || defined(linux)
# include <signal.h>
# include <setjmp.h>

static jmp_buf jb;

static void sighandler(int x)
{
     UNUSED(x);
     longjmp(jb, 1);
}

static int really_have_neon(void)
{
     void (*oldsig)(int);
     oldsig = signal(SIGILL, sighandler);
     if (setjmp(jb)) {
	  signal(SIGILL, oldsig);
	  return 0;
     } else {
	  /* paranoia: encode the instruction in binary because the
	     assembler may not recognize it without -mfpu=neon */
	  /*asm volatile ("vand q0, q0, q0");*/
	  asm volatile (".long 0xf2000150");
	  signal(SIGILL, oldsig);
	  return 1;
     }
}

int X(have_simd_neon)(void)
{
     static int init = 0, res;

     if (!init) {
	  res = really_have_neon();
	  init = 1;
     }
     return res;
}


#else
/* don't know how to autodetect NEON; assume it is present */
int X(have_simd_neon)(void)
{
     return 1;
}
#endif

#endif

fftw-3.3.10/simd-support/simd-altivec.h (new file, 297 lines)
@@ -0,0 +1,297 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef FFTW_SINGLE
#error "ALTIVEC only works in single precision"
#endif

/* define these unconditionally, because they are used by
   taint.c which is compiled without altivec */
#define SIMD_SUFFIX _altivec /* for renaming */
#define VL 2 /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OKA

#if !defined(__VEC__) && !defined(FAKE__VEC__)
#  error "compiling simd-altivec.h requires -maltivec or equivalent"
#endif

#ifdef HAVE_ALTIVEC_H
#  include <altivec.h>
#endif

typedef vector float V;
#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val, val, val)

static inline V VADD(V a, V b) { return vec_add(a, b); }
static inline V VSUB(V a, V b) { return vec_sub(a, b); }
static inline V VFMA(V a, V b, V c) { return vec_madd(a, b, c); }
static inline V VFNMS(V a, V b, V c) { return vec_nmsub(a, b, c); }

static inline V VMUL(V a, V b)
{
     DVK(zero, -0.0);
     return VFMA(a, b, zero);
}

static inline V VFMS(V a, V b, V c) { return VSUB(VMUL(a, b), c); }

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     UNUSED(ivs);
     UNUSED(aligned_like);
     return vec_ld(0, x);
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     /* common subexpressions */
     const INT fivs = sizeof(R) * ivs;
     /* you are not expected to understand this: */
     const vector unsigned int perm = VLIT(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
     vector unsigned char ml = vec_lvsr(fivs + 8, aligned_like);
     vector unsigned char mh = vec_lvsl(0, aligned_like);
     vector unsigned char msk =
	  (vector unsigned char)vec_sel((V)mh, (V)ml, perm);
     /* end of common subexpressions */

     return vec_perm(vec_ld(0, x), vec_ld(fivs, x), msk);
}

/* store lower half */
static inline void STH(R *x, V v, R *aligned_like)
{
     v = vec_perm(v, v, vec_lvsr(0, aligned_like));
     vec_ste(v, 0, x);
     vec_ste(v, sizeof(R), x);
}

static inline void STL(R *x, V v, INT ovs, R *aligned_like)
{
     const INT fovs = sizeof(R) * ovs;
     v = vec_perm(v, v, vec_lvsr(fovs + 8, aligned_like));
     vec_ste(v, fovs, x);
     vec_ste(v, sizeof(R) + fovs, x);
}

static inline void STA(R *x, V v, INT ovs, R *aligned_like)
{
     UNUSED(ovs);
     UNUSED(aligned_like);
     vec_st(v, 0, x);
}

static inline void ST(R *x, V v, INT ovs, R *aligned_like)
{
     /* WARNING: the extra_iter hack depends upon STH occurring after
	STL */
     STL(x, v, ovs, aligned_like);
     STH(x, v, aligned_like);
}

#define STM2(x, v, ovs, aligned_like) /* no-op */

static inline void STN2(R *x, V v0, V v1, INT ovs)
{
     const INT fovs = sizeof(R) * ovs;
     const vector unsigned int even =
	  VLIT(0x00010203, 0x04050607, 0x10111213, 0x14151617);
     const vector unsigned int odd =
	  VLIT(0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f);
     vec_st(vec_perm(v0, v1, (vector unsigned char)even), 0, x);
     vec_st(vec_perm(v0, v1, (vector unsigned char)odd), fovs, x);
}

#define STM4(x, v, ovs, aligned_like) /* no-op */

static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
     const INT fovs = sizeof(R) * ovs;
     V x0 = vec_mergeh(v0, v2);
     V x1 = vec_mergel(v0, v2);
     V x2 = vec_mergeh(v1, v3);
     V x3 = vec_mergel(v1, v3);
     V y0 = vec_mergeh(x0, x2);
     V y1 = vec_mergel(x0, x2);
     V y2 = vec_mergeh(x1, x3);
     V y3 = vec_mergel(x1, x3);
     vec_st(y0, 0, x);
     vec_st(y1, fovs, x);
     vec_st(y2, 2 * fovs, x);
     vec_st(y3, 3 * fovs, x);
}

static inline V FLIP_RI(V x)
{
     const vector unsigned int perm =
	  VLIT(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
     return vec_perm(x, x, (vector unsigned char)perm);
}

static inline V VCONJ(V x)
{
     const V pmpm = VLIT(0.0, -0.0, 0.0, -0.0);
     return vec_xor(x, pmpm);
}

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}

static inline V VFMAI(V b, V c)
{
     const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
     return VFMA(FLIP_RI(b), mpmp, c);
}

static inline V VFNMSI(V b, V c)
{
     const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
     return VFNMS(FLIP_RI(b), mpmp, c);
}

static inline V VFMACONJ(V b, V c)
{
     const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VFMA(b, pmpm, c);
}

static inline V VFNMSCONJ(V b, V c)
{
     const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VFNMS(b, pmpm, c);
}

static inline V VFMSCONJ(V b, V c)
{
     return VSUB(VCONJ(b), c);
}

static inline V VZMUL(V tx, V sr)
{
     const vector unsigned int real =
	  VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
     const vector unsigned int imag =
	  VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
     V si = VBYI(sr);
     V tr = vec_perm(tx, tx, (vector unsigned char)real);
     V ti = vec_perm(tx, tx, (vector unsigned char)imag);
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V VZMULJ(V tx, V sr)
{
     const vector unsigned int real =
	  VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
     const vector unsigned int imag =
	  VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
     V si = VBYI(sr);
     V tr = vec_perm(tx, tx, (vector unsigned char)real);
     V ti = vec_perm(tx, tx, (vector unsigned char)imag);
     return VFNMS(ti, si, VMUL(tr, sr));
}

static inline V VZMULI(V tx, V si)
{
     const vector unsigned int real =
	  VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
     const vector unsigned int imag =
	  VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
     V sr = VBYI(si);
     V tr = vec_perm(tx, tx, (vector unsigned char)real);
     V ti = vec_perm(tx, tx, (vector unsigned char)imag);
     return VFNMS(ti, si, VMUL(tr, sr));
}

static inline V VZMULIJ(V tx, V si)
{
     const vector unsigned int real =
	  VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
     const vector unsigned int imag =
	  VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
     V sr = VBYI(si);
     V tr = vec_perm(tx, tx, (vector unsigned char)real);
     V ti = vec_perm(tx, tx, (vector unsigned char)imag);
     return VFMA(ti, si, VMUL(tr, sr));
}

/* twiddle storage #1: compact, slower */
#define VTW1(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#define TWVL1 (VL)

static inline V BYTW1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = VBYI(sr);
     V tx = twp[0];
     V tr = vec_mergeh(tx, tx);
     V ti = vec_mergel(tx, tx);
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V BYTWJ1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = VBYI(sr);
     V tx = twp[0];
     V tr = vec_mergeh(tx, tx);
     V ti = vec_mergel(tx, tx);
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#define VTW2(v,x) \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL3 (VL)

/* twiddle storage for split arrays */
#define VTWS(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
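
For reference, the complex-arithmetic helpers in this header implement the following products, where each vector holds two complex numbers and t = tr + i*ti. This summary is derived from the code above, not text from the file itself:

    /* VZMUL(t, s)   = t * s            = tr*s + ti*(i*s)
       VZMULJ(t, s)  = conj(t) * s      = tr*s - ti*(i*s)
       VZMULI(t, s)  = i * (t * s)
       VZMULIJ(t, s) = i * (conj(t) * s)
       where i*s is computed by VBYI(s) = FLIP_RI(VCONJ(s)).  */

Each variant costs one vec_perm pair to broadcast tr and ti, one multiply, and one fused multiply-add (or multiply-subtract), which is why VMUL itself is written as an FMA against a zero constant.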

fftw-3.3.10/simd-support/simd-avx-128-fma.h (new file, 332 lines; listing truncated below)
@@ -0,0 +1,332 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * 128-bit AVX support by Erik Lindahl, 2015.
 * Erik Lindahl hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif

#define SIMD_SUFFIX _avx_128_fma /* for renaming */
|
||||
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif

#include <immintrin.h>
#ifdef _MSC_VER
#  include <intrin.h>
#elif defined (__GNUC__)
#  include <x86intrin.h>
#endif

#if !(defined(__AVX__) && defined(__FMA4__)) /* sanity check */
#error "compiling simd-avx-128-fma.h without -mavx or -mfma4"
#endif

typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define VPERM1 SUFF(_mm_permute_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)

#define SHUFVALS(fp0,fp1,fp2,fp3) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define VDUPL(x) DS(_mm_permute_pd(x,0), _mm_moveldup_ps(x))
#define VDUPH(x) DS(_mm_permute_pd(x,3), _mm_movehdup_ps(x))
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))

#define VLIT(x0, x1) DS(_mm_set_pd(x0, x1), _mm_set_ps(x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return *(const V *)x;
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     *(V *)x = v;
}

#ifdef FFTW_SINGLE

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     V var;
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
     var = LOADL(x, SUFF(_mm_undefined_p)());
     var = LOADH(x + ivs, var);
#else
     var = LOADL(x, var);
     var = LOADH(x + ivs, var);
#endif
     return var;
}

#  ifdef _MSC_VER
#    pragma warning(default : 4700)
#    pragma runtime_checks("u", restore)
#  endif

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon STOREL occurring
        after STOREH */
     STOREH(x + ovs, v);
     STOREL(x, v);
}

#else /* ! FFTW_SINGLE */
#  define LD LDA
#  define ST STA
#endif

#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */

#ifdef FFTW_SINGLE
#  define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
   deciding "it would be infrequent that people would want to pass more
   than 3 [__m128 parameters] by value."  3 parameters ought to be enough
   for anybody. */
#  define STN4(x, v0, v1, v2, v3, ovs)           \
{                                                \
     V xxx0, xxx1, xxx2, xxx3;                   \
     xxx0 = UNPCKL(v0, v2);                      \
     xxx1 = UNPCKH(v0, v2);                      \
     xxx2 = UNPCKL(v1, v3);                      \
     xxx3 = UNPCKH(v1, v3);                      \
     STA(x, UNPCKL(xxx0, xxx2), 0, 0);           \
     STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0);     \
     STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \
     STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     STOREL(x, v);
     STOREH(x + ovs, v);
}
#  define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif

static inline V FLIP_RI(V x)
{
     return VPERM1(x, DS(1, SHUFVALS(1, 0, 3, 2)));
}

static inline V VCONJ(V x)
{
     /* Produce a SIMD vector[VL] of (0 + -0i).

        We really want to write this:

           V pmpm = VLIT(-0.0, 0.0);

        but historically some compilers have ignored the distinction
        between +0 and -0.  It looks like 'gcc-8 -fast-math' treats -0
        as 0 too.
      */
     union uvec {
          unsigned u[4];
          V v;
     };
     static const union uvec pmpm = {
#ifdef FFTW_SINGLE
          { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
          { 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
     };
     return VXOR(pmpm.v, x);
}

static inline V VBYI(V x)
{
     x = VCONJ(x);
     x = FLIP_RI(x);
     return x;
}

/* FMA support */
#define VFMA(a, b, c) SUFF(_mm_macc_p)(a,b,c)
#define VFNMS(a, b, c) SUFF(_mm_nmacc_p)(a,b,c)
#define VFMS(a, b, c) SUFF(_mm_msub_p)(a,b,c)
#define VFMAI(b, c) SUFF(_mm_addsub_p)(c,FLIP_RI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm_addsub_p)(c,b)

static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(tr, sr);
     ti = VMUL(ti, FLIP_RI(sr));
     return SUFF(_mm_addsub_p)(tr,ti);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     tr = VMUL(tr, FLIP_RI(sr));
     return SUFF(_mm_addsub_p)(ti,tr);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V tx = twp[0];
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(tr, sr);
     ti = VMUL(ti, FLIP_RI(sr));
     return SUFF(_mm_addsub_p)(tr,ti);
}
static inline V BYTWJ1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V tx = twp[0];
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
    {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
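The DS()/SUFF() pair at the top of this header drives the entire single/double dispatch: every `_mm_*_p` macro is completed to `_mm_*_ps` or `_mm_*_pd` by token pasting. A small stand-alone illustration of the mechanism follows; it stringifies the expansion instead of calling the intrinsic so it runs anywhere, and the STR helpers are invented for the demo.

#include <stdio.h>

#define FFTW_SINGLE /* pretend we build the single-precision library */

#ifdef FFTW_SINGLE
# define DS(d,s) s
# define SUFF(name) name ## s
#else
# define DS(d,s) d
# define SUFF(name) name ## d
#endif

#define VADD SUFF(_mm_add_p) /* expands to _mm_add_ps here */

#define STR2(x) #x
#define STR(x) STR2(x)

int main(void)
{
    printf("VADD  -> %s\n", STR(VADD));        /* prints _mm_add_ps */
    printf("real  -> %d bytes\n", DS(8, 4));   /* sizeof one scalar */
    return 0;
}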
404
fftw-3.3.10/simd-support/simd-avx.h
Normal file
@@ -0,0 +1,404 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif

#define SIMD_SUFFIX _avx /* for renaming */
#define VL DS(2, 4) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(__AVX__) /* sanity check */
#error "compiling simd-avx.h without -mavx"
#endif

#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif

#include <immintrin.h>

typedef DS(__m256d, __m256) V;
#define VADD SUFF(_mm256_add_p)
#define VSUB SUFF(_mm256_sub_p)
#define VMUL SUFF(_mm256_mul_p)
#define VXOR SUFF(_mm256_xor_p)
#define VSHUF SUFF(_mm256_shuffle_p)

#define SHUFVALD(fp0,fp1) \
    (((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
#define SHUFVALS(fp0,fp1,fp2,fp3) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define VDUPL(x) DS(_mm256_unpacklo_pd(x, x), VSHUF(x, x, SHUFVALS(0, 0, 2, 2)))
#define VDUPH(x) DS(_mm256_unpackhi_pd(x, x), VSHUF(x, x, SHUFVALS(1, 1, 3, 3)))

#define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return SUFF(_mm256_loadu_p)(x);
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     SUFF(_mm256_storeu_p)(x, v);
}

#if FFTW_SINGLE

#  ifdef _MSC_VER
     /* Temporarily disable the warning "uninitialized local variable
        'name' used" and runtime checks for using a variable before it is
        defined which is erroneously triggered by the LOADL0 / LOADH macros
        as they only modify VAL partly each. */
#    ifndef __INTEL_COMPILER
#      pragma warning(disable : 4700)
#      pragma runtime_checks("u", off)
#    endif
#  endif
#  ifdef __INTEL_COMPILER
#    pragma warning(disable : 592)
#  endif

#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)

/* it seems like the only AVX way to store 4 complex floats is to
   extract two pairs of complex floats into two __m128 registers, and
   then use SSE-like half-stores.  Similarly, to load 4 complex
   floats, we load two pairs of complex floats into two __m128
   registers, and then pack the two __m128 registers into one __m256
   value. */
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     __m128 l, h;
     V v;
     (void)aligned_like; /* UNUSED */
     l = LOADL(x, l);
     l = LOADH(x + ivs, l);
     h = LOADL(x + 2*ivs, h);
     h = LOADH(x + 3*ivs, h);
     v = _mm256_castps128_ps256(l);
     v = _mm256_insertf128_ps(v, h, 1);
     return v;
}

#  ifdef _MSC_VER
#    ifndef __INTEL_COMPILER
#      pragma warning(default : 4700)
#      pragma runtime_checks("u", restore)
#    endif
#  endif
#  ifdef __INTEL_COMPILER
#    pragma warning(default : 592)
#  endif

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     __m128 h = _mm256_extractf128_ps(v, 1);
     __m128 l = _mm256_castps256_ps128(v);
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon STOREL occurring
        after STOREH */
     STOREH(x + 3*ovs, h);
     STOREL(x + 2*ovs, h);
     STOREH(x + ovs, l);
     STOREL(x, l);
}

#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
     V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
     V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
     __m128 h0 = _mm256_extractf128_ps(x0, 1);
     __m128 l0 = _mm256_castps256_ps128(x0);
     __m128 h1 = _mm256_extractf128_ps(x1, 1);
     __m128 l1 = _mm256_castps256_ps128(x1);

     *(__m128 *)(x + 3*ovs) = h1;
     *(__m128 *)(x + 2*ovs) = h0;
     *(__m128 *)(x + 1*ovs) = l1;
     *(__m128 *)(x + 0*ovs) = l0;
}

#define STM4(x, v, ovs, aligned_like) /* no-op */
#define STN4(x, v0, v1, v2, v3, ovs)                            \
{                                                               \
     V xxx0, xxx1, xxx2, xxx3;                                  \
     V yyy0, yyy1, yyy2, yyy3;                                  \
     xxx0 = _mm256_unpacklo_ps(v0, v2);                         \
     xxx1 = _mm256_unpackhi_ps(v0, v2);                         \
     xxx2 = _mm256_unpacklo_ps(v1, v3);                         \
     xxx3 = _mm256_unpackhi_ps(v1, v3);                         \
     yyy0 = _mm256_unpacklo_ps(xxx0, xxx2);                     \
     yyy1 = _mm256_unpackhi_ps(xxx0, xxx2);                     \
     yyy2 = _mm256_unpacklo_ps(xxx1, xxx3);                     \
     yyy3 = _mm256_unpackhi_ps(xxx1, xxx3);                     \
     *(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0);   \
     *(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1); \
     *(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1);   \
     *(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1); \
     *(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2);   \
     *(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1); \
     *(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3);   \
     *(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1); \
}

#else
static inline __m128d VMOVAPD_LD(const R *x)
{
     /* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
        into a 256-bit vmovapd, which requires 32-byte alignment instead of
        16-byte alignment.

        Force the use of vmovapd via asm until compilers stabilize.
     */
#if defined(__GNUC__)
     __m128d var;
     __asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
     return var;
#else
     return *(const __m128d *)x;
#endif
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     V var;
     (void)aligned_like; /* UNUSED */
     var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
     var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
     return var;
}

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon the store of the low
        part occurring after the store of the high part */
     *(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
     *(__m128d *)x = _mm256_castpd256_pd128(v);
}


#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */

/* STN4 is a macro, not a function, thanks to Visual C++ developers
   deciding "it would be infrequent that people would want to pass more
   than 3 [__m128 parameters] by value."  Even though the comment
   was made about __m128 parameters, it appears to apply to __m256
   parameters as well. */
#define STN4(x, v0, v1, v2, v3, ovs)                                   \
{                                                                      \
     V xxx0, xxx1, xxx2, xxx3;                                         \
     xxx0 = _mm256_unpacklo_pd(v0, v1);                                \
     xxx1 = _mm256_unpackhi_pd(v0, v1);                                \
     xxx2 = _mm256_unpacklo_pd(v2, v3);                                \
     xxx3 = _mm256_unpackhi_pd(v2, v3);                                \
     STA(x, _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0);           \
     STA(x + ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0);     \
     STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
     STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
}
#endif

static inline V FLIP_RI(V x)
{
     return VSHUF(x, x,
                  DS(SHUFVALD(1, 0),
                     SHUFVALS(1, 0, 3, 2)));
}

static inline V VCONJ(V x)
{
     /* Produce a SIMD vector[VL] of (0 + -0i).

        We really want to write this:

           V pmpm = VLIT(-0.0, 0.0);

        but historically some compilers have ignored the distinction
        between +0 and -0.  It looks like 'gcc-8 -fast-math' treats -0
        as 0 too.
      */
     union uvec {
          unsigned u[8];
          V v;
     };
     static const union uvec pmpm = {
#ifdef FFTW_SINGLE
          { 0x00000000, 0x80000000, 0x00000000, 0x80000000,
            0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
          { 0x00000000, 0x00000000, 0x00000000, 0x80000000,
            0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
     };
     return VXOR(pmpm.v, x);
}

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}

/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))

static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)

static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
    {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
    {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
    {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
    {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
    {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
    {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)


/* Use VZEROUPPER to avoid the penalty of switching from AVX to SSE.
   See Intel Optimization Manual (April 2011, version 248966), Section
   11.3 */
#define VLEAVE _mm256_zeroupper

#include "simd-common.h"
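Note that simd-avx.h maps VLEAVE to _mm256_zeroupper, while the 128-bit headers define VLEAVE() as a no-op: 256-bit kernels must clear the upper YMM halves before control returns to SSE code, per the Intel manual reference in the header itself. A hedged sketch of the calling pattern follows; the kernel scale4 is invented for illustration and is not an FFTW routine.

#include <immintrin.h>

/* Hypothetical AVX kernel: after the 256-bit work is done, issue
   vzeroupper (what VLEAVE expands to in simd-avx.h) so that later
   SSE code does not pay the AVX<->SSE transition penalty. */
void scale4(double *x, double k)
{
    __m256d v = _mm256_loadu_pd(x);           /* load 4 doubles */
    v = _mm256_mul_pd(v, _mm256_set1_pd(k));  /* scale them */
    _mm256_storeu_pd(x, v);
    _mm256_zeroupper();                       /* VLEAVE() */
}

Compile with -mavx; the same discipline is applied automatically by the codelet framework via the VLEAVE() hook in simd-common.h.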
342
fftw-3.3.10/simd-support/simd-avx2-128.h
Normal file
@@ -0,0 +1,342 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * 128-bit AVX2 support by Erik Lindahl, 2015.
 * Erik Lindahl hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX2 only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif

#define SIMD_SUFFIX _avx2_128 /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(__AVX2__) /* sanity check */
#error "compiling simd-avx2-128.h without avx2 support"
#endif

#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif

#include <immintrin.h>

typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define VPERM1 SUFF(_mm_permute_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)

#define SHUFVALS(fp0,fp1,fp2,fp3) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define VDUPL(x) DS(_mm_permute_pd(x,0), _mm_moveldup_ps(x))
#define VDUPH(x) DS(_mm_permute_pd(x,3), _mm_movehdup_ps(x))
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))

#define VLIT(x0, x1) DS(_mm_set_pd(x0, x1), _mm_set_ps(x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return *(const V *)x;
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     *(V *)x = v;
}

#ifdef FFTW_SINGLE

#  ifdef _MSC_VER
     /* Temporarily disable the warning "uninitialized local variable
        'name' used" and runtime checks for using a variable before it is
        defined which is erroneously triggered by the LOADL0 / LOADH macros
        as they only modify VAL partly each. */
#    ifndef __INTEL_COMPILER
#      pragma warning(disable : 4700)
#      pragma runtime_checks("u", off)
#    endif
#  endif
#  ifdef __INTEL_COMPILER
#    pragma warning(disable : 592)
#  endif

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     __m128 l0, l1;
     (void)aligned_like; /* UNUSED */
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
     l0 = LOADL(x, SUFF(_mm_undefined_p)());
     l1 = LOADL(x + ivs, SUFF(_mm_undefined_p)());
#else
     l0 = LOADL(x, l0);
     l1 = LOADL(x + ivs, l1);
#endif
     return SUFF(_mm_movelh_p)(l0,l1);
}

#  ifdef _MSC_VER
#    ifndef __INTEL_COMPILER
#      pragma warning(default : 4700)
#      pragma runtime_checks("u", restore)
#    endif
#  endif
#  ifdef __INTEL_COMPILER
#    pragma warning(default : 592)
#  endif

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon STOREL occurring
        after STOREH */
     STOREH(x + ovs, v);
     STOREL(x, v);
}

#else /* ! FFTW_SINGLE */
#  define LD LDA
#  define ST STA
#endif

#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */

#ifdef FFTW_SINGLE
#  define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
   deciding "it would be infrequent that people would want to pass more
   than 3 [__m128 parameters] by value."  3 parameters ought to be enough
   for anybody. */
#  define STN4(x, v0, v1, v2, v3, ovs)           \
{                                                \
     V xxx0, xxx1, xxx2, xxx3;                   \
     xxx0 = UNPCKL(v0, v2);                      \
     xxx1 = UNPCKH(v0, v2);                      \
     xxx2 = UNPCKL(v1, v3);                      \
     xxx3 = UNPCKH(v1, v3);                      \
     STA(x, UNPCKL(xxx0, xxx2), 0, 0);           \
     STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0);     \
     STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \
     STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     STOREL(x, v);
     STOREH(x + ovs, v);
}
#  define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif

static inline V FLIP_RI(V x)
{
     return VPERM1(x, DS(1, SHUFVALS(1, 0, 3, 2)));
}

static inline V VCONJ(V x)
{
     /* Produce a SIMD vector[VL] of (0 + -0i).

        We really want to write this:

           V pmpm = VLIT(-0.0, 0.0);

        but historically some compilers have ignored the distinction
        between +0 and -0.  It looks like 'gcc-8 -fast-math' treats -0
        as 0 too.
      */
     union uvec {
          unsigned u[4];
          V v;
     };
     static const union uvec pmpm = {
#ifdef FFTW_SINGLE
          { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
          { 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
     };
     return VXOR(pmpm.v, x);
}

static inline V VBYI(V x)
{
     x = VCONJ(x);
     x = FLIP_RI(x);
     return x;
}

/* FMA support */
#define VFMA(a, b, c) SUFF(_mm_fmadd_p)(a,b,c)
#define VFNMS(a, b, c) SUFF(_mm_fnmadd_p)(a,b,c)
#define VFMS(a, b, c) SUFF(_mm_fmsub_p)(a,b,c)
#define VFMAI(b, c) SUFF(_mm_addsub_p)(c,FLIP_RI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm_addsub_p)(c,b)


static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, FLIP_RI(sr));
     return SUFF(_mm_fmaddsub_p)(tr,sr,ti);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, FLIP_RI(sr));
     return SUFF(_mm_fmsubadd_p)(tr,sr,ti);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(tr, FLIP_RI(sr));
     return SUFF(_mm_fmaddsub_p)(ti,sr,tr);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V tx = twp[0];
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     ti = VMUL(ti, FLIP_RI(sr));
     return SUFF(_mm_fmaddsub_p)(tr,sr,ti);
}
static inline V BYTWJ1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V tx = twp[0];
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     ti = VMUL(ti, FLIP_RI(sr));
     return SUFF(_mm_fmsubadd_p)(tr,sr,ti);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
    {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
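The AVX2 VZMUL above leans on _mm_fmaddsub_pd: even lanes get a fused multiply-subtract and odd lanes a fused multiply-add, which is exactly a complex product once ti has been multiplied by the flipped input. A scalar check of that identity, written only to make the lane bookkeeping visible (the helper name zmul_fmaddsub is invented for this demo):

#include <stdio.h>

/* Scalar model of the _mm_fmaddsub_pd-based VZMUL in simd-avx2-128.h:
   lanes are {re, im}; t = tr + i*ti, s = sr + i*si. */
static void zmul_fmaddsub(double tr, double ti, double sr, double si,
                          double out[2])
{
    double dup_lo[2] = { tr, tr };   /* VDUPL(tx) */
    double dup_hi[2] = { ti, ti };   /* VDUPH(tx) */
    double flip[2]   = { si, sr };   /* FLIP_RI(sr) */
    double prod[2]   = { dup_hi[0] * flip[0], dup_hi[1] * flip[1] };
    /* fmaddsub: subtract in even lanes, add in odd lanes */
    out[0] = dup_lo[0] * sr - prod[0];   /* tr*sr - ti*si */
    out[1] = dup_lo[1] * si + prod[1];   /* tr*si + ti*sr */
}

int main(void)
{
    double out[2];
    zmul_fmaddsub(1.0, 2.0, 3.0, 4.0, out);
    printf("(1+2i)(3+4i) = %g %+gi\n", out[0], out[1]); /* -5 +10i */
    return 0;
}

VZMULJ only swaps fmaddsub for fmsubadd, which flips the per-lane signs and yields conj(t)*s by the same bookkeeping.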
414
fftw-3.3.10/simd-support/simd-avx2.h
Normal file
@@ -0,0 +1,414 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * Modifications by Romain Dolbeau & Erik Lindahl, derived from simd-avx.h
 * Romain Dolbeau hereby places his modifications in the public domain.
 * Erik Lindahl hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX2 only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif

#define SIMD_SUFFIX _avx2 /* for renaming */
#define VL DS(2, 4) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(__AVX2__) /* sanity check */
#error "compiling simd-avx2.h without avx2 support"
#endif

#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif

#include <immintrin.h>

typedef DS(__m256d, __m256) V;
#define VADD SUFF(_mm256_add_p)
#define VSUB SUFF(_mm256_sub_p)
#define VMUL SUFF(_mm256_mul_p)
#define VXOR SUFF(_mm256_xor_p)
#define VSHUF SUFF(_mm256_shuffle_p)
#define VPERM1 SUFF(_mm256_permute_p)

#define SHUFVALD(fp0,fp1) \
    (((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
#define SHUFVALS(fp0,fp1,fp2,fp3) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define VDUPL(x) DS(_mm256_movedup_pd(x), _mm256_moveldup_ps(x))
#define VDUPH(x) DS(_mm256_permute_pd(x,SHUFVALD(1,1)), _mm256_movehdup_ps(x))

#define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return SUFF(_mm256_loadu_p)(x);
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     SUFF(_mm256_storeu_p)(x, v);
}

#if FFTW_SINGLE

#  ifdef _MSC_VER
     /* Temporarily disable the warning "uninitialized local variable
        'name' used" and runtime checks for using a variable before it is
        defined which is erroneously triggered by the LOADL0 / LOADH macros
        as they only modify VAL partly each. */
#    ifndef __INTEL_COMPILER
#      pragma warning(disable : 4700)
#      pragma runtime_checks("u", off)
#    endif
#  endif
#  ifdef __INTEL_COMPILER
#    pragma warning(disable : 592)
#  endif

#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     __m128 l0, l1, h0, h1;
     (void)aligned_like; /* UNUSED */
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
     l0 = LOADL(x, SUFF(_mm_undefined_p)());
     l1 = LOADL(x + ivs, SUFF(_mm_undefined_p)());
     h0 = LOADL(x + 2*ivs, SUFF(_mm_undefined_p)());
     h1 = LOADL(x + 3*ivs, SUFF(_mm_undefined_p)());
#else
     l0 = LOADL(x, l0);
     l1 = LOADL(x + ivs, l1);
     h0 = LOADL(x + 2*ivs, h0);
     h1 = LOADL(x + 3*ivs, h1);
#endif
     l0 = SUFF(_mm_movelh_p)(l0,l1);
     h0 = SUFF(_mm_movelh_p)(h0,h1);
     return _mm256_insertf128_ps(_mm256_castps128_ps256(l0), h0, 1);
}

#  ifdef _MSC_VER
#    ifndef __INTEL_COMPILER
#      pragma warning(default : 4700)
#      pragma runtime_checks("u", restore)
#    endif
#  endif
#  ifdef __INTEL_COMPILER
#    pragma warning(default : 592)
#  endif

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     __m128 h = _mm256_extractf128_ps(v, 1);
     __m128 l = _mm256_castps256_ps128(v);
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon STOREL occurring
        after STOREH */
     STOREH(x + 3*ovs, h);
     STOREL(x + 2*ovs, h);
     STOREH(x + ovs, l);
     STOREL(x, l);
}

#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
     V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
     V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
     __m128 h0 = _mm256_extractf128_ps(x0, 1);
     __m128 l0 = _mm256_castps256_ps128(x0);
     __m128 h1 = _mm256_extractf128_ps(x1, 1);
     __m128 l1 = _mm256_castps256_ps128(x1);
     *(__m128 *)(x + 3*ovs) = h1;
     *(__m128 *)(x + 2*ovs) = h0;
     *(__m128 *)(x + 1*ovs) = l1;
     *(__m128 *)(x + 0*ovs) = l0;
}

#define STM4(x, v, ovs, aligned_like) /* no-op */
#define STN4(x, v0, v1, v2, v3, ovs)                            \
{                                                               \
     V xxx0, xxx1, xxx2, xxx3;                                  \
     V yyy0, yyy1, yyy2, yyy3;                                  \
     xxx0 = _mm256_unpacklo_ps(v0, v2);                         \
     xxx1 = _mm256_unpackhi_ps(v0, v2);                         \
     xxx2 = _mm256_unpacklo_ps(v1, v3);                         \
     xxx3 = _mm256_unpackhi_ps(v1, v3);                         \
     yyy0 = _mm256_unpacklo_ps(xxx0, xxx2);                     \
     yyy1 = _mm256_unpackhi_ps(xxx0, xxx2);                     \
     yyy2 = _mm256_unpacklo_ps(xxx1, xxx3);                     \
     yyy3 = _mm256_unpackhi_ps(xxx1, xxx3);                     \
     *(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0);   \
     *(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1); \
     *(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1);   \
     *(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1); \
     *(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2);   \
     *(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1); \
     *(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3);   \
     *(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1); \
}

#else
static inline __m128d VMOVAPD_LD(const R *x)
{
     /* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
        into a 256-bit vmovapd, which requires 32-byte alignment instead of
        16-byte alignment.

        Force the use of vmovapd via asm until compilers stabilize.
     */
#if defined(__GNUC__)
     __m128d var;
     __asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
     return var;
#else
     return *(const __m128d *)x;
#endif
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     V var;
     (void)aligned_like; /* UNUSED */
     var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
     var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
     return var;
}

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon the store of the low
        part occurring after the store of the high part */
     *(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
     *(__m128d *)x = _mm256_castpd256_pd128(v);
}


#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */

/* STN4 is a macro, not a function, thanks to Visual C++ developers
   deciding "it would be infrequent that people would want to pass more
   than 3 [__m128 parameters] by value."  Even though the comment
   was made about __m128 parameters, it appears to apply to __m256
   parameters as well. */
#define STN4(x, v0, v1, v2, v3, ovs)                                   \
{                                                                      \
     V xxx0, xxx1, xxx2, xxx3;                                         \
     xxx0 = _mm256_unpacklo_pd(v0, v1);                                \
     xxx1 = _mm256_unpackhi_pd(v0, v1);                                \
     xxx2 = _mm256_unpacklo_pd(v2, v3);                                \
     xxx3 = _mm256_unpackhi_pd(v2, v3);                                \
     STA(x, _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0);           \
     STA(x + ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0);     \
     STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
     STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
}
#endif

static inline V FLIP_RI(V x)
{
     return VPERM1(x, DS(SHUFVALD(1, 0), SHUFVALS(1, 0, 3, 2)));
}

static inline V VCONJ(V x)
{
     /* Produce a SIMD vector[VL] of (0 + -0i).

        We really want to write this:

           V pmpm = VLIT(-0.0, 0.0);

        but historically some compilers have ignored the distinction
        between +0 and -0.  It looks like 'gcc-8 -fast-math' treats -0
        as 0 too.
      */
     union uvec {
          unsigned u[8];
          V v;
     };
     static const union uvec pmpm = {
#ifdef FFTW_SINGLE
          { 0x00000000, 0x80000000, 0x00000000, 0x80000000,
            0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
          { 0x00000000, 0x00000000, 0x00000000, 0x80000000,
            0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
     };
     return VXOR(pmpm.v, x);
}

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}

/* FMA support */
#define VFMA SUFF(_mm256_fmadd_p)
#define VFNMS SUFF(_mm256_fnmadd_p)
#define VFMS SUFF(_mm256_fmsub_p)
#define VFMAI(b, c) SUFF(_mm256_addsub_p)(c, FLIP_RI(b)) /* VADD(c, VBYI(b)) */
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm256_addsub_p)(c, b) /* VSUB(c, VCONJ(b)) */

static inline V VZMUL(V tx, V sr)
{
     /* V tr = VDUPL(tx); */
     /* V ti = VDUPH(tx); */
     /* tr = VMUL(sr, tr); */
     /* sr = VBYI(sr); */
     /* return VFMA(ti, sr, tr); */
     return SUFF(_mm256_fmaddsub_p)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}

static inline V VZMULJ(V tx, V sr)
{
     /* V tr = VDUPL(tx); */
     /* V ti = VDUPH(tx); */
     /* tr = VMUL(sr, tr); */
     /* sr = VBYI(sr); */
     /* return VFNMS(ti, sr, tr); */
     return SUFF(_mm256_fmsubadd_p)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
     /*
      * Keep the old version
      * (2 permute, 1 shuffle, 1 constant load (L1), 1 xor, 2 fp), since the below FMA one
      * would be 2 permute, 1 shuffle, 1 xor (setzero), 3 fp), but with a longer pipeline.
      *
      * Alternative new fma version:
      * return SUFF(_mm256_addsub_p)(SUFF(_mm256_fnmadd_p)(sr, VDUPH(tx), SUFF(_mm256_setzero_p)()),
      *                              VMUL(FLIP_RI(sr), VDUPL(tx)));
      */
}

static inline V VZMULIJ(V tx, V sr)
{
     /* V tr = VDUPL(tx); */
     /* V ti = VDUPH(tx); */
     /* ti = VMUL(ti, sr); */
     /* sr = VBYI(sr); */
     /* return VFMA(tr, sr, ti); */
     return SUFF(_mm256_fmaddsub_p)(sr, VDUPH(tx), VMUL(FLIP_RI(sr), VDUPL(tx)));
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)

static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
    {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
    {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
    {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
    {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
    {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
    {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE _mm256_zeroupper

#include "simd-common.h"
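The VDUPL/VDUPH pair that the AVX headers build their complex arithmetic on simply broadcasts the real (even) or imaginary (odd) lane of each complex pair. A small single-precision demonstration of the lane pattern, using the same SSE3 intrinsics the 128-bit headers select; this is illustrative test code, not part of FFTW:

#include <pmmintrin.h> /* SSE3: _mm_moveldup_ps / _mm_movehdup_ps */
#include <stdio.h>

int main(void)
{
    /* Two complex floats packed as {re0, im0, re1, im1}. */
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    float lo[4], hi[4];
    _mm_storeu_ps(lo, _mm_moveldup_ps(v)); /* VDUPL: {1,1,3,3} */
    _mm_storeu_ps(hi, _mm_movehdup_ps(v)); /* VDUPH: {2,2,4,4} */
    printf("VDUPL: %g %g %g %g\n", lo[0], lo[1], lo[2], lo[3]);
    printf("VDUPH: %g %g %g %g\n", hi[0], hi[1], hi[2], hi[3]);
    return 0;
}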
316
fftw-3.3.10/simd-support/simd-avx512.h
Normal file
@@ -0,0 +1,316 @@
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * AVX-512 support implemented by Romain Dolbeau.
 * Romain Dolbeau hereby places his modifications in the public domain.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX-512 vector instructions only works in single or double precision"
|
||||
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _ps
# define SCAL(x) x ## f
#else /* !FFTW_SINGLE */
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _pd
# define SCAL(x) x
#endif /* FFTW_SINGLE */

#define SIMD_SUFFIX _avx512 /* for renaming */
#define VL DS(4, 8) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(__AVX512F__) /* sanity check */
#error "compiling simd-avx512.h without avx-512f support"
#endif

#if !defined(HAVE_AVX2)
#warning "You should probably enable AVX2 with --enable-avx2 for AVX-512"
#endif

#include <immintrin.h>

typedef DS(__m512d, __m512) V;

#define VLIT(re, im) DS(SUFF(_mm512_setr)(im, re, im, re, im, re, im, re),SUFF(_mm512_setr)(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re))
#define VLIT1(val) SUFF(_mm512_set1)(val)
#define LDK(x) x
#define DVK(var, val) V var = VLIT1(val)
#define VZERO SUFF(_mm512_setzero)()

#define VDUPL(x) DS(_mm512_movedup_pd(x),_mm512_moveldup_ps(x))
#define VDUPH(x) DS(_mm512_unpackhi_pd(x, x),_mm512_movehdup_ps(x))
#define FLIP_RI(x) SUFF(_mm512_shuffle)(x, x, DS(0x55,0xB1))
#define VCONJ(x) SUFF(_mm512_fmsubadd)(VZERO, VZERO, x)
static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}

#define VADD(a,b) SUFF(_mm512_add)(a,b)
#define VSUB(a,b) SUFF(_mm512_sub)(a,b)
#define VMUL(a,b) SUFF(_mm512_mul)(a,b)
#define VFMA(a, b, c) SUFF(_mm512_fmadd)(a, b, c)
#define VFMS(a, b, c) SUFF(_mm512_fmsub)(a, b, c)
#define VFNMS(a, b, c) SUFF(_mm512_fnmadd)(a, b, c)
#define VFMAI(b, c) SUFF(_mm512_fmaddsub)(VLIT1(1.), c, FLIP_RI(b))
#define VFNMSI(b, c) SUFF(_mm512_fmsubadd)(VLIT1(1.), c, FLIP_RI(b))
#define VFMACONJ(b,c) SUFF(_mm512_fmsubadd)(VLIT1(1.), c, b)
#define VFMSCONJ(b,c) SUFF(_mm512_fmsubadd)(VLIT1(-1.), c, b)
#define VFNMSCONJ(b,c) SUFF(_mm512_fmaddsub)(VLIT1(1.), c, b)

static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return SUFF(_mm512_loadu)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     SUFF(_mm512_storeu)(x, v);
}

#if FFTW_SINGLE

static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
                                      6 * ivs + 1, 6 * ivs,
                                      5 * ivs + 1, 5 * ivs,
                                      4 * ivs + 1, 4 * ivs,
                                      3 * ivs + 1, 3 * ivs,
                                      2 * ivs + 1, 2 * ivs,
                                      1 * ivs + 1, 1 * ivs,
                                      0 * ivs + 1, 0 * ivs);

     return _mm512_i32gather_ps(index, x, 4);
}

static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
                                      6 * ovs + 1, 6 * ovs,
                                      5 * ovs + 1, 5 * ovs,
                                      4 * ovs + 1, 4 * ovs,
                                      3 * ovs + 1, 3 * ovs,
                                      2 * ovs + 1, 2 * ovs,
                                      1 * ovs + 1, 1 * ovs,
                                      0 * ovs + 1, 0 * ovs);

     _mm512_i32scatter_ps(x, index, v, 4);
}

#else /* !FFTW_SINGLE */

static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __m256i index = _mm256_set_epi32(3 * ivs + 1, 3 * ivs,
                                      2 * ivs + 1, 2 * ivs,
                                      1 * ivs + 1, 1 * ivs,
                                      0 * ivs + 1, 0 * ivs);

     return _mm512_i32gather_pd(index, x, 8);
}

static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __m256i index = _mm256_set_epi32(3 * ovs + 1, 3 * ovs,
                                      2 * ovs + 1, 2 * ovs,
                                      1 * ovs + 1, 1 * ovs,
                                      0 * ovs + 1, 0 * ovs);

     _mm512_i32scatter_pd(x, index, v, 8);
}

#endif /* FFTW_SINGLE */

#define LD LDu
#define ST STu
|
||||
#ifdef FFTW_SINGLE
|
||||
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
|
||||
#define STN2(x, v0, v1, ovs) /* nop */
|
||||
|
||||
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
|
||||
{
|
||||
(void)aligned_like; /* UNUSED */
|
||||
__m512i index = _mm512_set_epi32(15 * ovs, 14 * ovs,
|
||||
13 * ovs, 12 * ovs,
|
||||
11 * ovs, 10 * ovs,
|
||||
9 * ovs, 8 * ovs,
|
||||
7 * ovs, 6 * ovs,
|
||||
5 * ovs, 4 * ovs,
|
||||
3 * ovs, 2 * ovs,
|
||||
1 * ovs, 0 * ovs);
|
||||
|
||||
_mm512_i32scatter_ps(x, index, v, 4);
|
||||
}
|
||||
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
|
||||
#else /* !FFTW_SINGLE */
|
||||
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
|
||||
#define STN2(x, v0, v1, ovs) /* nop */
|
||||
|
||||
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
|
||||
{
|
||||
(void)aligned_like; /* UNUSED */
|
||||
__m256i index = _mm256_set_epi32(7 * ovs, 6 * ovs,
|
||||
5 * ovs, 4 * ovs,
|
||||
3 * ovs, 2 * ovs,
|
||||
1 * ovs, 0 * ovs);
|
||||
|
||||
_mm512_i32scatter_pd(x, index, v, 8);
|
||||
}
|
||||
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
|
||||
#endif /* FFTW_SINGLE */
|
||||
|
||||
static inline V VZMUL(V tx, V sr)
|
||||
{
|
||||
/* V tr = VDUPL(tx); */
|
||||
/* V ti = VDUPH(tx); */
|
||||
/* tr = VMUL(sr, tr); */
|
||||
/* sr = VBYI(sr); */
|
||||
/* return VFMA(ti, sr, tr); */
|
||||
return SUFF(_mm512_fmaddsub)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
|
||||
}
|
||||
|
||||
static inline V VZMULJ(V tx, V sr)
|
||||
{
|
||||
/* V tr = VDUPL(tx); */
|
||||
/* V ti = VDUPH(tx); */
|
||||
/* tr = VMUL(sr, tr); */
|
||||
/* sr = VBYI(sr); */
|
||||
/* return VFNMS(ti, sr, tr); */
|
||||
return SUFF(_mm512_fmsubadd)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
|
||||
}
|
||||
|
||||
static inline V VZMULI(V tx, V sr)
|
||||
{
|
||||
V tr = VDUPL(tx);
|
||||
V ti = VDUPH(tx);
|
||||
ti = VMUL(ti, sr);
|
||||
sr = VBYI(sr);
|
||||
return VFMS(tr, sr, ti);
|
||||
/* return SUFF(_mm512_addsub)(SUFF(_mm512_fnmadd)(sr, VDUPH(tx), VZERO), VMUL(FLIP_RI(sr), VDUPL(tx))); */
|
||||
}
|
||||
|
||||
static inline V VZMULIJ(V tx, V sr)
|
||||
{
|
||||
/* V tr = VDUPL(tx); */
|
||||
/* V ti = VDUPH(tx); */
|
||||
/* ti = VMUL(ti, sr); */
|
||||
/* sr = VBYI(sr); */
|
||||
/* return VFMA(tr, sr, ti); */
|
||||
return SUFF(_mm512_fmaddsub)(sr, VDUPH(tx), VMUL(FLIP_RI(sr), VDUPL(tx)));
|
||||
}
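
The fmaddsub formulations above fuse the complex multiply into one multiply plus one fused multiply-add/subtract. As a hedged scalar model (plain C, not the FFTW API): with twiddle t = a+bi in one complex lane and source s = c+di, VZMUL computes fmaddsub(s, dup_lo(t), flip(s)*dup_hi(t)), which subtracts in the even (real) lane and adds in the odd (imaginary) lane:

#include <stdio.h>

/* Scalar model of the AVX-512 VZMUL: yields (a*c - b*d) + (a*d + b*c)i,
   the complex product t*s, matching the commented-out VDUPL/VDUPH
   formulation it replaces. */
static void vzmul_model(const double t[2], const double s[2], double out[2])
{
     out[0] = s[0] * t[0] - s[1] * t[1]; /* even lane: subtract part */
     out[1] = s[1] * t[0] + s[0] * t[1]; /* odd lane: add part */
}

int main(void)
{
     double t[2] = {1.0, 2.0}, s[2] = {3.0, 4.0}, r[2];
     vzmul_model(t, s, r); /* (1+2i)*(3+4i) = -5+10i */
     printf("%g%+gi\n", r[0], r[1]);
     return 0;
}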

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL1 (VL)

static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
   {TW_COS, v  , x}, {TW_COS, v  , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
   {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
   {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
   {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
   {TW_SIN, v  , -x}, {TW_SIN, v  , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
   {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
   {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
   {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
   {TW_COS, v  , x}, {TW_COS, v  , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
   {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
   {TW_SIN, v  , -x}, {TW_SIN, v  , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
   {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     /* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
     return VFMA(tr, sr, VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     /* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#define VTW3(v,x) VTW1(v,x)
#define TWVL3 TWVL1

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
   {TW_COS, v  , x}, {TW_COS, v+1 , x}, {TW_COS, v+2 , x}, {TW_COS, v+3 , x}, \
   {TW_COS, v+4 , x}, {TW_COS, v+5 , x}, {TW_COS, v+6 , x}, {TW_COS, v+7 , x}, \
   {TW_COS, v+8 , x}, {TW_COS, v+9 , x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
   {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
   {TW_SIN, v  , x}, {TW_SIN, v+1 , x}, {TW_SIN, v+2 , x}, {TW_SIN, v+3 , x}, \
   {TW_SIN, v+4 , x}, {TW_SIN, v+5 , x}, {TW_SIN, v+6 , x}, {TW_SIN, v+7 , x}, \
   {TW_SIN, v+8 , x}, {TW_SIN, v+9 , x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
   {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}
#else /* !FFTW_SINGLE */
# define VTWS(v,x) \
   {TW_COS, v  , x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
   {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
   {TW_SIN, v  , x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
   {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#endif /* FFTW_SINGLE */
#define TWVLS (2 * VL)

#define VLEAVE _mm256_zeroupper

#include "simd-common.h"
98
fftw-3.3.10/simd-support/simd-common.h
Normal file
@@ -0,0 +1,98 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* detection of alignment. This is complicated because a machine may
   support multiple SIMD extensions (e.g. SSE2 and AVX) but only one
   set of alignment constraints. So this alignment stuff cannot be
   defined in the SIMD header files. Rather than defining a separate
   set of "machine" header files, we just do this ugly ifdef here. */
#if defined(HAVE_SSE2) || defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX_128_FMA) || defined(HAVE_AVX512)
# if defined(FFTW_SINGLE)
#  define ALIGNMENT 8   /* Alignment for the LD/ST macros */
#  define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */
# else
#  define ALIGNMENT 16  /* Alignment for the LD/ST macros */
#  define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */
# endif
#elif defined(HAVE_ALTIVEC)
# define ALIGNMENT 8    /* Alignment for the LD/ST macros */
# define ALIGNMENTA 16  /* Alignment for the LDA/STA macros */
#elif defined(HAVE_NEON) || defined(HAVE_VSX)
# define ALIGNMENT 8    /* Alignment for the LD/ST macros */
# define ALIGNMENTA 8   /* Alignment for the LDA/STA macros */
#elif defined(HAVE_KCVI)
# if defined(FFTW_SINGLE)
#  define ALIGNMENT 8   /* Alignment for the LD/ST macros */
# else
#  define ALIGNMENT 16  /* Alignment for the LD/ST macros */
# endif
# define ALIGNMENTA 64  /* Alignment for the LDA/STA macros */
#elif defined(HAVE_GENERIC_SIMD256)
# if defined(FFTW_SINGLE)
#  define ALIGNMENT 8
#  define ALIGNMENTA 32
# else
#  define ALIGNMENT 16
#  define ALIGNMENTA 32
# endif
#elif defined(HAVE_GENERIC_SIMD128)
# if defined(FFTW_SINGLE)
#  define ALIGNMENT 8
#  define ALIGNMENTA 16
# else
#  define ALIGNMENT 16
#  define ALIGNMENTA 16
# endif
#endif

#if HAVE_SIMD
# ifndef ALIGNMENT
#  error "ALIGNMENT not defined"
# endif
# ifndef ALIGNMENTA
#  error "ALIGNMENTA not defined"
# endif
#endif

/* rename for precision and for SIMD extensions */
#define XSIMD0(name, suffix) CONCAT(name, suffix)
#define XSIMD(name) XSIMD0(X(name), SIMD_SUFFIX)
#define XSIMD_STRING(x) x STRINGIZE(SIMD_SUFFIX)
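
A quick illustration of how this renaming works. The CONCAT/STRINGIZE helpers are defined elsewhere in FFTW; the DEMO_* macros below are hypothetical stand-ins, and only the two-level token-pasting pattern (so that macro arguments like SIMD_SUFFIX get expanded before pasting) is the point:

#include <stdio.h>

#define DEMO_CONCAT0(a, b) a ## b
#define DEMO_CONCAT(a, b) DEMO_CONCAT0(a, b) /* extra level expands the args */
#define DEMO_STR0(x) #x
#define DEMO_STR(x) DEMO_STR0(x)
#define SIMD_SUFFIX _avx512

int main(void)
{
     int DEMO_CONCAT(n_, SIMD_SUFFIX) = 42; /* declares n__avx512 */
     printf("%s = %d\n", "n" DEMO_STR(SIMD_SUFFIX), n__avx512);
     return 0;
}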

/* TAINT_BIT is set if pointers are not guaranteed to be multiples of
   ALIGNMENT */
#define TAINT_BIT 1

/* TAINT_BITA is set if pointers are not guaranteed to be multiples of
   ALIGNMENTA */
#define TAINT_BITA 2

#define PTRINT(p) ((uintptr_t)(p))

#define ALIGNED(p) \
  (((PTRINT(UNTAINT(p)) % ALIGNMENT) == 0) && !(PTRINT(p) & TAINT_BIT))

#define ALIGNEDA(p) \
  (((PTRINT(UNTAINT(p)) % ALIGNMENTA) == 0) && !(PTRINT(p) & TAINT_BITA))

#define SIMD_STRIDE_OK(x) (!(((x) * sizeof(R)) % ALIGNMENT))
#define SIMD_STRIDE_OKA(x) (!(((x) * sizeof(R)) % ALIGNMENTA))
#define SIMD_VSTRIDE_OK SIMD_STRIDE_OK
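
The taint bits ride in the low bits of pointers, which are always zero for data aligned to 4 bytes or more. A minimal sketch of the idea, assuming TAINT/UNTAINT helpers of the shape FFTW defines elsewhere (taint.c and the core headers); the DEMO_* names are illustrative, not FFTW's:

#include <stdint.h>
#include <stdio.h>

#define DEMO_TAINT_BIT 1
#define DEMO_PTRINT(p) ((uintptr_t)(p))
#define DEMO_TAINT(p) ((double *)(DEMO_PTRINT(p) | DEMO_TAINT_BIT))
#define DEMO_UNTAINT(p) ((double *)(DEMO_PTRINT(p) & ~(uintptr_t)3))
#define DEMO_ALIGNED(p) \
  (((DEMO_PTRINT(DEMO_UNTAINT(p)) % 16) == 0) && !(DEMO_PTRINT(p) & DEMO_TAINT_BIT))

int main(void)
{
     static double buf[4] __attribute__ ((aligned(16)));
     double *clean = buf;             /* aligned and untainted */
     double *dirty = DEMO_TAINT(buf); /* same address, marked "maybe unaligned" */
     printf("%d %d\n", DEMO_ALIGNED(clean), DEMO_ALIGNED(dirty)); /* prints: 1 0 */
     return 0;
}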
288
fftw-3.3.10/simd-support/simd-generic128.h
Normal file
@@ -0,0 +1,288 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * Generic128d added by Romain Dolbeau, and turned into simd-generic128.h
 * with single & double precision by Erik Lindahl.
 * Romain Dolbeau hereby places his modifications in the public domain.
 * Erik Lindahl hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "Generic simd128 only works in single or double precision"
#endif

#define SIMD_SUFFIX _generic_simd128 /* for renaming */

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define VDUPL(x) (V){x[0],x[0],x[2],x[2]}
# define VDUPH(x) (V){x[1],x[1],x[3],x[3]}
# define DVK(var, val) V var = {val,val,val,val}
#else
# define DS(d,s) d /* double-precision option */
# define VDUPL(x) (V){x[0],x[0]}
# define VDUPH(x) (V){x[1],x[1]}
# define DVK(var, val) V var = {val, val}
#endif

#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

typedef DS(double,float) V __attribute__ ((vector_size(16)));

#define VADD(a,b) ((a)+(b))
#define VSUB(a,b) ((a)-(b))
#define VMUL(a,b) ((a)*(b))
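
These operators lean on GCC's vector_size extension: arithmetic on V is elementwise, so no intrinsics are needed. A standalone check of the mechanism (GCC/Clang, outside FFTW):

#include <stdio.h>

typedef double V2 __attribute__ ((vector_size(16))); /* two doubles, like V in double precision */

int main(void)
{
     V2 a = {1.0, 2.0}, b = {3.0, 4.0};
     V2 c = a * b + a;              /* elementwise: {1*3+1, 2*4+2} */
     printf("%g %g\n", c[0], c[1]); /* prints: 4 10 */
     return 0;
}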

#define LDK(x) x

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return *(const V *)x;
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     *(V *)x = v;
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     V res;
     res[0] = x[0];
     res[1] = x[1];
#ifdef FFTW_SINGLE
     res[2] = x[ivs];
     res[3] = x[ivs+1];
#endif
     return res;
}

#ifdef FFTW_SINGLE
/* ST has to be separate due to the storage hack requiring reverse order */
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     *(x + ovs    ) = v[2];
     *(x + ovs + 1) = v[3];
     *(x          ) = v[0];
     *(x       + 1) = v[1];
}
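
Why the high complex before the low one? The WARNING in simd-neon.h further down makes the dependency explicit: FFTW's extra_iter hack lets the last vector iteration be backed up so that it partially overlaps the previous one, and store-low must occur after store-high. A toy model of the memory semantics (hypothetical indices, not FFTW code):

#include <stdio.h>

/* Each st_model call writes the high complex first and the low complex
   last, so when a backed-up call aliases an earlier one, the write that
   lands last is the backed-up low complex. */
static void st_model(float *x, const float v[4], int ovs)
{
     x[ovs]     = v[2]; /* high complex, first */
     x[ovs + 1] = v[3];
     x[0]       = v[0]; /* low complex, last */
     x[1]       = v[1];
}

int main(void)
{
     float out[6] = {0};
     float a[4] = {1, 2, 3, 4}, b[4] = {30, 40, 5, 6};
     st_model(out, a, 2);     /* writes complexes 0 and 1 */
     st_model(out + 2, b, 2); /* backed up: its low rewrites complex 1 */
     printf("%g %g | %g %g | %g %g\n",
            out[0], out[1], out[2], out[3], out[4], out[5]);
     return 0;
}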
#else
/* FFTW_DOUBLE */
# define ST STA
#endif

#ifdef FFTW_SINGLE
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */

static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
     *(x            ) = v0[0];
     *(x         + 1) = v1[0];
     *(x         + 2) = v2[0];
     *(x         + 3) = v3[0];
     *(x + ovs      ) = v0[1];
     *(x + ovs   + 1) = v1[1];
     *(x + ovs   + 2) = v2[1];
     *(x + ovs   + 3) = v3[1];
     *(x + 2 * ovs    ) = v0[2];
     *(x + 2 * ovs + 1) = v1[2];
     *(x + 2 * ovs + 2) = v2[2];
     *(x + 2 * ovs + 3) = v3[2];
     *(x + 3 * ovs    ) = v0[3];
     *(x + 3 * ovs + 1) = v1[3];
     *(x + 3 * ovs + 2) = v2[3];
     *(x + 3 * ovs + 3) = v3[3];
}
#define STM4(x, v, ovs, aligned_like) /* no-op */

#else
/* FFTW_DOUBLE */

#define STM2 STA
#define STN2(x, v0, v1, ovs) /* nop */

static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     *(x)     = v[0];
     *(x+ovs) = v[1];
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif

static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
     return (V){x[1],x[0],x[3],x[2]};
#else
     return (V){x[1],x[0]};
#endif
}

static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
     return (V){x[0],-x[1],x[2],-x[3]};
#else
     return (V){x[0],-x[1]};
#endif
}

static inline V VBYI(V x)
{
     x = VCONJ(x);
     x = FLIP_RI(x);
     return x;
}

/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))

static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
   {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
   {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
   {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
   {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
   {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
   {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
   {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
333
fftw-3.3.10/simd-support/simd-generic256.h
Normal file
@@ -0,0 +1,333 @@
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * Generic256d added by Romain Dolbeau, and turned into simd-generic256.h
 * with single & double precision by Erik Lindahl.
 * Romain Dolbeau hereby places his modifications in the public domain.
 * Erik Lindahl hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "Generic simd256 only works in single or double precision"
#endif

#define SIMD_SUFFIX _generic_simd256 /* for renaming */

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define VDUPL(x) {x[0],x[0],x[2],x[2],x[4],x[4],x[6],x[6]}
# define VDUPH(x) {x[1],x[1],x[3],x[3],x[5],x[5],x[7],x[7]}
# define DVK(var, val) V var = {val,val,val,val,val,val,val,val}
#else
# define DS(d,s) d /* double-precision option */
# define VDUPL(x) {x[0],x[0],x[2],x[2]}
# define VDUPH(x) {x[1],x[1],x[3],x[3]}
# define DVK(var, val) V var = {val, val, val, val}
#endif

#define VL DS(2,4) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

typedef DS(double,float) V __attribute__ ((vector_size(32)));

#define VADD(a,b) ((a)+(b))
#define VSUB(a,b) ((a)-(b))
#define VMUL(a,b) ((a)*(b))

#define LDK(x) x

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return *(const V *)x;
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     *(V *)x = v;
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     V var;
     (void)aligned_like; /* UNUSED */
     var[0] = x[0];
     var[1] = x[1];
     var[2] = x[ivs];
     var[3] = x[ivs+1];
#ifdef FFTW_SINGLE
     var[4] = x[2*ivs];
     var[5] = x[2*ivs+1];
     var[6] = x[3*ivs];
     var[7] = x[3*ivs+1];
#endif
     return var;
}

/* ST has to be separate due to the storage hack requiring reverse order */

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
#ifdef FFTW_SINGLE
     *(x + 3*ovs    ) = v[6];
     *(x + 3*ovs + 1) = v[7];
     *(x + 2*ovs    ) = v[4];
     *(x + 2*ovs + 1) = v[5];
     *(x + ovs      ) = v[2];
     *(x + ovs   + 1) = v[3];
     *(x            ) = v[0];
     *(x         + 1) = v[1];
#else
     *(x + ovs    ) = v[2];
     *(x + ovs + 1) = v[3];
     *(x          ) = v[0];
     *(x       + 1) = v[1];
#endif
}

#ifdef FFTW_SINGLE
#define STM2(x, v, ovs, a) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
     x[    0    ] = v0[0];
     x[    1    ] = v0[1];
     x[    2    ] = v1[0];
     x[    3    ] = v1[1];
     x[  ovs    ] = v0[2];
     x[  ovs + 1] = v0[3];
     x[  ovs + 2] = v1[2];
     x[  ovs + 3] = v1[3];
     x[2*ovs    ] = v0[4];
     x[2*ovs + 1] = v0[5];
     x[2*ovs + 2] = v1[4];
     x[2*ovs + 3] = v1[5];
     x[3*ovs    ] = v0[6];
     x[3*ovs + 1] = v0[7];
     x[3*ovs + 2] = v1[6];
     x[3*ovs + 3] = v1[7];
}

# define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
     *(x            ) = v0[0];
     *(x         + 1) = v1[0];
     *(x         + 2) = v2[0];
     *(x         + 3) = v3[0];
     *(x + ovs      ) = v0[1];
     *(x + ovs   + 1) = v1[1];
     *(x + ovs   + 2) = v2[1];
     *(x + ovs   + 3) = v3[1];
     *(x + 2 * ovs    ) = v0[2];
     *(x + 2 * ovs + 1) = v1[2];
     *(x + 2 * ovs + 2) = v2[2];
     *(x + 2 * ovs + 3) = v3[2];
     *(x + 3 * ovs    ) = v0[3];
     *(x + 3 * ovs + 1) = v1[3];
     *(x + 3 * ovs + 2) = v2[3];
     *(x + 3 * ovs + 3) = v3[3];
     *(x + 4 * ovs    ) = v0[4];
     *(x + 4 * ovs + 1) = v1[4];
     *(x + 4 * ovs + 2) = v2[4];
     *(x + 4 * ovs + 3) = v3[4];
     *(x + 5 * ovs    ) = v0[5];
     *(x + 5 * ovs + 1) = v1[5];
     *(x + 5 * ovs + 2) = v2[5];
     *(x + 5 * ovs + 3) = v3[5];
     *(x + 6 * ovs    ) = v0[6];
     *(x + 6 * ovs + 1) = v1[6];
     *(x + 6 * ovs + 2) = v2[6];
     *(x + 6 * ovs + 3) = v3[6];
     *(x + 7 * ovs    ) = v0[7];
     *(x + 7 * ovs + 1) = v1[7];
     *(x + 7 * ovs + 2) = v2[7];
     *(x + 7 * ovs + 3) = v3[7];
}

#else
/* FFTW_DOUBLE */

#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */

static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs) {
     *(x            ) = v0[0];
     *(x         + 1) = v1[0];
     *(x         + 2) = v2[0];
     *(x         + 3) = v3[0];
     *(x + ovs      ) = v0[1];
     *(x + ovs   + 1) = v1[1];
     *(x + ovs   + 2) = v2[1];
     *(x + ovs   + 3) = v3[1];
     *(x + 2 * ovs    ) = v0[2];
     *(x + 2 * ovs + 1) = v1[2];
     *(x + 2 * ovs + 2) = v2[2];
     *(x + 2 * ovs + 3) = v3[2];
     *(x + 3 * ovs    ) = v0[3];
     *(x + 3 * ovs + 1) = v1[3];
     *(x + 3 * ovs + 2) = v2[3];
     *(x + 3 * ovs + 3) = v3[3];
}
#endif

static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
     return (V){x[1],x[0],x[3],x[2],x[5],x[4],x[7],x[6]};
#else
     return (V){x[1],x[0],x[3],x[2]};
#endif
}

static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
     return (x * (V){1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0});
#else
     return (x * (V){1.0,-1.0,1.0,-1.0});
#endif
}

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}

/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))

static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)

static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
   {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
   {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
   {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
   {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
   {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
   {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
   {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
   {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
   {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
   {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
   {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
461
fftw-3.3.10/simd-support/simd-kcvi.h
Normal file
@@ -0,0 +1,461 @@
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * Knights Corner Vector Instruction support added by Romain Dolbeau.
 * Romain Dolbeau hereby places his modifications in the public domain.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "Knights Corner vector instructions only work in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _ps
# define SCAL(x) x ## f
#else /* !FFTW_SINGLE */
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _pd
# define SCAL(x) x
#endif /* FFTW_SINGLE */

#define SIMD_SUFFIX _kcvi /* for renaming */
#define VL DS(4, 8) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

/* configuration ; KNF 0 0 0 1 0 1 */
#define KCVI_VBYI_SINGLE_USE_MUL 0
#define KCVI_VBYI_DOUBLE_USE_MUL 0
#define KCVI_LD_DOUBLE_USE_UNPACK 1
#define KCVI_ST_DOUBLE_USE_PACK 1
#define KCVI_ST2_DOUBLE_USE_STN2 0
#define KCVI_MULZ_USE_SWIZZLE 1

#include <immintrin.h>

typedef DS(__m512d, __m512) V;

#define VADD(a,b) SUFF(_mm512_add)(a,b)
#define VSUB(a,b) SUFF(_mm512_sub)(a,b)
#define VMUL(a,b) SUFF(_mm512_mul)(a,b)

#define VFMA(a, b, c) SUFF(_mm512_fmadd)(a, b, c)   /* VADD(c, VMUL(a, b)) */
#define VFMS(a, b, c) SUFF(_mm512_fmsub)(a, b, c)   /* VSUB(VMUL(a, b), c) */
#define VFNMS(a, b, c) SUFF(_mm512_fnmadd)(a, b, c) /* VSUB(c, VMUL(a, b)) */

#define LDK(x) x
#define VLIT(re, im) SUFF(_mm512_setr4)(im, re, im, re)
#define DVK(var, val) V var = SUFF(_mm512_set1)(val)

static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
     return SUFF(_mm512_load)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
     SUFF(_mm512_store)(x, v);
}

#if FFTW_SINGLE
#define VXOR(a,b) _mm512_xor_epi32(a,b)

static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
                                      6 * ivs + 1, 6 * ivs,
                                      5 * ivs + 1, 5 * ivs,
                                      4 * ivs + 1, 4 * ivs,
                                      3 * ivs + 1, 3 * ivs,
                                      2 * ivs + 1, 2 * ivs,
                                      1 * ivs + 1, 1 * ivs,
                                      0 * ivs + 1, 0 * ivs);

     return _mm512_i32gather_ps(index, x, _MM_SCALE_4);
}

static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
                                      6 * ovs + 1, 6 * ovs,
                                      5 * ovs + 1, 5 * ovs,
                                      4 * ovs + 1, 4 * ovs,
                                      3 * ovs + 1, 3 * ovs,
                                      2 * ovs + 1, 2 * ovs,
                                      1 * ovs + 1, 1 * ovs,
                                      0 * ovs + 1, 0 * ovs);

     _mm512_i32scatter_ps(x, index, v, _MM_SCALE_4);
}

static inline V FLIP_RI(V x)
{
     return (V)_mm512_shuffle_epi32((__m512i)x, _MM_PERM_CDAB);
}

#define VDUPH(a) (V)_mm512_shuffle_epi32((__m512i)a, _MM_PERM_DDBB)
#define VDUPL(a) (V)_mm512_shuffle_epi32((__m512i)a, _MM_PERM_CCAA)

#else /* !FFTW_SINGLE */
#define VXOR(a,b) _mm512_xor_epi64(a,b)

#if defined (KCVI_LD_DOUBLE_USE_UNPACK) && KCVI_LD_DOUBLE_USE_UNPACK
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     V temp;
     /* no need for hq here */
     temp = _mm512_mask_loadunpacklo_pd(temp, 0x0003, x + (0 * ivs));
     temp = _mm512_mask_loadunpacklo_pd(temp, 0x000c, x + (1 * ivs));
     temp = _mm512_mask_loadunpacklo_pd(temp, 0x0030, x + (2 * ivs));
     temp = _mm512_mask_loadunpacklo_pd(temp, 0x00c0, x + (3 * ivs));
     return temp;
}
#else
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __declspec(align(64)) R temp[8];
     int i;
     for (i = 0 ; i < 4 ; i++) {
          temp[i*2]   = x[i * ivs];
          temp[i*2+1] = x[i * ivs + 1];
     }
     return _mm512_load_pd(temp);
}
#endif

#if defined(KCVI_ST_DOUBLE_USE_PACK) && KCVI_ST_DOUBLE_USE_PACK
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* no need for hq here */
     _mm512_mask_packstorelo_pd(x + (0 * ovs), 0x0003, v);
     _mm512_mask_packstorelo_pd(x + (1 * ovs), 0x000c, v);
     _mm512_mask_packstorelo_pd(x + (2 * ovs), 0x0030, v);
     _mm512_mask_packstorelo_pd(x + (3 * ovs), 0x00c0, v);
}
#else
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __declspec(align(64)) R temp[8];
     int i;
     _mm512_store_pd(temp, v);
     for (i = 0 ; i < 4 ; i++) {
          x[i * ovs]     = temp[i*2];
          x[i * ovs + 1] = temp[i*2+1];
     }
}
#endif

static inline V FLIP_RI(V x)
{
     return (V)_mm512_shuffle_epi32((__m512i)x, _MM_PERM_BADC);
}

#define VDUPH(a) (V)_mm512_shuffle_epi32((__m512i)a, _MM_PERM_DCDC)
#define VDUPL(a) (V)_mm512_shuffle_epi32((__m512i)a, _MM_PERM_BABA)

#endif /* FFTW_SINGLE */

#define LD LDu
#define ST STu

#ifdef FFTW_SINGLE
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */

static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __m512i index = _mm512_set_epi32(15 * ovs, 14 * ovs,
                                      13 * ovs, 12 * ovs,
                                      11 * ovs, 10 * ovs,
                                      9 * ovs, 8 * ovs,
                                      7 * ovs, 6 * ovs,
                                      5 * ovs, 4 * ovs,
                                      3 * ovs, 2 * ovs,
                                      1 * ovs, 0 * ovs);

     _mm512_i32scatter_ps(x, index, v, _MM_SCALE_4);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#else /* !FFTW_SINGLE */
#if defined(KCVI_ST2_DOUBLE_USE_STN2) && KCVI_ST2_DOUBLE_USE_STN2
#define STM2(x, v, ovs, a) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs) {
     /* we start
        AB CD EF GH -> *x     (2 DBL), ovs between complex
        IJ KL MN OP -> *(x+2) (2 DBL), ovs between complex
        and we want
        ABIJ EFMN -> *x       (4 DBL), 2 * ovs between complex pairs
        CDKL GHOP -> *(x+ovs) (4 DBL), 2 * ovs between complex pairs
     */
     V x00 = (V)_mm512_mask_permute4f128_epi32((__m512i)v0, 0xF0F0, (__m512i)v1, _MM_PERM_CDAB);
     V x01 = (V)_mm512_mask_permute4f128_epi32((__m512i)v1, 0x0F0F, (__m512i)v0, _MM_PERM_CDAB);
     _mm512_mask_packstorelo_pd(x + (0 * ovs) + 0, 0x000F, x00);
     /* _mm512_mask_packstorehi_pd(x + (0 * ovs) + 8, 0x000F, x00); */
     _mm512_mask_packstorelo_pd(x + (2 * ovs) + 0, 0x00F0, x00);
     /* _mm512_mask_packstorehi_pd(x + (2 * ovs) + 8, 0x00F0, x00); */
     _mm512_mask_packstorelo_pd(x + (1 * ovs) + 0, 0x000F, x01);
     /* _mm512_mask_packstorehi_pd(x + (1 * ovs) + 8, 0x000F, x01); */
     _mm512_mask_packstorelo_pd(x + (3 * ovs) + 0, 0x00F0, x01);
     /* _mm512_mask_packstorehi_pd(x + (3 * ovs) + 8, 0x00F0, x01); */
}
#else
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
#endif

static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     __m512i index = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0,
                                      7 * ovs, 6 * ovs,
                                      5 * ovs, 4 * ovs,
                                      3 * ovs, 2 * ovs,
                                      1 * ovs, 0 * ovs);

     _mm512_i32loscatter_pd(x, index, v, _MM_SCALE_8);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#endif /* FFTW_SINGLE */

static inline V VFMAI(V b, V c) {
     V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
     return SUFF(_mm512_fmadd)(mpmp, SUFF(_mm512_swizzle)(b, _MM_SWIZ_REG_CDAB), c);
}

static inline V VFNMSI(V b, V c) {
     V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
     return SUFF(_mm512_fnmadd)(mpmp, SUFF(_mm512_swizzle)(b, _MM_SWIZ_REG_CDAB), c);
}

static inline V VFMACONJ(V b, V c) {
     V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
     return SUFF(_mm512_fmadd)(pmpm, b, c);
}

static inline V VFMSCONJ(V b, V c) {
     V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
     return SUFF(_mm512_fmsub)(pmpm, b, c);
}

static inline V VFNMSCONJ(V b, V c) {
     V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
     return SUFF(_mm512_fnmadd)(pmpm, b, c);
}

static inline V VCONJ(V x)
{
     V pmpm = VLIT(SCAL(-0.0), SCAL(0.0));
     return (V)VXOR((__m512i)pmpm, (__m512i)x);
}

#ifdef FFTW_SINGLE
#if defined(KCVI_VBYI_SINGLE_USE_MUL) && KCVI_VBYI_SINGLE_USE_MUL
/* untested */
static inline V VBYI(V x)
{
     V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
     return _mm512_mul_ps(mpmp, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
}
#else
static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}
#endif
#else /* !FFTW_SINGLE */
#if defined(KCVI_VBYI_DOUBLE_USE_MUL) && KCVI_VBYI_DOUBLE_USE_MUL
/* on KNF, using mul_pd is slower than shuf128x32 + xor */
static inline V VBYI(V x)
{
     V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
     return _mm512_mul_pd(mpmp, _mm512_swizzle_pd(x, _MM_SWIZ_REG_CDAB));
}
#else
static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}
#endif
#endif /* FFTW_SINGLE */

#if defined(KCVI_MULZ_USE_SWIZZLE) && KCVI_MULZ_USE_SWIZZLE
static inline V VZMUL(V tx, V sr) /* (a,b) (c,d) */
{
     V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
     V ad = SUFF(_mm512_mul)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB)); /* (a*d,b*c) */
     V acmbd = SUFF(_mm512_sub)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (a*c-b*d, b*d-a*c) */
     V res = SUFF(_mm512_mask_add)(acmbd, DS(0x00aa,0xaaaa), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* ([a*c+b*c] a*c-b*d, b*c+a*d) */
     return res;
}
static inline V VZMULJ(V tx, V sr) /* (a,b) (c,d) */
{
     V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
     V ad = SUFF(_mm512_mul)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB)); /* (a*d,b*c) */
     V acmbd = SUFF(_mm512_add)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (a*c+b*d, b*d+a*c) */
     V res = SUFF(_mm512_mask_subr)(acmbd, DS(0x00aa,0xaaaa), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* ([a*c+b*c] a*c+b*d, a*d-b*c) */
     return res;
}
static inline V VZMULI(V tx, V sr) /* (a,b) (c,d) */
{
     DVK(zero, SCAL(0.0));
     V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
     V ad = SUFF(_mm512_fnmadd)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB), zero); /* (-a*d,-b*c) */
     V acmbd = SUFF(_mm512_subr)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (b*d-a*c, a*c-b*d) */
     V res = SUFF(_mm512_mask_add)(acmbd, DS(0x0055,0x5555), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* (-a*d-b*c, a*c-b*d) */
     return res;
}
static inline V VZMULIJ(V tx, V sr) /* (a,b) (c,d) */
{
     DVK(zero, SCAL(0.0));
     V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
     V ad = SUFF(_mm512_fnmadd)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB), zero); /* (-a*d,-b*c) */
     V acmbd = SUFF(_mm512_add)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (b*d+a*c, a*c+b*d) */
     V res = SUFF(_mm512_mask_sub)(acmbd, DS(0x0055,0x5555), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* (-a*d+b*c, a*c-b*d) */
     return res;
}
#else
static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}
#endif

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL1 (VL)

static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
   {TW_COS, v  , x}, {TW_COS, v  , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
   {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
   {TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
   {TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
   {TW_SIN, v  , -x}, {TW_SIN, v  , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
   {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
   {TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
   {TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
   {TW_COS, v  , x}, {TW_COS, v  , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
   {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
   {TW_SIN, v  , -x}, {TW_SIN, v  , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
   {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     /* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
     return VFMA(tr, sr, VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     /* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#define VTW3(v,x) VTW1(v,x)
#define TWVL3 TWVL1

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
   {TW_COS, v  , x}, {TW_COS, v+1 , x}, {TW_COS, v+2 , x}, {TW_COS, v+3 , x}, \
   {TW_COS, v+4 , x}, {TW_COS, v+5 , x}, {TW_COS, v+6 , x}, {TW_COS, v+7 , x}, \
   {TW_COS, v+8 , x}, {TW_COS, v+9 , x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
   {TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
   {TW_SIN, v  , x}, {TW_SIN, v+1 , x}, {TW_SIN, v+2 , x}, {TW_SIN, v+3 , x}, \
   {TW_SIN, v+4 , x}, {TW_SIN, v+5 , x}, {TW_SIN, v+6 , x}, {TW_SIN, v+7 , x}, \
   {TW_SIN, v+8 , x}, {TW_SIN, v+9 , x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
   {TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}
#else /* !FFTW_SINGLE */
# define VTWS(v,x) \
   {TW_COS, v  , x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
   {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
   {TW_SIN, v  , x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
   {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#endif /* FFTW_SINGLE */
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
335
fftw-3.3.10/simd-support/simd-neon.h
Normal file
@@ -0,0 +1,335 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * Double-precision support added by Romain Dolbeau.
 * Romain Dolbeau hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if !defined(FFTW_SINGLE) && !defined(__aarch64__)
#error "NEON only works in single precision on 32-bit ARM"
#endif
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "NEON only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _f32
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _f64
#endif

/* define these unconditionally, because they are used by
   taint.c which is compiled without neon */
#define SIMD_SUFFIX _neon /* for renaming */
#define VL DS(1,2) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(__ARM_NEON__) && !defined(__ARM_NEON)
#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
#endif

#include <arm_neon.h>

/* FIXME: I am not sure whether this code assumes little-endian
   ordering. VLIT may or may not be wrong for big-endian systems. */
typedef DS(float64x2_t, float32x4_t) V;

#ifdef FFTW_SINGLE
# define VLIT(x0, x1) {x0, x1, x0, x1}
#else
# define VLIT(x0, x1) {x0, x1}
#endif
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val)

/* NEON has FMA, but a three-operand FMA is not too useful
   for FFT purposes.  We normally compute

      t0 = a + b * c
      t1 = a - b * c

   In a three-operand instruction set this translates into

      t0 = a
      t0 += b * c
      t1 = a
      t1 -= b * c

   At least one move must be implemented, negating the advantage of
   the FMA in the first place.  At least some versions of gcc generate
   both moves.  So we are better off generating t = b * c; t0 = a + t;
   t1 = a - t. */
#if ARCH_PREFERS_FMA
#warning "--enable-fma on NEON is probably a bad idea (see source code)"
#endif
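
The pattern that comment prefers, as a scalar sketch: computing both a + b*c and a - b*c shares a single product, which is exactly what the plain VMUL/VADD/VSUB expansion produces:

/* One multiply feeding both butterfly outputs, instead of two
   three-operand FMAs plus the moves the comment above describes. */
static inline void fma_butterfly(float a, float b, float c,
                                 float *t0, float *t1)
{
     float t = b * c;
     *t0 = a + t;
     *t1 = a - t;
}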
|
||||
|
||||
#define VADD(a, b) SUFF(vaddq)(a, b)
|
||||
#define VSUB(a, b) SUFF(vsubq)(a, b)
|
||||
#define VMUL(a, b) SUFF(vmulq)(a, b)
|
||||
#define VFMA(a, b, c) SUFF(vmlaq)(c, a, b) /* a*b+c */
|
||||
#define VFNMS(a, b, c) SUFF(vmlsq)(c, a, b) /* FNMS=-(a*b-c) in powerpc terminology; MLS=c-a*b
|
||||
in ARM terminology */
|
||||
#define VFMS(a, b, c) VSUB(VMUL(a, b), c) /* FMS=a*b-c in powerpc terminology; no equivalent
|
||||
arm instruction (?) */
|
||||
|
||||
#define STOREH(a, v) SUFF(vst1)((a), SUFF(vget_high)(v))
|
||||
#define STOREL(a, v) SUFF(vst1)((a), SUFF(vget_low)(v))
|
||||
|
||||
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
|
||||
{
|
||||
(void) aligned_like; /* UNUSED */
|
||||
return SUFF(vld1q)(x);
|
||||
}
|
||||
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
|
||||
{
|
||||
(void) aligned_like; /* UNUSED */
|
||||
SUFF(vst1q)(x, v);
|
||||
}
|
||||
|
||||
|
||||
#ifdef FFTW_SINGLE
|
||||
static inline V LD(const R *x, INT ivs, const R *aligned_like)
|
||||
{
|
||||
(void) aligned_like; /* UNUSED */
|
||||
return SUFF(vcombine)(SUFF(vld1)(x), SUFF(vld1)((x + ivs)));
|
||||
}
|
||||
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
|
||||
{
|
||||
(void) aligned_like; /* UNUSED */
|
||||
/* WARNING: the extra_iter hack depends upon store-low occurring
|
||||
after store-high */
|
||||
STOREH(x + ovs, v);
|
||||
STOREL(x,v);
|
||||
}
|
||||
#else /* !FFTW_SINGLE */
|
||||
# define LD LDA
|
||||
# define ST STA
|
||||
#endif
|
||||
|
||||
/* 2x2 complex transpose and store */
|
||||
#define STM2 DS(STA,ST)
|
||||
#define STN2(x, v0, v1, ovs) /* nop */
|
||||
|
||||
#ifdef FFTW_SINGLE
|
||||
/* store and 4x4 real transpose */
|
||||
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
|
||||
{
|
||||
(void) aligned_like; /* UNUSED */
|
||||
SUFF(vst1_lane)((x) , SUFF(vget_low)(v), 0);
|
||||
SUFF(vst1_lane)((x + ovs), SUFF(vget_low)(v), 1);
|
||||
SUFF(vst1_lane)((x + 2 * ovs), SUFF(vget_high)(v), 0);
|
||||
SUFF(vst1_lane)((x + 3 * ovs), SUFF(vget_high)(v), 1);
|
||||
}
|
||||
#define STN4(x, v0, v1, v2, v3, ovs) /* use STM4 */
|
||||
#else /* !FFTW_SINGLE */
|
||||
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
|
||||
{
|
||||
(void)aligned_like; /* UNUSED */
|
||||
STOREL(x, v);
|
||||
STOREH(x + ovs, v);
|
||||
}
|
||||
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
|
||||
#endif
|
||||
|
||||
#ifdef FFTW_SINGLE
|
||||
#define FLIP_RI(x) SUFF(vrev64q)(x)
|
||||
#else
|
||||
/* FIXME */
|
||||
#define FLIP_RI(x) SUFF(vcombine)(SUFF(vget_high)(x), SUFF(vget_low)(x))
|
||||
#endif
|
||||
|
||||
static inline V VCONJ(V x)
|
||||
{
|
||||
#ifdef FFTW_SINGLE
|
||||
static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
|
||||
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
|
||||
#else
|
||||
static const uint64x2_t pm = {0, 0x8000000000000000ull};
|
||||
/* Gcc-4.9.2 still does not include vreinterpretq_f64_u64, but simple
|
||||
* casts generate the correct assembly.
|
||||
*/
|
||||
return (float64x2_t)(veorq_u64((uint64x2_t)(x), (uint64x2_t)(pm)));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline V VBYI(V x)
|
||||
{
|
||||
return FLIP_RI(VCONJ(x));
|
||||
}
|
||||
|
||||
static inline V VFMAI(V b, V c)
|
||||
{
|
||||
const V mp = VLIT(-1.0, 1.0);
|
||||
return VFMA(FLIP_RI(b), mp, c);
|
||||
}
|
||||
|
||||
static inline V VFNMSI(V b, V c)
|
||||
{
|
||||
const V mp = VLIT(-1.0, 1.0);
|
||||
return VFNMS(FLIP_RI(b), mp, c);
|
||||
}
|
||||
|
||||
static inline V VFMACONJ(V b, V c)
|
||||
{
|
||||
const V pm = VLIT(1.0, -1.0);
|
||||
return VFMA(b, pm, c);
|
||||
}
|
||||
|
||||
static inline V VFNMSCONJ(V b, V c)
|
||||
{
|
||||
const V pm = VLIT(1.0, -1.0);
|
||||
return VFNMS(b, pm, c);
|
||||
}
|
||||
|
||||
static inline V VFMSCONJ(V b, V c)
|
||||
{
|
||||
return VSUB(VCONJ(b), c);
|
||||
}
|
||||
|
||||
#ifdef FFTW_SINGLE
|
||||
#if 1
|
||||
#define VEXTRACT_REIM(tr, ti, tx) \
|
||||
{ \
|
||||
tr = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 0), \
|
||||
SUFF(vdup_lane)(SUFF(vget_high)(tx), 0)); \
|
||||
ti = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 1), \
|
||||
SUFF(vdup_lane)(SUFF(vget_high)(tx), 1)); \
|
||||
}
|
||||
#else
|
||||
/* this alternative might be faster in an ideal world, but gcc likes
|
||||
to spill VVV onto the stack */
|
||||
#define VEXTRACT_REIM(tr, ti, tx) \
|
||||
{ \
|
||||
float32x4x2_t vvv = SUFF(vtrnq)(tx, tx); \
|
||||
tr = vvv.val[0]; \
|
||||
ti = vvv.val[1]; \
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#define VEXTRACT_REIM(tr, ti, tx) \
|
||||
{ \
|
||||
tr = SUFF(vtrn1q)(tx, tx); \
|
||||
ti = SUFF(vtrn2q)(tx, tx); \
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline V VZMUL(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

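/* Each vector holds VL complex numbers -- two in single precision,
   one in double -- so the single-precision twiddle initializers below
   cover indices v and v+1 where the double-precision ones need only
   v. */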
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
#define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
#define VTW1(v,x) {TW_CEXP, v, x}
#endif
#define TWVL1 VL
static inline V BYTW1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMUL(tx, sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMULJ(tx, sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x)                                                      \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},   \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else
# define VTW2(v,x)                                                      \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t+2*VL, 2, 0);
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V BYTWJ2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t+2*VL, 2, 0);
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
# define VTW3(v,x) {TW_CEXP, v, x}
#endif
# define TWVL3 (VL)

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x)                                                       \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},  \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x)                                                       \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
380
fftw-3.3.10/simd-support/simd-sse2.h
Normal file
@@ -0,0 +1,380 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "SSE/SSE2 only works in single/double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif

#define SIMD_SUFFIX _sse2 /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(FFTW_SINGLE) && !defined(__SSE2__)
# error "compiling simd-sse2.h in double precision without -msse2"
#elif defined(__GNUC__) && defined(FFTW_SINGLE) && !defined(__SSE__)
# error "compiling simd-sse2.h in single precision without -msse"
#endif

#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif

/* some versions of glibc's sys/cdefs.h define __inline to be empty,
   which is wrong because emmintrin.h defines several inline
   procedures */
#ifndef _MSC_VER
#undef __inline
#endif

#ifdef FFTW_SINGLE
# include <xmmintrin.h>
#else
# include <emmintrin.h>
#endif

typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)

#define SHUFVALS(fp0,fp1,fp2,fp3) \
   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

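/* SHUFVALS packs four 2-bit source-lane selectors into the 8-bit
   immediate expected by SHUFPS; e.g. SHUFVALS(0, 0, 2, 2) picks lanes
   {0, 0, 2, 2}, duplicating the real parts of both single-precision
   complex elements (the lane layout is r0, i0, r1, i1). */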
#define VDUPL(x) DS(UNPCKL(x, x), SHUF(x, x, SHUFVALS(0, 0, 2, 2)))
#define VDUPH(x) DS(UNPCKH(x, x), SHUF(x, x, SHUFVALS(1, 1, 3, 3)))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))


#ifdef __GNUC__
/*
 * gcc-3.3 generates slow code for mm_set_ps (write all elements to
 * the stack and load __m128 from the stack).
 *
 * gcc-3.[34] generates slow code for mm_set_ps1 (load into low element
 * and shuffle).
 *
 * This hack forces gcc to generate a constant __m128 at compile time.
 */
union rvec {
     R r[DS(2,4)];
     V v;
};

# ifdef FFTW_SINGLE
#  define DVK(var, val) V var = __extension__ ({ \
     static const union rvec _var = { {val,val,val,val} }; _var.v; })
# else
#  define DVK(var, val) V var = __extension__ ({ \
     static const union rvec _var = { {val,val} }; _var.v; })
# endif
# define LDK(x) x
#else
# define DVK(var, val) const R var = K(val)
# define LDK(x) DS(_mm_set1_pd,_mm_set_ps1)(x)
#endif

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return *(const V *)x;
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     *(V *)x = v;
}

#ifdef FFTW_SINGLE

# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
   'name' used" and the runtime checks for using a variable before it
   is defined, which are erroneously triggered by the LOADL0 / LOADH
   macros since each of them writes only part of VAL. */
#  ifndef __INTEL_COMPILER
#   pragma warning(disable : 4700)
#   pragma runtime_checks("u", off)
#  endif
# endif
# ifdef __INTEL_COMPILER
#  pragma warning(disable : 592)
# endif

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     V var;
     (void)aligned_like; /* UNUSED */
# ifdef __GNUC__
     /* We use inline asm because gcc-3.x generates slow code for
        _mm_loadh_pi().  gcc-3.x insists upon having an existing variable
        for VAL, which is however never used.  Thus, it generates code to
        move values in and out of the variable.  Worse still, gcc-4.0
        stores VAL on the stack, causing valgrind to complain about
        uninitialized reads. */
     __asm__("movlps %1, %0\n\tmovhps %2, %0"
             : "=x"(var) : "m"(x[0]), "m"(x[ivs]));
# else
#  define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#  define LOADL0(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
     var = LOADL0(x, var);
     var = LOADH(x + ivs, var);
# endif
     return var;
}

# ifdef _MSC_VER
#  ifndef __INTEL_COMPILER
#   pragma warning(default : 4700)
#   pragma runtime_checks("u", restore)
#  endif
# endif
# ifdef __INTEL_COMPILER
#  pragma warning(default : 592)
# endif

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon STOREL occurring
        after STOREH */
     STOREH(x + ovs, v);
     STOREL(x, v);
}

#else /* ! FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif

#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */

#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
   deciding "it would be infrequent that people would want to pass more
   than 3 [__m128 parameters] by value."  3 parameters ought to be
   enough for anybody. */
# define STN4(x, v0, v1, v2, v3, ovs)                \
  {                                                  \
     V xxx0, xxx1, xxx2, xxx3;                       \
     xxx0 = UNPCKL(v0, v2);                          \
     xxx1 = UNPCKH(v0, v2);                          \
     xxx2 = UNPCKL(v1, v3);                          \
     xxx3 = UNPCKH(v1, v3);                          \
     STA(x, UNPCKL(xxx0, xxx2), 0, 0);               \
     STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0);         \
     STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0);     \
     STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0);     \
  }
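/* The STN4 macro above is the classic 4x4 in-register transpose: two
   rounds of UNPCKL/UNPCKH produce the transposed rows, and row j of
   the result is stored at x + j*ovs. */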
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     STOREL(x, v);
     STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif

static inline V FLIP_RI(V x)
{
     return SHUF(x, x, DS(1, SHUFVALS(1, 0, 3, 2)));
}

static inline V VCONJ(V x)
{
     /* This will produce -0.0f (or -0.0d) even on broken
        compilers that do not distinguish +0.0 from -0.0.
        I bet some are still around. */
     union uvec {
          unsigned u[4];
          V v;
     };
     /* it looks like gcc-3.3.5 produces slow code unless PM is
        declared static. */
     static const union uvec pm = {
#ifdef FFTW_SINGLE
          { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
          { 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
     };
     return VXOR(pm.v, x);
}

static inline V VBYI(V x)
{
     x = VCONJ(x);
     x = FLIP_RI(x);
     return x;
}

/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
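/* SSE2 has no fused multiply-add instruction, so the VFMA* macros
   above are synthesized from a separate multiply and add/subtract;
   they exist so that the FMA-style codelets compile unchanged on this
   ISA. */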

static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V tx = twp[0];
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V tx = twp[0];
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x)                                                      \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},   \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x)                                                      \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x)                                                       \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},  \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x)                                                       \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
299
fftw-3.3.10/simd-support/simd-vsx.h
Normal file
@@ -0,0 +1,299 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * VSX SIMD implementation added 2015 Erik Lindahl.
 * Erik Lindahl places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "VSX only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif

#define SIMD_SUFFIX _vsx /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#include <altivec.h>
#include <stdio.h>

typedef DS(vector double,vector float) V;

#define VADD(a,b) vec_add(a,b)
#define VSUB(a,b) vec_sub(a,b)
#define VMUL(a,b) vec_mul(a,b)
#define VXOR(a,b) vec_xor(a,b)
#define UNPCKL(a,b) vec_mergel(a,b)
#define UNPCKH(a,b) vec_mergeh(a,b)
#ifdef FFTW_SINGLE
# define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11}; vec_perm(a,a,perm); })
# define VDUPH(a) ({ const vector unsigned char perm = {4,5,6,7,4,5,6,7,12,13,14,15,12,13,14,15}; vec_perm(a,a,perm); })
#else
# define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}; vec_perm(a,a,perm); })
# define VDUPH(a) ({ const vector unsigned char perm = {8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15}; vec_perm(a,a,perm); })
#endif
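/* vec_perm selects bytes: the single-precision VDUPL pattern
   {0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11} replicates words 0 and 2 --
   the real parts of the two complex elements -- and VDUPH likewise
   replicates the imaginary parts (words 1 and 3). */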

static inline V LDK(R f) { return vec_splats(f); }

#define DVK(var, val) const R var = K(val)

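/* VCONJ negates exactly the imaginary lanes: merging the +0.0 and
   -0.0 splats yields a {+0.0, -0.0, ...} mask whose only set bits are
   the sign bits of the imaginary parts, so the XOR flips just those
   signs. */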
static inline V VCONJ(V x)
{
     const V pmpm = vec_mergel(vec_splats((R)0.0), -(vec_splats((R)0.0)));
     return vec_xor(x, pmpm);
}

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
#ifdef __ibmxl__
     return vec_xl(0, (DS(double,float) *)x);
#else
     return (*(const V *)(x));
#endif
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
#ifdef __ibmxl__
     vec_xst(v, 0, x);
#else
     *(V *)x = v;
#endif
}

static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
     const vector unsigned char perm = { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11 };
#else
     const vector unsigned char perm = { 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 };
#endif
     return vec_perm(x, x, perm);
}

#ifdef FFTW_SINGLE

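/* LD gathers one complex number (two floats, fetched as a single
   64-bit chunk via a double splat) from x and another from x + ivs;
   the permutation concatenates the two halves into one vector.  ST
   scatters them back the same way. */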
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     const vector unsigned char perm = {0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};

     return vec_perm((vector float)vec_splats(*(double *)(x)),
                     (vector float)vec_splats(*(double *)(x+ivs)), perm);
}

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     *(double *)(x+ovs) = vec_extract((vector double)v, 1);
     *(double *)x       = vec_extract((vector double)v, 0);
}
#else
/* DOUBLE */

# define LD LDA
# define ST STA

#endif

#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */

#ifdef FFTW_SINGLE

# define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, int ovs)
{
     V xxx0, xxx1, xxx2, xxx3;
     xxx0 = vec_mergeh(v0, v1);
     xxx1 = vec_mergel(v0, v1);
     xxx2 = vec_mergeh(v2, v3);
     xxx3 = vec_mergel(v2, v3);
     *(double *)x           = vec_extract((vector double)xxx0, 0);
     *(double *)(x+ovs)     = vec_extract((vector double)xxx0, 1);
     *(double *)(x+2*ovs)   = vec_extract((vector double)xxx1, 0);
     *(double *)(x+3*ovs)   = vec_extract((vector double)xxx1, 1);
     *(double *)(x+2)       = vec_extract((vector double)xxx2, 0);
     *(double *)(x+ovs+2)   = vec_extract((vector double)xxx2, 1);
     *(double *)(x+2*ovs+2) = vec_extract((vector double)xxx3, 0);
     *(double *)(x+3*ovs+2) = vec_extract((vector double)xxx3, 1);
}
#else /* !FFTW_SINGLE */

static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     x[0]   = vec_extract(v, 0);
     x[ovs] = vec_extract(v, 1);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif

static inline V VBYI(V x)
{
     /* FIXME [matteof 2017-09-21] It is possible to use vpermxor(),
        but gcc and xlc treat the permutation bits differently, and
        gcc-6 seems to generate incorrect code when using
        __builtin_crypto_vpermxor() (i.e., VBYI() works for a small
        test case but fails in the large).

        Punt on vpermxor() for now and do the simple thing. */
     return FLIP_RI(VCONJ(x));
}

/* FMA support */
#define VFMA(a, b, c) vec_madd(a,b,c)
#define VFNMS(a, b, c) vec_nmsub(a,b,c)
#define VFMS(a, b, c) vec_msub(a,b,c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))

static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
     V tx = LDA(t, 0, t);
     V tr = UNPCKH(tx, tx);
     V ti = UNPCKL(tx, tx);
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LDA(t, 0, t);
     V tr = UNPCKH(tx, tx);
     V ti = UNPCKL(tx, tx);
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x)                                                      \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},   \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x)                                                      \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V ti = LDA(t+2*VL, 0, t);
     V tt = VMUL(ti, si);
     V tr = LDA(t, 0, t);
     return VFMA(tr, sr, tt);
}
static inline V BYTWJ2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 0, t);
     V tt = VMUL(tr, sr);
     V ti = LDA(t+2*VL, 0, t);
     return VFNMS(ti, si, tt);
}

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x)                                                       \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},  \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x)                                                       \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"
89
fftw-3.3.10/simd-support/sse2.c
Normal file
@@ -0,0 +1,89 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#include "kernel/ifftw.h"

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
#else
# define DS(d,s) d /* double-precision option */
#endif

#if HAVE_SSE2

# if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)

int X(have_simd_sse2)(void)
{
     return 1;
}

# else /* !x86_64 */

#  include <signal.h>
#  include <setjmp.h>
#  include "x86-cpuid.h"

static jmp_buf jb;

static void sighandler(int x)
{
     UNUSED(x);
     longjmp(jb, 1);
}

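/* Probe for SSE/SSE2 by executing an XORPS/XORPD: if either the CPU
   or the OS (which must save the XMM state) does not support it, the
   instruction raises SIGILL, the handler longjmps back, and the probe
   reports failure.  The .byte sequence encodes the instruction
   directly so that old assemblers need not know about SSE. */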
static int sse2_works(void)
{
     void (*oldsig)(int);
     oldsig = signal(SIGILL, sighandler);
     if (setjmp(jb)) {
          signal(SIGILL, oldsig);
          return 0;
     } else {
# ifdef _MSC_VER
          _asm { DS(xorpd,xorps) xmm0,xmm0 }
# else
          /* asm volatile ("xorpd/s %xmm0, %xmm0"); */
          asm volatile(DS(".byte 0x66; .byte 0x0f; .byte 0x57; .byte 0xc0",
                          ".byte 0x0f; .byte 0x57; .byte 0xc0"));
# endif
          signal(SIGILL, oldsig);
          return 1;
     }
}

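/* CPUID leaf 1 reports SSE2 in EDX bit 26 and SSE in EDX bit 25,
   hence DS(26,25). */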
int X(have_simd_sse2)(void)
{
     static int init = 0, res;

     if (!init) {
          res = !is_386()
               && has_cpuid()
               && (cpuid_edx(1) & (1 << DS(26,25)))
               && sse2_works();
          init = 1;
     }
     return res;
}

# endif

#endif
43
fftw-3.3.10/simd-support/taint.c
Normal file
@@ -0,0 +1,43 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#include "kernel/ifftw.h"
#include "simd-common.h"

#if HAVE_SIMD

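/* A "taint" records alignment information in the low bits of a
   pointer: if the byte stride s * sizeof(R) is not a multiple of
   ALIGNMENT, TAINT_BIT is set, and TAINT_BITA likewise for the
   stricter ALIGNMENTA.  SIMD code (see simd-common.h) untaints the
   pointer before dereferencing it and uses the bits to decide whether
   aligned access is safe. */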
R *X(taint)(R *p, INT s)
{
     if (((unsigned)s * sizeof(R)) % ALIGNMENT)
          p = (R *) (PTRINT(p) | TAINT_BIT);
     if (((unsigned)s * sizeof(R)) % ALIGNMENTA)
          p = (R *) (PTRINT(p) | TAINT_BITA);
     return p;
}

/* join the taint of two pointers that are supposed to be
   the same modulo the taint */
R *X(join_taint)(R *p1, R *p2)
{
     A(UNTAINT(p1) == UNTAINT(p2));
     return (R *)(PTRINT(p1) | PTRINT(p2));
}
#endif
69
fftw-3.3.10/simd-support/vsx.c
Normal file
@@ -0,0 +1,69 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * VSX SIMD implementation added 2015 Erik Lindahl.
 * Erik Lindahl places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#include "kernel/ifftw.h"

#if HAVE_VSX

#if HAVE_SYS_SYSCTL_H
# include <sys/sysctl.h>
#endif

#include <signal.h>
#include <setjmp.h>

static jmp_buf jb;

static void sighandler(int x)
{
     longjmp(jb, 1);
}

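/* Same trick as the SSE2 detection: execute one VSX instruction
   (stxsdx, a VSX scalar store) under a SIGILL handler.  On a CPU
   without VSX the longjmp from the handler makes the probe return
   0. */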
static int really_have_vsx(void)
{
     void (*oldsig)(int);
     oldsig = signal(SIGILL, sighandler);
     if (setjmp(jb)) {
          signal(SIGILL, oldsig);
          return 0;
     } else {
          float mem[2];
          __asm__ __volatile__ ("stxsdx 0,0,%0" :: "r" (mem) : "memory");
          signal(SIGILL, oldsig);
          return 1;
     }
     return 0;
}

int X(have_simd_vsx)(void)
{
     static int init = 0, res;
     if (!init) {
          res = really_have_vsx();
          init = 1;
     }
     return res;
}

#endif
212
fftw-3.3.10/simd-support/x86-cpuid.h
Normal file
@@ -0,0 +1,212 @@
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


/* this code was kindly donated by Eric J. Korpela */

#ifdef _MSC_VER
#include <intrin.h>
#ifndef inline
#define inline __inline
#endif
#endif

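/* Both tests below toggle a bit in EFLAGS and check whether the
   change sticks: is_386() flips the AC flag (bit 18, 0x40000), which
   a 386 cannot modify, and has_cpuid() flips the ID flag (bit 21,
   0x200000), whose writability indicates that the CPUID instruction
   is supported. */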
static inline int is_386()
{
#ifdef _MSC_VER
    unsigned int result, tst;
    _asm {
        pushfd
        pop eax
        mov edx,eax
        xor eax,40000h
        push eax
        popfd
        pushfd
        pop eax
        push edx
        popfd
        mov tst,edx
        mov result,eax
    }
#else
    register unsigned int result, tst;
    __asm__ (
        "pushfl\n\t"
        "popl %0\n\t"
        "movl %0,%1\n\t"
        "xorl $0x40000,%0\n\t"
        "pushl %0\n\t"
        "popfl\n\t"
        "pushfl\n\t"
        "popl %0\n\t"
        "pushl %1\n\t"
        "popfl"
        : "=r" (result), "=r" (tst) /* output */
        : /* no inputs */
    );
#endif
    return (result == tst);
}

static inline int has_cpuid()
{
#ifdef _MSC_VER
    unsigned int result, tst;
    _asm {
        pushfd
        pop eax
        mov edx,eax
        xor eax,200000h
        push eax
        popfd
        pushfd
        pop eax
        push edx
        popfd
        mov tst,edx
        mov result,eax
    }
#else
    register unsigned int result, tst;
    __asm__ (
        "pushfl\n\t"
        "popl %0\n\t"
        "movl %0,%1\n\t"
        "xorl $0x200000,%0\n\t"
        "pushl %0\n\t"
        "popfl\n\t"
        "pushfl\n\t"
        "popl %0\n\t"
        "pushl %1\n\t"
        "popfl"
        : "=r" (result), "=r" (tst) /* output */
        : /* no inputs */
    );
#endif
    return (result != tst);
}

/* cpuid version to get all registers.  Donated by Erik Lindahl from Gromacs. */
static inline void
cpuid_all(int level, int ecxval, int *eax, int *ebx, int *ecx, int *edx)
{
#if (defined _MSC_VER)
    int CPUInfo[4];

# if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
    /* MSVC 9.0 SP1 or later */
    __cpuidex(CPUInfo, level, ecxval);
# else
    __cpuid(CPUInfo, level);
    /* Set an error code if the user wanted a non-zero ecxval, since we did not have cpuidex */
# endif
    *eax = CPUInfo[0];
    *ebx = CPUInfo[1];
    *ecx = CPUInfo[2];
    *edx = CPUInfo[3];

#else
    /* Not MSVC */
    *eax = level;
    *ecx = ecxval;
    *ebx = 0;
    *edx = 0;
    /* Avoid clobbering the global-offset-table register (ebx) in 32-bit PIC code */
# if defined(__PIC__)
    __asm__ ("xchgl %%ebx, %1 \n\t"
             "cpuid \n\t"
             "xchgl %%ebx, %1 \n\t"
             : "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx));
# else
    /* No need to save ebx if we are not in pic mode */
    __asm__ ("cpuid \n\t"
             : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
# endif
#endif
}

static inline int cpuid_edx(int op)
{
# ifdef _MSC_VER
    int result;
    _asm {
        push ebx
        mov eax,op
        cpuid
        mov result,edx
        pop ebx
    }
    return result;
# else
    int eax, ecx, edx;

    __asm__("push %%ebx\n\tcpuid\n\tpop %%ebx"
            : "=a" (eax), "=c" (ecx), "=d" (edx)
            : "a" (op));
    return edx;
# endif
}

static inline int cpuid_ecx(int op)
{
# ifdef _MSC_VER
    int result;
    _asm {
        push ebx
        mov eax,op
        cpuid
        mov result,ecx
        pop ebx
    }
    return result;
# else
    int eax, ecx, edx;

    __asm__("push %%ebx\n\tcpuid\n\tpop %%ebx"
            : "=a" (eax), "=c" (ecx), "=d" (edx)
            : "a" (op));
    return ecx;
# endif
}

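/* XGETBV reads an extended control register (XCR0 for op == 0); AVX
   detection needs it to check that the OS saves the YMM state on
   context switch.  The byte sequence 0x0f 0x01 0xd0 (and the __emit
   equivalent 15, 1, 208) encodes XGETBV for toolchains that predate
   the instruction. */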
static inline int xgetbv_eax(int op)
{
# ifdef _MSC_VER
    int veax, vedx;
    _asm {
        mov ecx,op
#  if defined(__INTEL_COMPILER) || (_MSC_VER >= 1600)
        xgetbv
#  else
        __emit 15
        __emit 1
        __emit 208
#  endif
        mov veax,eax
        mov vedx,edx
    }
    return veax;
# else
    int eax, edx;
    __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (op));
    return eax;
# endif
}