2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions

View File: simd-support/Makefile.am

@@ -0,0 +1,15 @@
AM_CPPFLAGS = -I $(top_srcdir)
noinst_LTLIBRARIES = libsimd_support.la
libsimd_support_la_SOURCES = taint.c simd-common.h \
x86-cpuid.h amd64-cpuid.h \
simd-sse2.h sse2.c \
avx.c simd-avx.h \
avx-128-fma.c simd-avx-128-fma.h \
avx2.c simd-avx2.h simd-avx2-128.h \
avx512.c simd-avx512.h \
kcvi.c simd-kcvi.h \
altivec.c simd-altivec.h vsx.c simd-vsx.h \
neon.c simd-neon.h \
simd-generic128.h simd-generic256.h

View File: simd-support/Makefile.in

@@ -0,0 +1,680 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
VPATH = @srcdir@
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
subdir = simd-support
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
libsimd_support_la_LIBADD =
am_libsimd_support_la_OBJECTS = taint.lo sse2.lo avx.lo avx-128-fma.lo \
avx2.lo avx512.lo kcvi.lo altivec.lo vsx.lo neon.lo
libsimd_support_la_OBJECTS = $(am_libsimd_support_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__maybe_remake_depfiles = depfiles
am__depfiles_remade = ./$(DEPDIR)/altivec.Plo \
./$(DEPDIR)/avx-128-fma.Plo ./$(DEPDIR)/avx.Plo \
./$(DEPDIR)/avx2.Plo ./$(DEPDIR)/avx512.Plo \
./$(DEPDIR)/kcvi.Plo ./$(DEPDIR)/neon.Plo ./$(DEPDIR)/sse2.Plo \
./$(DEPDIR)/taint.Plo ./$(DEPDIR)/vsx.Plo
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
SOURCES = $(libsimd_support_la_SOURCES)
DIST_SOURCES = $(libsimd_support_la_SOURCES)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
AM_CPPFLAGS = -I $(top_srcdir)
noinst_LTLIBRARIES = libsimd_support.la
libsimd_support_la_SOURCES = taint.c simd-common.h \
x86-cpuid.h amd64-cpuid.h \
simd-sse2.h sse2.c \
avx.c simd-avx.h \
avx-128-fma.c simd-avx-128-fma.h \
avx2.c simd-avx2.h simd-avx2-128.h \
avx512.c simd-avx512.h \
kcvi.c simd-kcvi.h \
altivec.c simd-altivec.h vsx.c simd-vsx.h \
neon.c simd-neon.h \
simd-generic128.h simd-generic256.h
all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu simd-support/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu simd-support/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
esac;
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
clean-noinstLTLIBRARIES:
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
@list='$(noinst_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
libsimd_support.la: $(libsimd_support_la_OBJECTS) $(libsimd_support_la_DEPENDENCIES) $(EXTRA_libsimd_support_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) $(libsimd_support_la_OBJECTS) $(libsimd_support_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/altivec.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx-128-fma.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx512.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kcvi.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/neon.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sse2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/taint.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vsx.Plo@am__quote@ # am--include-marker
$(am__depfiles_remade):
@$(MKDIR_P) $(@D)
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
am--depfiles: $(am__depfiles_remade)
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-am
TAGS: tags
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-am
CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-am
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) distdir-am
distdir-am: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
check-am: all-am
check: check-am
all-am: Makefile $(LTLIBRARIES)
installdirs:
install: install-am
install-exec: install-exec-am
install-data: install-data-am
uninstall: uninstall-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-am
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
mostlyclean-generic:
clean-generic:
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
clean: clean-am
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
mostlyclean-am
distclean: distclean-am
-rm -f ./$(DEPDIR)/altivec.Plo
-rm -f ./$(DEPDIR)/avx-128-fma.Plo
-rm -f ./$(DEPDIR)/avx.Plo
-rm -f ./$(DEPDIR)/avx2.Plo
-rm -f ./$(DEPDIR)/avx512.Plo
-rm -f ./$(DEPDIR)/kcvi.Plo
-rm -f ./$(DEPDIR)/neon.Plo
-rm -f ./$(DEPDIR)/sse2.Plo
-rm -f ./$(DEPDIR)/taint.Plo
-rm -f ./$(DEPDIR)/vsx.Plo
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
dvi: dvi-am
dvi-am:
html: html-am
html-am:
info: info-am
info-am:
install-data-am:
install-dvi: install-dvi-am
install-dvi-am:
install-exec-am:
install-html: install-html-am
install-html-am:
install-info: install-info-am
install-info-am:
install-man:
install-pdf: install-pdf-am
install-pdf-am:
install-ps: install-ps-am
install-ps-am:
installcheck-am:
maintainer-clean: maintainer-clean-am
-rm -f ./$(DEPDIR)/altivec.Plo
-rm -f ./$(DEPDIR)/avx-128-fma.Plo
-rm -f ./$(DEPDIR)/avx.Plo
-rm -f ./$(DEPDIR)/avx2.Plo
-rm -f ./$(DEPDIR)/avx512.Plo
-rm -f ./$(DEPDIR)/kcvi.Plo
-rm -f ./$(DEPDIR)/neon.Plo
-rm -f ./$(DEPDIR)/sse2.Plo
-rm -f ./$(DEPDIR)/taint.Plo
-rm -f ./$(DEPDIR)/vsx.Plo
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-am
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-am
pdf-am:
ps: ps-am
ps-am:
uninstall-am:
.MAKE: install-am install-strip
.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
clean-generic clean-libtool clean-noinstLTLIBRARIES \
cscopelist-am ctags ctags-am distclean distclean-compile \
distclean-generic distclean-libtool distclean-tags distdir dvi \
dvi-am html html-am info info-am install install-am \
install-data install-data-am install-dvi install-dvi-am \
install-exec install-exec-am install-html install-html-am \
install-info install-info-am install-man install-pdf \
install-pdf-am install-ps install-ps-am install-strip \
installcheck installcheck-am installdirs maintainer-clean \
maintainer-clean-generic mostlyclean mostlyclean-compile \
mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
tags tags-am uninstall uninstall-am
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

View File: simd-support/altivec.c

@@ -0,0 +1,80 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_ALTIVEC
#if HAVE_SYS_SYSCTL_H
# include <sys/sysctl.h>
#endif
#if HAVE_SYS_SYSCTL_H && HAVE_SYSCTL && defined(CTL_HW) && defined(HW_VECTORUNIT)
/* code for darwin */
static int really_have_altivec(void)
{
int mib[2], altivecp;
size_t len;
mib[0] = CTL_HW;
mib[1] = HW_VECTORUNIT;
len = sizeof(altivecp);
sysctl(mib, 2, &altivecp, &len, NULL, 0);
return altivecp;
}
#else /* GNU/Linux and other non-Darwin systems (!HAVE_SYS_SYSCTL_H etc.) */
#include <signal.h>
#include <setjmp.h>
static jmp_buf jb;
static void sighandler(int x)
{
longjmp(jb, 1);
}
static int really_have_altivec(void)
{
void (*oldsig)(int);
oldsig = signal(SIGILL, sighandler);
if (setjmp(jb)) {
signal(SIGILL, oldsig);
return 0;
} else {
__asm__ __volatile__ (".long 0x10000484"); /* vor 0,0,0 */
signal(SIGILL, oldsig);
return 1;
}
return 0;
}
#endif
int X(have_simd_altivec)(void)
{
static int init = 0, res;
if (!init) {
res = really_have_altivec();
init = 1;
}
return res;
}
#endif
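
The non-Darwin branch above probes for AltiVec by executing a vector instruction under a temporary SIGILL handler and recovering with longjmp if the CPU traps. A standalone sketch of that trap-and-recover pattern (illustration only, not FFTW code; the probed instruction here is a plain nop, so it never actually faults):

/* Standalone sketch of the SIGILL-probe pattern used above (not FFTW code). */
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static jmp_buf probe_jb;

static void probe_sigill(int sig)
{
    (void)sig;
    longjmp(probe_jb, 1);
}

/* Run a candidate instruction; report 0 if it raised SIGILL, 1 otherwise. */
static int can_execute(void (*candidate)(void))
{
    void (*old)(int) = signal(SIGILL, probe_sigill);
    int ok;
    if (setjmp(probe_jb)) {
        ok = 0;                /* SIGILL was delivered: not supported */
    } else {
        candidate();           /* may fault on CPUs lacking the feature */
        ok = 1;
    }
    signal(SIGILL, old);
    return ok;
}

static void try_insn(void) { __asm__ __volatile__ ("nop"); }

int main(void)
{
    printf("probe result: %d\n", can_execute(try_insn));
    return 0;
}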

View File: simd-support/amd64-cpuid.h

@@ -0,0 +1,148 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#ifdef _MSC_VER
#include <intrin.h>
#if (_MSC_VER >= 1600) && !defined(__INTEL_COMPILER)
#include <immintrin.h>
#endif
#endif
/* cpuid version to get all registers. Donated by Erik Lindahl from Gromacs. */
static inline void
cpuid_all(int level, int ecxval, int *eax, int *ebx, int *ecx, int *edx)
{
# ifdef _MSC_VER
int CPUInfo[4];
#if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
/* MSVC 9.0 SP1 or later */
__cpuidex(CPUInfo, level, ecxval);
#else
__cpuid(CPUInfo, level);
#endif
*eax = CPUInfo[0];
*ebx = CPUInfo[1];
*ecx = CPUInfo[2];
*edx = CPUInfo[3];
# else
/* Not MSVC */
*eax = level;
*ecx = ecxval;
*ebx = 0;
*edx = 0;
/* No need to save ebx if we are not in pic mode */
__asm__ ("cpuid \n\t"
: "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
# endif
}
static inline int cpuid_ecx(int op)
{
# ifdef _MSC_VER
# ifdef __INTEL_COMPILER
int result;
_asm {
push rbx
mov eax,op
cpuid
mov result,ecx
pop rbx
}
return result;
# else
int cpu_info[4];
__cpuid(cpu_info,op);
return cpu_info[2];
# endif
# else
int eax, ecx = 0, edx;
__asm__("pushq %%rbx\n\tcpuid\n\tpopq %%rbx"
: "=a" (eax), "+c" (ecx), "=d" (edx)
: "a" (op));
return ecx;
# endif
}
static inline int cpuid_ebx(int op)
{
# ifdef _MSC_VER
# ifdef __INTEL_COMPILER
int result;
_asm {
push rbx
mov eax,op
cpuid
mov result,ebx
pop rbx
}
return result;
# else
int cpu_info[4];
__cpuid(cpu_info,op);
return cpu_info[1];
# endif
# else
int eax, ecx = 0, edx;
__asm__("pushq %%rbx\n\tcpuid\nmov %%ebx,%%ecx\n\tpopq %%rbx"
: "=a" (eax), "+c" (ecx), "=d" (edx)
: "a" (op));
return ecx;
# endif
}
static inline int xgetbv_eax(int op)
{
# ifdef _MSC_VER
# ifdef __INTEL_COMPILER
int veax, vedx;
_asm {
mov ecx,op
xgetbv
mov veax,eax
mov vedx,edx
}
return veax;
# else
# if defined(_MSC_VER) && (_MSC_VER >= 1600)
unsigned __int64 result;
result = _xgetbv(op);
return (int)result;
# else
# error "Need at least Visual Studio 10 SP1 for AVX support"
# endif
# endif
# else
int eax, edx;
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (op));
return eax;
#endif
}
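
The header above only wraps the raw CPUID/XGETBV instructions; the detection files that follow combine them. A hedged usage sketch (GCC or Clang on x86-64 assumed; the include name assumes this header is amd64-cpuid.h) that prints the vendor string returned by leaf 0:

/* Usage sketch for the helpers above (illustration only).
   CPUID leaf 0 returns the vendor string split across ebx, edx, ecx. */
#include <stdio.h>
#include <string.h>
#include "amd64-cpuid.h"       /* the header shown above (assumed name) */

int main(void)
{
    int eax, ebx, ecx, edx;
    char vendor[13];

    cpuid_all(0, 0, &eax, &ebx, &ecx, &edx);
    memcpy(vendor + 0, &ebx, 4);   /* "Genu" or "Auth" */
    memcpy(vendor + 4, &edx, 4);   /* "ineI" or "enti" */
    memcpy(vendor + 8, &ecx, 4);   /* "ntel" or "cAMD" */
    vendor[12] = '\0';
    printf("vendor: %s, max standard leaf: %d\n", vendor, eax);
    return 0;
}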

View File: simd-support/avx-128-fma.c

@@ -0,0 +1,57 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_AVX_128_FMA
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif
int X(have_simd_avx_128_fma)(void)
{
static int init = 0, res = 0;
int eax,ebx,ecx,edx;
if (!init)
{
/* Check if this is an AMD CPU */
cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
/* ebx = 0x68747541 ("Auth"), edx = 0x69746e65 ("enti"), ecx = 0x444d4163 ("cAMD") */
if (ebx==0x68747541 && ecx==0x444d4163 && edx==0x69746e65)
{
/* OK, this is an AMD CPU. Check if we support FMA4 */
cpuid_all(0x80000001,0,&eax,&ebx,&ecx,&edx);
if(ecx & (1<<16))
{
res = 1;
}
}
init = 1;
}
return res;
}
#endif

View File: simd-support/avx.c

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_AVX
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif
int X(have_simd_avx)(void)
{
static int init = 0, res = 0;
int max_stdfn, eax, ebx, ecx, edx;
if (!init) {
cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
max_stdfn = eax;
if (max_stdfn >= 0x1) {
/* have AVX and OSXSAVE? (implies XGETBV exists) */
cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
if ((ecx & 0x18000000) == 0x18000000) {
/* have OS support for XMM, YMM? */
res = ((xgetbv_eax(0) & 0x6) == 0x6);
}
}
init = 1;
}
return res;
}
#endif
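
The constants above encode two separate requirements: CPUID leaf 1 ECX bit 28 (AVX) plus bit 27 (OSXSAVE) form 0x18000000, and XCR0 bits 1 and 2 (XMM and YMM state saved by the OS) form 0x6. The same test spelled out with named bits (sketch only, leaf-count check omitted; relies on cpuid_all and xgetbv_eax from the cpuid header above):

/* The AVX test above restated with named bit positions (sketch only). */
#include "amd64-cpuid.h"               /* assumed name of the header above */

#define CPUID1_ECX_OSXSAVE (1 << 27)   /* OS uses XSAVE, so XGETBV is usable */
#define CPUID1_ECX_AVX     (1 << 28)   /* CPU implements AVX */
#define XCR0_SSE_STATE     (1 << 1)    /* OS saves XMM registers */
#define XCR0_AVX_STATE     (1 << 2)    /* OS saves YMM registers */

static int have_avx_spelled_out(void)
{
    int eax, ebx, ecx, edx;
    int need  = CPUID1_ECX_OSXSAVE | CPUID1_ECX_AVX;   /* 0x18000000 */
    int state = XCR0_SSE_STATE | XCR0_AVX_STATE;       /* 0x6 */

    cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
    if ((ecx & need) != need)
        return 0;
    return (xgetbv_eax(0) & state) == state;
}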

View File: simd-support/avx2.c

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_AVX2
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif
int X(have_simd_avx2_128)(void)
{
static int init = 0, res;
int max_stdfn, eax, ebx, ecx, edx;
if (!init) {
cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
max_stdfn = eax;
if (max_stdfn >= 0x1) {
/* have AVX and OSXSAVE? (implies XGETBV exists) */
cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
if ((ecx & 0x18000000) == 0x18000000) {
/* have AVX2? */
cpuid_all(7,0,&eax,&ebx,&ecx,&edx);
if (ebx & (1 << 5)) {
/* have OS support for XMM, YMM? */
res = ((xgetbv_eax(0) & 0x6) == 0x6);
}
}
}
init = 1;
}
return res;
}
int X(have_simd_avx2)(void)
{
/*
* For now 256-bit AVX2 support is identical to 128-bit.
* This might change in the future if AMD releases AVX2-capable
* chips that work better with the 128-bit flavor, but since AMD
* might just as well implement 256-bit AVX2 efficiently by then,
* we don't want to disable it before we know.
*/
return X(have_simd_avx2_128)();
}
#endif

View File: simd-support/avx512.c

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
* Copyright (c) 2012-2013 Romain Dolbeau
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#include "kernel/ifftw.h"
#if HAVE_AVX512
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
#include "amd64-cpuid.h"
int X(have_simd_avx512)(void)
{
static int init = 0, res;
int max_stdfn, eax, ebx, ecx, edx;
/* NOTE: this code is a total guess. I don't have an avx512
machine available. The code contributed by Erik Lindahl would
crash on a machine without XGETBV, so I had to guess a fix. */
if (!init) {
cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
max_stdfn = eax;
if (max_stdfn >= 0x1) {
/* have OSXSAVE? (implies XGETBV exists) */
cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
if ((ecx & 0x08000000) == 0x08000000) {
/* have AVX512? */
cpuid_all(7,0,&eax,&ebx,&ecx,&edx);
if (ebx & (1 << 16)) {
/* have OS support for XMM, YMM, ZMM */
int zmm_ymm_xmm = (7 << 5) | (1 << 2) | (1 << 1);
res = ((xgetbv_eax(0) & zmm_ymm_xmm) == zmm_ymm_xmm);
}
}
}
init = 1;
}
return res;
}
#else /* 32-bit code */
#error "Avx512 is 64 bits only"
#endif
#endif
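
The wider XCR0 mask above adds the AVX-512 state components on top of the XMM/YMM bits: (7 << 5) covers bits 5-7 (opmask registers k0-k7, the upper 256 bits of ZMM0-15, and ZMM16-31), while (1 << 2) | (1 << 1) are the usual YMM/XMM bits, giving 0xE6 in total. Restated with names (sketch only):

/* Composition of the zmm_ymm_xmm mask used above (illustration only). */
enum {
    XCR0_SSE       = 1 << 1,   /* XMM state */
    XCR0_AVX       = 1 << 2,   /* YMM state */
    XCR0_OPMASK    = 1 << 5,   /* AVX-512 opmask registers k0-k7 */
    XCR0_ZMM_HI256 = 1 << 6,   /* upper 256 bits of ZMM0-ZMM15 */
    XCR0_HI16_ZMM  = 1 << 7    /* ZMM16-ZMM31 */
};
/* (7 << 5) | (1 << 2) | (1 << 1) == 0xE6
   == XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM | XCR0_AVX | XCR0_SSE */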

View File: simd-support/kcvi.c

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
* Copyright (c) 2012-2013 Romain Dolbeau
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#include "kernel/ifftw.h"
#if HAVE_KCVI
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
#include "amd64-cpuid.h"
int X(have_simd_kcvi)(void)
{
static int init = 0, res;
if (!init) {
res = 1;
init = 1;
}
return res;
}
#else /* 32-bit code */
#error "KCvi is 64 bits only"
#endif
#endif

View File: simd-support/neon.c

@@ -0,0 +1,76 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_NEON
/* check for an environment where signals are known to work */
#if defined(unix) || defined(linux)
# include <signal.h>
# include <setjmp.h>
static jmp_buf jb;
static void sighandler(int x)
{
UNUSED(x);
longjmp(jb, 1);
}
static int really_have_neon(void)
{
void (*oldsig)(int);
oldsig = signal(SIGILL, sighandler);
if (setjmp(jb)) {
signal(SIGILL, oldsig);
return 0;
} else {
/* paranoia: encode the instruction in binary because the
assembler may not recognize it without -mfpu=neon */
/*asm volatile ("vand q0, q0, q0");*/
asm volatile (".long 0xf2000150");
signal(SIGILL, oldsig);
return 1;
}
}
int X(have_simd_neon)(void)
{
static int init = 0, res;
if (!init) {
res = really_have_neon();
init = 1;
}
return res;
}
#else
/* don't know how to autodetect NEON; assume it is present */
int X(have_simd_neon)(void)
{
return 1;
}
#endif
#endif

View File: simd-support/simd-altivec.h

@@ -0,0 +1,297 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef FFTW_SINGLE
#error "ALTIVEC only works in single precision"
#endif
/* define these unconditionally, because they are used by
taint.c which is compiled without altivec */
#define SIMD_SUFFIX _altivec /* for renaming */
#define VL 2 /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OKA
#if !defined(__VEC__) && !defined(FAKE__VEC__)
# error "compiling simd-altivec.h requires -maltivec or equivalent"
#endif
#ifdef HAVE_ALTIVEC_H
# include <altivec.h>
#endif
typedef vector float V;
#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val, val, val)
static inline V VADD(V a, V b) { return vec_add(a, b); }
static inline V VSUB(V a, V b) { return vec_sub(a, b); }
static inline V VFMA(V a, V b, V c) { return vec_madd(a, b, c); }
static inline V VFNMS(V a, V b, V c) { return vec_nmsub(a, b, c); }
static inline V VMUL(V a, V b)
{
DVK(zero, -0.0);
return VFMA(a, b, zero);
}
static inline V VFMS(V a, V b, V c) { return VSUB(VMUL(a, b), c); }
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
UNUSED(ivs);
UNUSED(aligned_like);
return vec_ld(0, x);
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
/* common subexpressions */
const INT fivs = sizeof(R) * ivs;
/* you are not expected to understand this: */
const vector unsigned int perm = VLIT(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
vector unsigned char ml = vec_lvsr(fivs + 8, aligned_like);
vector unsigned char mh = vec_lvsl(0, aligned_like);
vector unsigned char msk =
(vector unsigned char)vec_sel((V)mh, (V)ml, perm);
/* end of common subexpressions */
return vec_perm(vec_ld(0, x), vec_ld(fivs, x), msk);
}
/* store lower half */
static inline void STH(R *x, V v, R *aligned_like)
{
v = vec_perm(v, v, vec_lvsr(0, aligned_like));
vec_ste(v, 0, x);
vec_ste(v, sizeof(R), x);
}
static inline void STL(R *x, V v, INT ovs, R *aligned_like)
{
const INT fovs = sizeof(R) * ovs;
v = vec_perm(v, v, vec_lvsr(fovs + 8, aligned_like));
vec_ste(v, fovs, x);
vec_ste(v, sizeof(R) + fovs, x);
}
static inline void STA(R *x, V v, INT ovs, R *aligned_like)
{
UNUSED(ovs);
UNUSED(aligned_like);
vec_st(v, 0, x);
}
static inline void ST(R *x, V v, INT ovs, R *aligned_like)
{
/* WARNING: the extra_iter hack depends upon STH occurring after
STL */
STL(x, v, ovs, aligned_like);
STH(x, v, aligned_like);
}
#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
const INT fovs = sizeof(R) * ovs;
const vector unsigned int even =
VLIT(0x00010203, 0x04050607, 0x10111213, 0x14151617);
const vector unsigned int odd =
VLIT(0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f);
vec_st(vec_perm(v0, v1, (vector unsigned char)even), 0, x);
vec_st(vec_perm(v0, v1, (vector unsigned char)odd), fovs, x);
}
#define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
const INT fovs = sizeof(R) * ovs;
V x0 = vec_mergeh(v0, v2);
V x1 = vec_mergel(v0, v2);
V x2 = vec_mergeh(v1, v3);
V x3 = vec_mergel(v1, v3);
V y0 = vec_mergeh(x0, x2);
V y1 = vec_mergel(x0, x2);
V y2 = vec_mergeh(x1, x3);
V y3 = vec_mergel(x1, x3);
vec_st(y0, 0, x);
vec_st(y1, fovs, x);
vec_st(y2, 2 * fovs, x);
vec_st(y3, 3 * fovs, x);
}
static inline V FLIP_RI(V x)
{
const vector unsigned int perm =
VLIT(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
return vec_perm(x, x, (vector unsigned char)perm);
}
static inline V VCONJ(V x)
{
const V pmpm = VLIT(0.0, -0.0, 0.0, -0.0);
return vec_xor(x, pmpm);
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
static inline V VFMAI(V b, V c)
{
const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
return VFMA(FLIP_RI(b), mpmp, c);
}
static inline V VFNMSI(V b, V c)
{
const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
return VFNMS(FLIP_RI(b), mpmp, c);
}
static inline V VFMACONJ(V b, V c)
{
const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
return VFMA(b, pmpm, c);
}
static inline V VFNMSCONJ(V b, V c)
{
const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
return VFNMS(b, pmpm, c);
}
static inline V VFMSCONJ(V b, V c)
{
return VSUB(VCONJ(b), c);
}
static inline V VZMUL(V tx, V sr)
{
const vector unsigned int real =
VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
const vector unsigned int imag =
VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
V si = VBYI(sr);
V tr = vec_perm(tx, tx, (vector unsigned char)real);
V ti = vec_perm(tx, tx, (vector unsigned char)imag);
return VFMA(ti, si, VMUL(tr, sr));
}
static inline V VZMULJ(V tx, V sr)
{
const vector unsigned int real =
VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
const vector unsigned int imag =
VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
V si = VBYI(sr);
V tr = vec_perm(tx, tx, (vector unsigned char)real);
V ti = vec_perm(tx, tx, (vector unsigned char)imag);
return VFNMS(ti, si, VMUL(tr, sr));
}
static inline V VZMULI(V tx, V si)
{
const vector unsigned int real =
VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
const vector unsigned int imag =
VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
V sr = VBYI(si);
V tr = vec_perm(tx, tx, (vector unsigned char)real);
V ti = vec_perm(tx, tx, (vector unsigned char)imag);
return VFNMS(ti, si, VMUL(tr, sr));
}
static inline V VZMULIJ(V tx, V si)
{
const vector unsigned int real =
VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
const vector unsigned int imag =
VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
V sr = VBYI(si);
V tr = vec_perm(tx, tx, (vector unsigned char)real);
V ti = vec_perm(tx, tx, (vector unsigned char)imag);
return VFMA(ti, si, VMUL(tr, sr));
}
/* twiddle storage #1: compact, slower */
#define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = VBYI(sr);
V tx = twp[0];
V tr = vec_mergeh(tx, tx);
V ti = vec_mergel(tx, tx);
return VFMA(ti, si, VMUL(tr, sr));
}
static inline V BYTWJ1(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = VBYI(sr);
V tx = twp[0];
V tr = vec_mergeh(tx, tx);
V ti = vec_mergel(tx, tx);
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(ti, si, VMUL(tr, sr));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL3 (VL)
/* twiddle storage for split arrays */
#define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File: simd-support/simd-avx-128-fma.h

@@ -0,0 +1,332 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* 128-bit AVX support by Erik Lindahl, 2015.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _avx_128_fma /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in term of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#include <immintrin.h>
#ifdef _MSC_VER
# include <intrin.h>
#elif defined (__GNUC__)
# include <x86intrin.h>
#endif
#if !(defined(__AVX__) && defined(__FMA4__)) /* sanity check */
#error "compiling simd-avx-128-fma.h without -mavx or -mfma4"
#endif
typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define VPERM1 SUFF(_mm_permute_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(_mm_permute_pd(x,0), _mm_moveldup_ps(x))
#define VDUPH(x) DS(_mm_permute_pd(x,3), _mm_movehdup_ps(x))
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))
#define VLIT(x0, x1) DS(_mm_set_pd(x0, x1), _mm_set_ps(x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
#ifdef FFTW_SINGLE
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
var = LOADL(x, SUFF(_mm_undefined_p)());
var = LOADH(x + ivs, var);
#else
var = LOADL(x, var);
var = LOADH(x + ivs, var);
#endif
return var;
}
# ifdef _MSC_VER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + ovs, v);
STOREL(x, v);
}
#else /* ! FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." 3 parameters ought to be enough
for anybody. */
# define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = UNPCKL(v0, v2); \
xxx1 = UNPCKH(v0, v2); \
xxx2 = UNPCKL(v1, v3); \
xxx3 = UNPCKH(v1, v3); \
STA(x, UNPCKL(xxx0, xxx2), 0, 0); \
STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0); \
STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \
STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
STOREL(x, v);
STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
static inline V FLIP_RI(V x)
{
return VPERM1(x, DS(1, SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* Produce a SIMD vector[VL] of (0 + -0i).
We really want to write this:
V pmpm = VLIT(-0.0, 0.0);
but historically some compilers have ignored the distinction
between +0 and -0. It looks like 'gcc-8 -ffast-math' treats -0
as 0 too.
*/
union uvec {
unsigned u[4];
V v;
};
static const union uvec pmpm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pmpm.v, x);
}
static inline V VBYI(V x)
{
x = VCONJ(x);
x = FLIP_RI(x);
return x;
}
/* FMA support */
#define VFMA(a, b, c) SUFF(_mm_macc_p)(a,b,c)
#define VFNMS(a, b, c) SUFF(_mm_nmacc_p)(a,b,c)
#define VFMS(a, b, c) SUFF(_mm_msub_p)(a,b,c)
#define VFMAI(b, c) SUFF(_mm_addsub_p)(c,FLIP_RI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm_addsub_p)(c,b)
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(tr, sr);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_addsub_p)(tr,ti);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
tr = VMUL(tr, FLIP_RI(sr));
return SUFF(_mm_addsub_p)(ti,tr);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
tr = VMUL(tr, sr);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_addsub_p)(tr,ti);
}
static inline V BYTWJ1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File: simd-support/simd-avx.h

@@ -0,0 +1,404 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _avx /* for renaming */
#define VL DS(2, 4) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__AVX__) /* sanity check */
#error "compiling simd-avx.h without -mavx"
#endif
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#include <immintrin.h>
typedef DS(__m256d, __m256) V;
#define VADD SUFF(_mm256_add_p)
#define VSUB SUFF(_mm256_sub_p)
#define VMUL SUFF(_mm256_mul_p)
#define VXOR SUFF(_mm256_xor_p)
#define VSHUF SUFF(_mm256_shuffle_p)
#define SHUFVALD(fp0,fp1) \
(((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(_mm256_unpacklo_pd(x, x), VSHUF(x, x, SHUFVALS(0, 0, 2, 2)))
#define VDUPH(x) DS(_mm256_unpackhi_pd(x, x), VSHUF(x, x, SHUFVALS(1, 1, 3, 3)))
#define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return SUFF(_mm256_loadu_p)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
SUFF(_mm256_storeu_p)(x, v);
}
#if FFTW_SINGLE
# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
'name' used" and the runtime checks for use of a variable before it is
defined, which are erroneously triggered by the LOADL / LOADH macros
because each of them only initializes part of the value. */
# ifndef __INTEL_COMPILER
# pragma warning(disable : 4700)
# pragma runtime_checks("u", off)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(disable : 592)
# endif
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)
/* it seems like the only AVX way to store 4 complex floats is to
extract two pairs of complex floats into two __m128 registers, and
then use SSE-like half-stores. Similarly, to load 4 complex
floats, we load two pairs of complex floats into two __m128
registers, and then pack the two __m128 registers into one __m256
value. */
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
__m128 l, h;
V v;
(void)aligned_like; /* UNUSED */
l = LOADL(x, l);
l = LOADH(x + ivs, l);
h = LOADL(x + 2*ivs, h);
h = LOADH(x + 3*ivs, h);
v = _mm256_castps128_ps256(l);
v = _mm256_insertf128_ps(v, h, 1);
return v;
}
# ifdef _MSC_VER
# ifndef __INTEL_COMPILER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(default : 592)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
__m128 h = _mm256_extractf128_ps(v, 1);
__m128 l = _mm256_castps256_ps128(v);
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + 3*ovs, h);
STOREL(x + 2*ovs, h);
STOREH(x + ovs, l);
STOREL(x, l);
}
#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
__m128 h0 = _mm256_extractf128_ps(x0, 1);
__m128 l0 = _mm256_castps256_ps128(x0);
__m128 h1 = _mm256_extractf128_ps(x1, 1);
__m128 l1 = _mm256_castps256_ps128(x1);
*(__m128 *)(x + 3*ovs) = h1;
*(__m128 *)(x + 2*ovs) = h0;
*(__m128 *)(x + 1*ovs) = l1;
*(__m128 *)(x + 0*ovs) = l0;
}
#define STM4(x, v, ovs, aligned_like) /* no-op */
#define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
V yyy0, yyy1, yyy2, yyy3; \
xxx0 = _mm256_unpacklo_ps(v0, v2); \
xxx1 = _mm256_unpackhi_ps(v0, v2); \
xxx2 = _mm256_unpacklo_ps(v1, v3); \
xxx3 = _mm256_unpackhi_ps(v1, v3); \
yyy0 = _mm256_unpacklo_ps(xxx0, xxx2); \
yyy1 = _mm256_unpackhi_ps(xxx0, xxx2); \
yyy2 = _mm256_unpacklo_ps(xxx1, xxx3); \
yyy3 = _mm256_unpackhi_ps(xxx1, xxx3); \
*(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0); \
*(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1); \
*(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1); \
*(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1); \
*(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2); \
*(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1); \
*(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3); \
*(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1); \
}
#else
static inline __m128d VMOVAPD_LD(const R *x)
{
/* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
into a 256-bit vmovapd, which requires 32-byte alignment instead of
16-byte alignment.
Force the use of vmovapd via asm until compilers stabilize.
*/
#if defined(__GNUC__)
__m128d var;
__asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
return var;
#else
return *(const __m128d *)x;
#endif
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
(void)aligned_like; /* UNUSED */
var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
return var;
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon the store of the low
part occurring after the store of the high part */
*(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
*(__m128d *)x = _mm256_castpd256_pd128(v);
}
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." Even though the comment
was made about __m128 parameters, it appears to apply to __m256
parameters as well. */
#define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = _mm256_unpacklo_pd(v0, v1); \
xxx1 = _mm256_unpackhi_pd(v0, v1); \
xxx2 = _mm256_unpacklo_pd(v2, v3); \
xxx3 = _mm256_unpackhi_pd(v2, v3); \
STA(x, _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0); \
STA(x + ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0); \
STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
}
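/* Note: the 0x20 / 0x31 selectors of _mm256_permute2f128_pd pick
(low128(a), low128(b)) and (high128(a), high128(b)) respectively,
recombining the unpacked halves into the four output rows. */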
#endif
static inline V FLIP_RI(V x)
{
return VSHUF(x, x,
DS(SHUFVALD(1, 0),
SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* Produce a SIMD vector[VL] of (0 + -0i).
We really want to write this:
V pmpm = VLIT(-0.0, 0.0);
but historically some compilers have ignored the distinction
between +0 and -0. It looks like 'gcc-8 -ffast-math' treats -0
as 0 too.
*/
union uvec {
unsigned u[8];
V v;
};
static const union uvec pmpm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000,
0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pmpm.v, x);
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
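/* The VZMUL* helpers below compute the complex product
t*s = (t.re*s.re - t.im*s.im) + i*(t.re*s.im + t.im*s.re):
VDUPL/VDUPH broadcast t.re and t.im within each complex slot,
VBYI(s) yields i*s, and the final FMA combines the two partial
products. */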
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)
/* Use VZEROUPPER to avoid the penalty of switching from AVX to SSE.
See Intel Optimization Manual (April 2011, version 248966), Section
11.3 */
#define VLEAVE _mm256_zeroupper
#include "simd-common.h"

View File

@@ -0,0 +1,342 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* 128-bit AVX2 support by Erik Lindahl, 2015.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX2 only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _avx2_128 /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__AVX2__) /* sanity check */
#error "compiling simd-avx2-128.h without avx2 support"
#endif
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#include <immintrin.h>
typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define VPERM1 SUFF(_mm_permute_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(_mm_permute_pd(x,0), _mm_moveldup_ps(x))
#define VDUPH(x) DS(_mm_permute_pd(x,3), _mm_movehdup_ps(x))
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))
#define VLIT(x0, x1) DS(_mm_set_pd(x0, x1), _mm_set_ps(x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
#ifdef FFTW_SINGLE
# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
'name' used" and the runtime checks for use of a variable before it
is defined, which are erroneously triggered by the LOADL / LOADH
macros since each of them modifies only part of the value. */
# ifndef __INTEL_COMPILER
# pragma warning(disable : 4700)
# pragma runtime_checks("u", off)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(disable : 592)
# endif
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
__m128 l0, l1;
(void)aligned_like; /* UNUSED */
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
l0 = LOADL(x, SUFF(_mm_undefined_p)());
l1 = LOADL(x + ivs, SUFF(_mm_undefined_p)());
#else
l0 = LOADL(x, l0);
l1 = LOADL(x + ivs, l1);
#endif
return SUFF(_mm_movelh_p)(l0,l1);
}
# ifdef _MSC_VER
# ifndef __INTEL_COMPILER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(default : 592)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + ovs, v);
STOREL(x, v);
}
#else /* ! FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." 3 parameters ought to be enough
for anybody. */
# define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = UNPCKL(v0, v2); \
xxx1 = UNPCKH(v0, v2); \
xxx2 = UNPCKL(v1, v3); \
xxx3 = UNPCKH(v1, v3); \
STA(x, UNPCKL(xxx0, xxx2), 0, 0); \
STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0); \
STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \
STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
STOREL(x, v);
STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
static inline V FLIP_RI(V x)
{
return VPERM1(x, DS(1, SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* Produce a SIMD vector[VL] of (0 + -0i).
We really want to write this:
V pmpm = VLIT(-0.0, 0.0);
but historically some compilers have ignored the distinction
between +0 and -0. It looks like 'gcc-8 -ffast-math' treats -0
as 0 too.
*/
union uvec {
unsigned u[4];
V v;
};
static const union uvec pmpm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pmpm.v, x);
}
static inline V VBYI(V x)
{
x = VCONJ(x);
x = FLIP_RI(x);
return x;
}
/* FMA support */
#define VFMA(a, b, c) SUFF(_mm_fmadd_p)(a,b,c)
#define VFNMS(a, b, c) SUFF(_mm_fnmadd_p)(a,b,c)
#define VFMS(a, b, c) SUFF(_mm_fmsub_p)(a,b,c)
#define VFMAI(b, c) SUFF(_mm_addsub_p)(c,FLIP_RI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm_addsub_p)(c,b)
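/* Note on the addsub forms above: SUFF(_mm_addsub_p) subtracts in the
even (real) lanes and adds in the odd (imaginary) lanes, so
SUFF(_mm_addsub_p)(c, FLIP_RI(b)) == c + i*b (VFMAI) and
SUFF(_mm_addsub_p)(c, b) == c - conj(b) (VFNMSCONJ). */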
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_fmaddsub_p)(tr,sr,ti);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_fmsubadd_p)(tr,sr,ti);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(tr, FLIP_RI(sr));
return SUFF(_mm_fmaddsub_p)(ti,sr,tr);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_fmaddsub_p)(tr,sr,ti);
}
static inline V BYTWJ1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_fmsubadd_p)(tr,sr,ti);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,414 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* Modifications by Romain Dolbeau & Erik Lindahl, derived from simd-avx.h
* Romain Dolbeau hereby places his modifications in the public domain.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX2 only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _avx2 /* for renaming */
#define VL DS(2, 4) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__AVX2__) /* sanity check */
#error "compiling simd-avx2.h without avx2 support"
#endif
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#include <immintrin.h>
typedef DS(__m256d, __m256) V;
#define VADD SUFF(_mm256_add_p)
#define VSUB SUFF(_mm256_sub_p)
#define VMUL SUFF(_mm256_mul_p)
#define VXOR SUFF(_mm256_xor_p)
#define VSHUF SUFF(_mm256_shuffle_p)
#define VPERM1 SUFF(_mm256_permute_p)
#define SHUFVALD(fp0,fp1) \
(((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(_mm256_movedup_pd(x), _mm256_moveldup_ps(x))
#define VDUPH(x) DS(_mm256_permute_pd(x,SHUFVALD(1,1)), _mm256_movehdup_ps(x))
#define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return SUFF(_mm256_loadu_p)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
SUFF(_mm256_storeu_p)(x, v);
}
#if FFTW_SINGLE
# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
'name' used" and the runtime checks for use of a variable before it
is defined, which are erroneously triggered by the LOADL / LOADH
macros since each of them modifies only part of the value. */
# ifndef __INTEL_COMPILER
# pragma warning(disable : 4700)
# pragma runtime_checks("u", off)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(disable : 592)
# endif
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
__m128 l0, l1, h0, h1;
(void)aligned_like; /* UNUSED */
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
l0 = LOADL(x, SUFF(_mm_undefined_p)());
l1 = LOADL(x + ivs, SUFF(_mm_undefined_p)());
h0 = LOADL(x + 2*ivs, SUFF(_mm_undefined_p)());
h1 = LOADL(x + 3*ivs, SUFF(_mm_undefined_p)());
#else
l0 = LOADL(x, l0);
l1 = LOADL(x + ivs, l1);
h0 = LOADL(x + 2*ivs, h0);
h1 = LOADL(x + 3*ivs, h1);
#endif
l0 = SUFF(_mm_movelh_p)(l0,l1);
h0 = SUFF(_mm_movelh_p)(h0,h1);
return _mm256_insertf128_ps(_mm256_castps128_ps256(l0), h0, 1);
}
# ifdef _MSC_VER
# ifndef __INTEL_COMPILER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(default : 592)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
__m128 h = _mm256_extractf128_ps(v, 1);
__m128 l = _mm256_castps256_ps128(v);
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + 3*ovs, h);
STOREL(x + 2*ovs, h);
STOREH(x + ovs, l);
STOREL(x, l);
}
#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
__m128 h0 = _mm256_extractf128_ps(x0, 1);
__m128 l0 = _mm256_castps256_ps128(x0);
__m128 h1 = _mm256_extractf128_ps(x1, 1);
__m128 l1 = _mm256_castps256_ps128(x1);
*(__m128 *)(x + 3*ovs) = h1;
*(__m128 *)(x + 2*ovs) = h0;
*(__m128 *)(x + 1*ovs) = l1;
*(__m128 *)(x + 0*ovs) = l0;
}
#define STM4(x, v, ovs, aligned_like) /* no-op */
#define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
V yyy0, yyy1, yyy2, yyy3; \
xxx0 = _mm256_unpacklo_ps(v0, v2); \
xxx1 = _mm256_unpackhi_ps(v0, v2); \
xxx2 = _mm256_unpacklo_ps(v1, v3); \
xxx3 = _mm256_unpackhi_ps(v1, v3); \
yyy0 = _mm256_unpacklo_ps(xxx0, xxx2); \
yyy1 = _mm256_unpackhi_ps(xxx0, xxx2); \
yyy2 = _mm256_unpacklo_ps(xxx1, xxx3); \
yyy3 = _mm256_unpackhi_ps(xxx1, xxx3); \
*(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0); \
*(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1); \
*(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1); \
*(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1); \
*(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2); \
*(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1); \
*(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3); \
*(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1); \
}
#else
static inline __m128d VMOVAPD_LD(const R *x)
{
/* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
into a 256-bit vmovapd, which requires 32-byte alignment instead of
16-byte alignment.
Force the use of vmovapd via asm until compilers stabilize.
*/
#if defined(__GNUC__)
__m128d var;
__asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
return var;
#else
return *(const __m128d *)x;
#endif
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
(void)aligned_like; /* UNUSED */
var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
return var;
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon the store of the low
part occurring after the store of the high part */
*(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
*(__m128d *)x = _mm256_castpd256_pd128(v);
}
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." Even though the comment
was made about __m128 parameters, it appears to apply to __m256
parameters as well. */
#define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = _mm256_unpacklo_pd(v0, v1); \
xxx1 = _mm256_unpackhi_pd(v0, v1); \
xxx2 = _mm256_unpacklo_pd(v2, v3); \
xxx3 = _mm256_unpackhi_pd(v2, v3); \
STA(x, _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0); \
STA(x + ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0); \
STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
}
#endif
static inline V FLIP_RI(V x)
{
return VPERM1(x, DS(SHUFVALD(1, 0), SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* Produce a SIMD vector[VL] of (0 + -0i).
We really want to write this:
V pmpm = VLIT(-0.0, 0.0);
but historically some compilers have ignored the distinction
between +0 and -0. It looks like 'gcc-8 -ffast-math' treats -0
as 0 too.
*/
union uvec {
unsigned u[8];
V v;
};
static const union uvec pmpm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000,
0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pmpm.v, x);
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
/* FMA support */
#define VFMA SUFF(_mm256_fmadd_p)
#define VFNMS SUFF(_mm256_fnmadd_p)
#define VFMS SUFF(_mm256_fmsub_p)
#define VFMAI(b, c) SUFF(_mm256_addsub_p)(c, FLIP_RI(b)) /* VADD(c, VBYI(b)) */
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm256_addsub_p)(c, b) /* VSUB(c, VCONJ(b)) */
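/* The fused forms below exploit SUFF(_mm256_fmaddsub_p), which computes
a*b - c in even (real) lanes and a*b + c in odd (imaginary) lanes.
In VZMUL, a = sr, b = VDUPL(tx) and c = VMUL(FLIP_RI(sr), VDUPH(tx)),
giving (sr.re*tx.re - sr.im*tx.im, sr.im*tx.re + sr.re*tx.im), i.e.
the complex product tx*sr; VZMULJ uses the complementary fmsubadd
form to obtain conj(tx)*sr. */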
static inline V VZMUL(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* tr = VMUL(sr, tr); */
/* sr = VBYI(sr); */
/* return VFMA(ti, sr, tr); */
return SUFF(_mm256_fmaddsub_p)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}
static inline V VZMULJ(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* tr = VMUL(sr, tr); */
/* sr = VBYI(sr); */
/* return VFNMS(ti, sr, tr); */
return SUFF(_mm256_fmsubadd_p)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
/*
* Keep the old version
* (2 permute, 1 shuffle, 1 constant load (L1), 1 xor, 2 fp), since the below FMA one
* would be (2 permute, 1 shuffle, 1 xor (setzero), 3 fp), but with a longer pipeline.
*
* Alternative new fma version:
* return SUFF(_mm256_addsub_p)(SUFF(_mm256_fnmadd_p)(sr, VDUPH(tx), SUFF(_mm256_setzero_p)()),
* VMUL(FLIP_RI(sr), VDUPL(tx)));
*/
}
static inline V VZMULIJ(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* ti = VMUL(ti, sr); */
/* sr = VBYI(sr); */
/* return VFMA(tr, sr, ti); */
return SUFF(_mm256_fmaddsub_p)(sr, VDUPH(tx), VMUL(FLIP_RI(sr), VDUPL(tx)));
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE _mm256_zeroupper
#include "simd-common.h"

View File

@@ -0,0 +1,316 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
*
* AVX-512 support implemented by Romain Dolbeau.
* Romain Dolbeau hereby places his modifications in the public domain.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX-512 vector instructions only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _ps
# define SCAL(x) x ## f
#else /* !FFTW_SINGLE */
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _pd
# define SCAL(x) x
#endif /* FFTW_SINGLE */
#define SIMD_SUFFIX _avx512 /* for renaming */
#define VL DS(4, 8) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__AVX512F__) /* sanity check */
#error "compiling simd-avx512.h without avx-512f support"
#endif
#if !defined(HAVE_AVX2)
#warning "You should probably enable AVX2 with --enable-avx2 for AVX-512"
#endif
#include <immintrin.h>
typedef DS(__m512d, __m512) V;
#define VLIT(re, im) DS(SUFF(_mm512_setr)(im, re, im, re, im, re, im, re),SUFF(_mm512_setr)(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re))
#define VLIT1(val) SUFF(_mm512_set1)(val)
#define LDK(x) x
#define DVK(var, val) V var = VLIT1(val)
#define VZERO SUFF(_mm512_setzero)()
#define VDUPL(x) DS(_mm512_movedup_pd(x),_mm512_moveldup_ps(x))
#define VDUPH(x) DS(_mm512_unpackhi_pd(x, x),_mm512_movehdup_ps(x))
#define FLIP_RI(x) SUFF(_mm512_shuffle)(x, x, DS(0x55,0xB1))
#define VCONJ(x) SUFF(_mm512_fmsubadd)(VZERO, VZERO, x)
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
#define VADD(a,b) SUFF(_mm512_add)(a,b)
#define VSUB(a,b) SUFF(_mm512_sub)(a,b)
#define VMUL(a,b) SUFF(_mm512_mul)(a,b)
#define VFMA(a, b, c) SUFF(_mm512_fmadd)(a, b, c)
#define VFMS(a, b, c) SUFF(_mm512_fmsub)(a, b, c)
#define VFNMS(a, b, c) SUFF(_mm512_fnmadd)(a, b, c)
#define VFMAI(b, c) SUFF(_mm512_fmaddsub)(VLIT1(1.), c, FLIP_RI(b))
#define VFNMSI(b, c) SUFF(_mm512_fmsubadd)(VLIT1(1.), c, FLIP_RI(b))
#define VFMACONJ(b,c) SUFF(_mm512_fmsubadd)(VLIT1(1.), c, b)
#define VFMSCONJ(b,c) SUFF(_mm512_fmsubadd)(VLIT1(-1.), c, b)
#define VFNMSCONJ(b,c) SUFF(_mm512_fmaddsub)(VLIT1(1.), c, b)
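/* Sign conventions used above: SUFF(_mm512_fmaddsub) subtracts its third
operand in even (real) lanes and adds it in odd (imaginary) lanes;
SUFF(_mm512_fmsubadd) does the opposite.  Hence
VCONJ(x)    = fmsubadd(0, 0, x)          = (x.re, -x.im), and
VFMAI(b, c) = fmaddsub(1, c, FLIP_RI(b)) = c + i*b. */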
static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return SUFF(_mm512_loadu)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
SUFF(_mm512_storeu)(x, v);
}
#if FFTW_SINGLE
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
6 * ivs + 1, 6 * ivs,
5 * ivs + 1, 5 * ivs,
4 * ivs + 1, 4 * ivs,
3 * ivs + 1, 3 * ivs,
2 * ivs + 1, 2 * ivs,
1 * ivs + 1, 1 * ivs,
0 * ivs + 1, 0 * ivs);
return _mm512_i32gather_ps(index, x, 4);
}
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
6 * ovs + 1, 6 * ovs,
5 * ovs + 1, 5 * ovs,
4 * ovs + 1, 4 * ovs,
3 * ovs + 1, 3 * ovs,
2 * ovs + 1, 2 * ovs,
1 * ovs + 1, 1 * ovs,
0 * ovs + 1, 0 * ovs);
_mm512_i32scatter_ps(x, index, v, 4);
}
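/* Illustrative special case: with ivs == ovs == 2 (a contiguous array of
complex floats) the gather/scatter indices above are simply 0..15, so
LDu and STu reduce to a contiguous unaligned load/store. */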
#else /* !FFTW_SINGLE */
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m256i index = _mm256_set_epi32(3 * ivs + 1, 3 * ivs,
2 * ivs + 1, 2 * ivs,
1 * ivs + 1, 1 * ivs,
0 * ivs + 1, 0 * ivs);
return _mm512_i32gather_pd(index, x, 8);
}
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m256i index = _mm256_set_epi32(3 * ovs + 1, 3 * ovs,
2 * ovs + 1, 2 * ovs,
1 * ovs + 1, 1 * ovs,
0 * ovs + 1, 0 * ovs);
_mm512_i32scatter_pd(x, index, v, 8);
}
#endif /* FFTW_SINGLE */
#define LD LDu
#define ST STu
#ifdef FFTW_SINGLE
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(15 * ovs, 14 * ovs,
13 * ovs, 12 * ovs,
11 * ovs, 10 * ovs,
9 * ovs, 8 * ovs,
7 * ovs, 6 * ovs,
5 * ovs, 4 * ovs,
3 * ovs, 2 * ovs,
1 * ovs, 0 * ovs);
_mm512_i32scatter_ps(x, index, v, 4);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#else /* !FFTW_SINGLE */
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m256i index = _mm256_set_epi32(7 * ovs, 6 * ovs,
5 * ovs, 4 * ovs,
3 * ovs, 2 * ovs,
1 * ovs, 0 * ovs);
_mm512_i32scatter_pd(x, index, v, 8);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#endif /* FFTW_SINGLE */
static inline V VZMUL(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* tr = VMUL(sr, tr); */
/* sr = VBYI(sr); */
/* return VFMA(ti, sr, tr); */
return SUFF(_mm512_fmaddsub)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}
static inline V VZMULJ(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* tr = VMUL(sr, tr); */
/* sr = VBYI(sr); */
/* return VFNMS(ti, sr, tr); */
return SUFF(_mm512_fmsubadd)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
/* return SUFF(_mm512_addsub)(SUFF(_mm512_fnmadd)(sr, VDUPH(tx), VZERO), VMUL(FLIP_RI(sr), VDUPL(tx))); */
}
static inline V VZMULIJ(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* ti = VMUL(ti, sr); */
/* sr = VBYI(sr); */
/* return VFMA(tr, sr, ti); */
return SUFF(_mm512_fmaddsub)(sr, VDUPH(tx), VMUL(FLIP_RI(sr), VDUPL(tx)));
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v , x}, {TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
{TW_SIN, v , -x}, {TW_SIN, v , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v , x}, {TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v , -x}, {TW_SIN, v , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
/* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
/* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3(v,x) VTW1(v,x)
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v , x}, {TW_COS, v+1 , x}, {TW_COS, v+2 , x}, {TW_COS, v+3 , x}, \
{TW_COS, v+4 , x}, {TW_COS, v+5 , x}, {TW_COS, v+6 , x}, {TW_COS, v+7 , x}, \
{TW_COS, v+8 , x}, {TW_COS, v+9 , x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
{TW_SIN, v , x}, {TW_SIN, v+1 , x}, {TW_SIN, v+2 , x}, {TW_SIN, v+3 , x}, \
{TW_SIN, v+4 , x}, {TW_SIN, v+5 , x}, {TW_SIN, v+6 , x}, {TW_SIN, v+7 , x}, \
{TW_SIN, v+8 , x}, {TW_SIN, v+9 , x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}
#else /* !FFTW_SINGLE */
# define VTWS(v,x) \
{TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v , x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#endif /* FFTW_SINGLE */
#define TWVLS (2 * VL)
#define VLEAVE _mm256_zeroupper
#include "simd-common.h"

View File

@@ -0,0 +1,98 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* detection of alignment. This is complicated because a machine may
support multiple SIMD extensions (e.g. SSE2 and AVX) but only one
set of alignment constraints. So this alignment stuff cannot be
defined in the SIMD header files. Rather than defining a separate
set of "machine" header files, we just do this ugly ifdef here. */
#if defined(HAVE_SSE2) || defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX_128_FMA) || defined(HAVE_AVX512)
# if defined(FFTW_SINGLE)
# define ALIGNMENT 8 /* Alignment for the LD/ST macros */
# define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */
# else
# define ALIGNMENT 16 /* Alignment for the LD/ST macros */
# define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */
# endif
#elif defined(HAVE_ALTIVEC)
# define ALIGNMENT 8 /* Alignment for the LD/ST macros */
# define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */
#elif defined(HAVE_NEON) || defined(HAVE_VSX)
# define ALIGNMENT 8 /* Alignment for the LD/ST macros */
# define ALIGNMENTA 8 /* Alignment for the LDA/STA macros */
#elif defined(HAVE_KCVI)
# if defined(FFTW_SINGLE)
# define ALIGNMENT 8 /* Alignment for the LD/ST macros */
# else
# define ALIGNMENT 16 /* Alignment for the LD/ST macros */
# endif
# define ALIGNMENTA 64 /* Alignment for the LDA/STA macros */
#elif defined(HAVE_GENERIC_SIMD256)
# if defined(FFTW_SINGLE)
# define ALIGNMENT 8
# define ALIGNMENTA 32
# else
# define ALIGNMENT 16
# define ALIGNMENTA 32
# endif
#elif defined(HAVE_GENERIC_SIMD128)
# if defined(FFTW_SINGLE)
# define ALIGNMENT 8
# define ALIGNMENTA 16
# else
# define ALIGNMENT 16
# define ALIGNMENTA 16
# endif
#endif
#if HAVE_SIMD
# ifndef ALIGNMENT
# error "ALIGNMENT not defined"
# endif
# ifndef ALIGNMENTA
# error "ALIGNMENTA not defined"
# endif
#endif
/* rename for precision and for SIMD extensions */
#define XSIMD0(name, suffix) CONCAT(name, suffix)
#define XSIMD(name) XSIMD0(X(name), SIMD_SUFFIX)
#define XSIMD_STRING(x) x STRINGIZE(SIMD_SUFFIX)
/* TAINT_BIT is set if pointers are not guaranteed to be multiples of
ALIGNMENT */
#define TAINT_BIT 1
/* TAINT_BITA is set if pointers are not guaranteed to be multiples of
ALIGNMENTA */
#define TAINT_BITA 2
#define PTRINT(p) ((uintptr_t)(p))
#define ALIGNED(p) \
(((PTRINT(UNTAINT(p)) % ALIGNMENT) == 0) && !(PTRINT(p) & TAINT_BIT))
#define ALIGNEDA(p) \
(((PTRINT(UNTAINT(p)) % ALIGNMENTA) == 0) && !(PTRINT(p) & TAINT_BITA))
#define SIMD_STRIDE_OK(x) (!(((x) * sizeof(R)) % ALIGNMENT))
#define SIMD_STRIDE_OKA(x) (!(((x) * sizeof(R)) % ALIGNMENTA))
#define SIMD_VSTRIDE_OK SIMD_STRIDE_OK
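/* Worked example: in double precision on x86, ALIGNMENT == 16 and
sizeof(R) == 8, so SIMD_STRIDE_OK(x) holds exactly for even strides
(x * 8 bytes is then a multiple of 16); ALIGNED(p) additionally
requires that the pointer carry no TAINT_BIT. */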

View File

@@ -0,0 +1,288 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* Generic128d added by Romain Dolbeau, and turned into simd-generic128.h
* with single & double precision by Erik Lindahl.
* Romain Dolbeau hereby places his modifications in the public domain.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "Generic simd128 only works in single or double precision"
#endif
#define SIMD_SUFFIX _generic_simd128 /* for renaming */
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define VDUPL(x) (V){x[0],x[0],x[2],x[2]}
# define VDUPH(x) (V){x[1],x[1],x[3],x[3]}
# define DVK(var, val) V var = {val,val,val,val}
#else
# define DS(d,s) d /* double-precision option */
# define VDUPL(x) (V){x[0],x[0]}
# define VDUPH(x) (V){x[1],x[1]}
# define DVK(var, val) V var = {val, val}
#endif
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
typedef DS(double,float) V __attribute__ ((vector_size(16)));
#define VADD(a,b) ((a)+(b))
#define VSUB(a,b) ((a)-(b))
#define VMUL(a,b) ((a)*(b))
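/* V is a GCC/Clang vector_size type, so the operators above act
element-wise: in double precision V holds one complex number {re, im}
and VMUL({a,b}, {c,d}) == {a*c, b*d}; in single precision V holds two
complex numbers. */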
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
V res;
res[0] = x[0];
res[1] = x[1];
#ifdef FFTW_SINGLE
res[2] = x[ivs];
res[3] = x[ivs+1];
#endif
return res;
}
#ifdef FFTW_SINGLE
/* ST has to be separate due to the storage hack requiring reverse order */
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(x + ovs ) = v[2];
*(x + ovs + 1) = v[3];
*(x ) = v[0];
*(x + 1) = v[1];
}
#else
/* FFTW_DOUBLE */
# define ST STA
#endif
#ifdef FFTW_SINGLE
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
*(x ) = v0[0];
*(x + 1) = v1[0];
*(x + 2) = v2[0];
*(x + 3) = v3[0];
*(x + ovs ) = v0[1];
*(x + ovs + 1) = v1[1];
*(x + ovs + 2) = v2[1];
*(x + ovs + 3) = v3[1];
*(x + 2 * ovs ) = v0[2];
*(x + 2 * ovs + 1) = v1[2];
*(x + 2 * ovs + 2) = v2[2];
*(x + 2 * ovs + 3) = v3[2];
*(x + 3 * ovs ) = v0[3];
*(x + 3 * ovs + 1) = v1[3];
*(x + 3 * ovs + 2) = v2[3];
*(x + 3 * ovs + 3) = v3[3];
}
#define STM4(x, v, ovs, aligned_like) /* no-op */
#else
/* FFTW_DOUBLE */
#define STM2 STA
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
*(x) = v[0];
*(x+ovs) = v[1];
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
return (V){x[1],x[0],x[3],x[2]};
#else
return (V){x[1],x[0]};
#endif
}
static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
return (V){x[0],-x[1],x[2],-x[3]};
#else
return (V){x[0],-x[1]};
#endif
}
static inline V VBYI(V x)
{
x = VCONJ(x);
x = FLIP_RI(x);
return x;
}
/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_CEXP, v, x}, {TW_CEXP, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,333 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
*
* Generic256d added by Romain Dolbeau, and turned into simd-generic256.h
* with single & double precision by Erik Lindahl.
* Romain Dolbeau hereby places his modifications in the public domain.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "Generic simd256 only works in single or double precision"
#endif
#define SIMD_SUFFIX _generic_simd256 /* for renaming */
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define VDUPL(x) {x[0],x[0],x[2],x[2],x[4],x[4],x[6],x[6]}
# define VDUPH(x) {x[1],x[1],x[3],x[3],x[5],x[5],x[7],x[7]}
# define DVK(var, val) V var = {val,val,val,val,val,val,val,val}
#else
# define DS(d,s) d /* double-precision option */
# define VDUPL(x) {x[0],x[0],x[2],x[2]}
# define VDUPH(x) {x[1],x[1],x[3],x[3]}
# define DVK(var, val) V var = {val, val, val, val}
#endif
#define VL DS(2,4) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
typedef DS(double,float) V __attribute__ ((vector_size(32)));
#define VADD(a,b) ((a)+(b))
#define VSUB(a,b) ((a)-(b))
#define VMUL(a,b) ((a)*(b))
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
(void)aligned_like; /* UNUSED */
var[0] = x[0];
var[1] = x[1];
var[2] = x[ivs];
var[3] = x[ivs+1];
#ifdef FFTW_SINGLE
var[4] = x[2*ivs];
var[5] = x[2*ivs+1];
var[6] = x[3*ivs];
var[7] = x[3*ivs+1];
#endif
return var;
}
/* ST has to be separate due to the storage hack requiring reverse order */
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
#ifdef FFTW_SINGLE
*(x + 3*ovs ) = v[6];
*(x + 3*ovs + 1) = v[7];
*(x + 2*ovs ) = v[4];
*(x + 2*ovs + 1) = v[5];
*(x + ovs ) = v[2];
*(x + ovs + 1) = v[3];
*(x ) = v[0];
*(x + 1) = v[1];
#else
*(x + ovs ) = v[2];
*(x + ovs + 1) = v[3];
*(x ) = v[0];
*(x + 1) = v[1];
#endif
}
#ifdef FFTW_SINGLE
#define STM2(x, v, ovs, a) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
x[ 0] = v0[0];
x[ 1] = v0[1];
x[ 2] = v1[0];
x[ 3] = v1[1];
x[ ovs ] = v0[2];
x[ ovs + 1] = v0[3];
x[ ovs + 2] = v1[2];
x[ ovs + 3] = v1[3];
x[2*ovs ] = v0[4];
x[2*ovs + 1] = v0[5];
x[2*ovs + 2] = v1[4];
x[2*ovs + 3] = v1[5];
x[3*ovs ] = v0[6];
x[3*ovs + 1] = v0[7];
x[3*ovs + 2] = v1[6];
x[3*ovs + 3] = v1[7];
}
# define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
*(x ) = v0[0];
*(x + 1) = v1[0];
*(x + 2) = v2[0];
*(x + 3) = v3[0];
*(x + ovs ) = v0[1];
*(x + ovs + 1) = v1[1];
*(x + ovs + 2) = v2[1];
*(x + ovs + 3) = v3[1];
*(x + 2 * ovs ) = v0[2];
*(x + 2 * ovs + 1) = v1[2];
*(x + 2 * ovs + 2) = v2[2];
*(x + 2 * ovs + 3) = v3[2];
*(x + 3 * ovs ) = v0[3];
*(x + 3 * ovs + 1) = v1[3];
*(x + 3 * ovs + 2) = v2[3];
*(x + 3 * ovs + 3) = v3[3];
*(x + 4 * ovs ) = v0[4];
*(x + 4 * ovs + 1) = v1[4];
*(x + 4 * ovs + 2) = v2[4];
*(x + 4 * ovs + 3) = v3[4];
*(x + 5 * ovs ) = v0[5];
*(x + 5 * ovs + 1) = v1[5];
*(x + 5 * ovs + 2) = v2[5];
*(x + 5 * ovs + 3) = v3[5];
*(x + 6 * ovs ) = v0[6];
*(x + 6 * ovs + 1) = v1[6];
*(x + 6 * ovs + 2) = v2[6];
*(x + 6 * ovs + 3) = v3[6];
*(x + 7 * ovs ) = v0[7];
*(x + 7 * ovs + 1) = v1[7];
*(x + 7 * ovs + 2) = v2[7];
*(x + 7 * ovs + 3) = v3[7];
}
#else
/* FFTW_DOUBLE */
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs) {
*(x ) = v0[0];
*(x + 1) = v1[0];
*(x + 2) = v2[0];
*(x + 3) = v3[0];
*(x + ovs ) = v0[1];
*(x + ovs + 1) = v1[1];
*(x + ovs + 2) = v2[1];
*(x + ovs + 3) = v3[1];
*(x + 2 * ovs ) = v0[2];
*(x + 2 * ovs + 1) = v1[2];
*(x + 2 * ovs + 2) = v2[2];
*(x + 2 * ovs + 3) = v3[2];
*(x + 3 * ovs ) = v0[3];
*(x + 3 * ovs + 1) = v1[3];
*(x + 3 * ovs + 2) = v2[3];
*(x + 3 * ovs + 3) = v3[3];
}
#endif
static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
return (V){x[1],x[0],x[3],x[2],x[5],x[4],x[7],x[6]};
#else
return (V){x[1],x[0],x[3],x[2]};
#endif
}
static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
return (x * (V){1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0});
#else
return (x * (V){1.0,-1.0,1.0,-1.0});
#endif
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,461 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
*
* Knights Corner Vector Instruction support added by Romain Dolbeau.
* Romain Dolbeau hereby places his modifications in the public domain.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "Knights Corner vector instructions only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _ps
# define SCAL(x) x ## f
#else /* !FFTW_SINGLE */
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _pd
# define SCAL(x) x
#endif /* FFTW_SINGLE */
#define SIMD_SUFFIX _kcvi /* for renaming */
#define VL DS(4, 8) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
/* configuration; KNF 0 0 0 1 0 1 */
#define KCVI_VBYI_SINGLE_USE_MUL 0
#define KCVI_VBYI_DOUBLE_USE_MUL 0
#define KCVI_LD_DOUBLE_USE_UNPACK 1
#define KCVI_ST_DOUBLE_USE_PACK 1
#define KCVI_ST2_DOUBLE_USE_STN2 0
#define KCVI_MULZ_USE_SWIZZLE 1
#include <immintrin.h>
typedef DS(__m512d, __m512) V;
#define VADD(a,b) SUFF(_mm512_add)(a,b)
#define VSUB(a,b) SUFF(_mm512_sub)(a,b)
#define VMUL(a,b) SUFF(_mm512_mul)(a,b)
#define VFMA(a, b, c) SUFF(_mm512_fmadd)(a, b, c) //VADD(c, VMUL(a, b))
#define VFMS(a, b, c) SUFF(_mm512_fmsub)(a, b, c) //VSUB(VMUL(a, b), c)
#define VFNMS(a, b, c) SUFF(_mm512_fnmadd)(a, b, c) //VSUB(c, VMUL(a, b))
#define LDK(x) x
#define VLIT(re, im) SUFF(_mm512_setr4)(im, re, im, re)
#define DVK(var, val) V var = SUFF(_mm512_set1)(val)
static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
return SUFF(_mm512_load)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
SUFF(_mm512_store)(x, v);
}
#ifdef FFTW_SINGLE
#define VXOR(a,b) _mm512_xor_epi32(a,b)
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
6 * ivs + 1, 6 * ivs,
5 * ivs + 1, 5 * ivs,
4 * ivs + 1, 4 * ivs,
3 * ivs + 1, 3 * ivs,
2 * ivs + 1, 2 * ivs,
1 * ivs + 1, 1 * ivs,
0 * ivs + 1, 0 * ivs);
return _mm512_i32gather_ps(index, x, _MM_SCALE_4);
}
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
6 * ovs + 1, 6 * ovs,
5 * ovs + 1, 5 * ovs,
4 * ovs + 1, 4 * ovs,
3 * ovs + 1, 3 * ovs,
2 * ovs + 1, 2 * ovs,
1 * ovs + 1, 1 * ovs,
0 * ovs + 1, 0 * ovs);
_mm512_i32scatter_ps(x, index, v, _MM_SCALE_4);
}
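/* A scalar picture of the strided gather above (an illustrative sketch,
kept under #if 0; the _ref name is invented): complex element k is read
from x[k*ivs] and x[k*ivs + 1] for k = 0..7, and STu scatters the same
pattern back with ovs. */
#if 0
static void LDu_scalar_ref(float v[16], const float *x, INT ivs)
{
    int k;
    for (k = 0; k < 8; ++k) {
        v[2 * k]     = x[k * ivs];      /* real part */
        v[2 * k + 1] = x[k * ivs + 1];  /* imaginary part */
    }
}
#endif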
static inline V FLIP_RI(V x)
{
return (V)_mm512_shuffle_epi32((__m512i)x, _MM_PERM_CDAB);
}
#define VDUPH(a) ((V)_mm512_shuffle_epi32((__m512i)(a), _MM_PERM_DDBB))
#define VDUPL(a) ((V)_mm512_shuffle_epi32((__m512i)(a), _MM_PERM_CCAA))
#else /* !FFTW_SINGLE */
#define VXOR(a,b) _mm512_xor_epi64(a,b)
#if defined (KCVI_LD_DOUBLE_USE_UNPACK) && KCVI_LD_DOUBLE_USE_UNPACK
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
V temp;
/* no need for hq here */
temp = _mm512_mask_loadunpacklo_pd(temp, 0x0003, x + (0 * ivs));
temp = _mm512_mask_loadunpacklo_pd(temp, 0x000c, x + (1 * ivs));
temp = _mm512_mask_loadunpacklo_pd(temp, 0x0030, x + (2 * ivs));
temp = _mm512_mask_loadunpacklo_pd(temp, 0x00c0, x + (3 * ivs));
return temp;
}
#else
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__declspec(align(64)) R temp[8];
int i;
for (i = 0 ; i < 4 ; i++) {
temp[i*2] = x[i * ivs];
temp[i*2+1] = x[i * ivs + 1];
}
return _mm512_load_pd(temp);
}
#endif
#if defined(KCVI_ST_DOUBLE_USE_PACK) && KCVI_ST_DOUBLE_USE_PACK
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* no need for hq here */
_mm512_mask_packstorelo_pd(x + (0 * ovs), 0x0003, v);
_mm512_mask_packstorelo_pd(x + (1 * ovs), 0x000c, v);
_mm512_mask_packstorelo_pd(x + (2 * ovs), 0x0030, v);
_mm512_mask_packstorelo_pd(x + (3 * ovs), 0x00c0, v);
}
#else
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__declspec(align(64)) R temp[8];
int i;
_mm512_store_pd(temp, v);
for (i = 0 ; i < 4 ; i++) {
x[i * ovs] = temp[i*2];
x[i * ovs + 1] = temp[i*2+1];
}
}
#endif
static inline V FLIP_RI(V x)
{
return (V)_mm512_shuffle_epi32((__m512i)x, _MM_PERM_BADC);
}
#define VDUPH(a) ((V)_mm512_shuffle_epi32((__m512i)(a), _MM_PERM_DCDC))
#define VDUPL(a) ((V)_mm512_shuffle_epi32((__m512i)(a), _MM_PERM_BABA))
#endif /* FFTW_SINGLE */
#define LD LDu
#define ST STu
#ifdef FFTW_SINGLE
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(15 * ovs, 14 * ovs,
13 * ovs, 12 * ovs,
11 * ovs, 10 * ovs,
9 * ovs, 8 * ovs,
7 * ovs, 6 * ovs,
5 * ovs, 4 * ovs,
3 * ovs, 2 * ovs,
1 * ovs, 0 * ovs);
_mm512_i32scatter_ps(x, index, v, _MM_SCALE_4);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#else /* !FFTW_SINGLE */
#if defined(KCVI_ST2_DOUBLE_USE_STN2) && KCVI_ST2_DOUBLE_USE_STN2
#define STM2(x, v, ovs, a) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs) {
/* we start
AB CD EF GH -> *x (2 DBL), ovs between complex
IJ KL MN OP -> *(x+2) (2DBL), ovs between complex
and we want
ABIJ EFMN -> *x (4 DBL), 2 * ovs between complex pairs
CDKL GHOP -> *(x+ovs) (4DBL), 2 * ovs between complex pairs
*/
V x00 = (V)_mm512_mask_permute4f128_epi32((__m512i)v0, 0xF0F0, (__m512i)v1, _MM_PERM_CDAB);
V x01 = (V)_mm512_mask_permute4f128_epi32((__m512i)v1, 0x0F0F, (__m512i)v0, _MM_PERM_CDAB);
_mm512_mask_packstorelo_pd(x + (0 * ovs) + 0, 0x000F, x00);
/* _mm512_mask_packstorehi_pd(x + (0 * ovs) + 8, 0x000F, x00); */
_mm512_mask_packstorelo_pd(x + (2 * ovs) + 0, 0x00F0, x00);
/* _mm512_mask_packstorehi_pd(x + (2 * ovs) + 8, 0x00F0, x00); */
_mm512_mask_packstorelo_pd(x + (1 * ovs) + 0, 0x000F, x01);
/* _mm512_mask_packstorehi_pd(x + (1 * ovs) + 8, 0x000F, x01); */
_mm512_mask_packstorelo_pd(x + (3 * ovs) + 0, 0x00F0, x01);
/* _mm512_mask_packstorehi_pd(x + (3 * ovs) + 8, 0x00F0, x01); */
}
#else
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
#endif
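/* A scalar picture of the packed STN2 variant above (an illustrative
sketch, kept under #if 0): complex k of v0 and complex k of v1 land
next to each other at x + k*ovs, k = 0..3. */
#if 0
static void STN2_scalar_ref(double *x, const double v0[8], const double v1[8], INT ovs)
{
    int k;
    for (k = 0; k < 4; ++k) {
        x[k * ovs + 0] = v0[2 * k];      /* re from v0 */
        x[k * ovs + 1] = v0[2 * k + 1];  /* im from v0 */
        x[k * ovs + 2] = v1[2 * k];      /* re from v1 */
        x[k * ovs + 3] = v1[2 * k + 1];  /* im from v1 */
    }
}
#endif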
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0,
7 * ovs, 6 * ovs,
5 * ovs, 4 * ovs,
3 * ovs, 2 * ovs,
1 * ovs, 0 * ovs);
_mm512_i32loscatter_pd(x, index, v, _MM_SCALE_8);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#endif /* FFTW_SINGLE */
static inline V VFMAI(V b, V c) {
V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
return SUFF(_mm512_fmadd)(mpmp, SUFF(_mm512_swizzle)(b, _MM_SWIZ_REG_CDAB), c);
}
static inline V VFNMSI(V b, V c) {
V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
return SUFF(_mm512_fnmadd)(mpmp, SUFF(_mm512_swizzle)(b, _MM_SWIZ_REG_CDAB), c);
}
static inline V VFMACONJ(V b, V c) {
V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
return SUFF(_mm512_fmadd)(pmpm, b, c);
}
static inline V VFMSCONJ(V b, V c) {
V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
return SUFF(_mm512_fmsub)(pmpm, b, c);
}
static inline V VFNMSCONJ(V b, V c) {
V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
return SUFF(_mm512_fnmadd)(pmpm, b, c);
}
static inline V VCONJ(V x)
{
V pmpm = VLIT(SCAL(-0.0), SCAL(0.0));
return (V)VXOR((__m512i)pmpm, (__m512i)x);
}
#ifdef FFTW_SINGLE
#if defined(KCVI_VBYI_SINGLE_USE_MUL) && KCVI_VBYI_SINGLE_USE_MUL
/* untested */
static inline V VBYI(V x)
{
V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
return _mm512_mul_ps(mpmp, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
}
#else
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
#endif
#else /* !FFTW_SINGLE */
#if defined(KCVI_VBYI_DOUBLE_USE_MUL) && KCVI_VBYI_DOUBLE_USE_MUL
/* on KNF, using mul_pd is slower than shuf128x32 + xor */
static inline V VBYI(V x)
{
V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
return _mm512_mul_pd(mpmp, _mm512_swizzle_pd(x, _MM_SWIZ_REG_CDAB));
}
#else
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
#endif
#endif /* FFTW_SINGLE */
#if defined(KCVI_MULZ_USE_SWIZZLE) && KCVI_MULZ_USE_SWIZZLE
static inline V VZMUL(V tx, V sr) /* (a,b) (c,d) */
{
V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
V ad = SUFF(_mm512_mul)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB)); /* (a*d,b*c) */
V acmbd = SUFF(_mm512_sub)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (a*c-b*d, b*d-a*c) */
V res = SUFF(_mm512_mask_add)(acmbd, DS(0x00aa,0xaaaa), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* ([a*c+b*c] a*c-b*d, b*c+a*d) */
return res;
}
static inline V VZMULJ(V tx, V sr) /* (a,b) (c,d) */
{
V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
V ad = SUFF(_mm512_mul)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB)); /* (a*d,b*c) */
V acmbd = SUFF(_mm512_add)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (a*c+b*d, b*d+a*c) */
V res = SUFF(_mm512_mask_subr)(acmbd, DS(0x00aa,0xaaaa), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* ([a*c+b*c] a*c+b*d, a*d-b*c) */
return res;
}
static inline V VZMULI(V tx, V sr) /* (a,b) (c,d) */
{
DVK(zero, SCAL(0.0));
V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
V ad = SUFF(_mm512_fnmadd)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB), zero); /* (-a*d,-b*c) */
V acmbd = SUFF(_mm512_subr)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (b*d-a*c, a*c-b*d) */
V res = SUFF(_mm512_mask_add)(acmbd, DS(0x0055,0x5555), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* (-a*d-b*c, a*c-b*d) */
return res;
}
static inline V VZMULIJ(V tx, V sr) /* (a,b) (c,d) */
{
DVK(zero, SCAL(0.0));
V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
V ad = SUFF(_mm512_fnmadd)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB), zero); /* (-a*d,-b*c) */
V acmbd = SUFF(_mm512_add)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (b*d+a*c, a*c+b*d) */
V res = SUFF(_mm512_mask_sub)(acmbd, DS(0x0055,0x5555), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* (-a*d+b*c, a*c-b*d) */
return res;
}
#else
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
#endif
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v , x}, {TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
{TW_SIN, v , -x}, {TW_SIN, v , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v , x}, {TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v , -x}, {TW_SIN, v , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
/* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
/* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3(v,x) VTW1(v,x)
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v , x}, {TW_COS, v+1 , x}, {TW_COS, v+2 , x}, {TW_COS, v+3 , x}, \
{TW_COS, v+4 , x}, {TW_COS, v+5 , x}, {TW_COS, v+6 , x}, {TW_COS, v+7 , x}, \
{TW_COS, v+8 , x}, {TW_COS, v+9 , x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
{TW_SIN, v , x}, {TW_SIN, v+1 , x}, {TW_SIN, v+2 , x}, {TW_SIN, v+3 , x}, \
{TW_SIN, v+4 , x}, {TW_SIN, v+5 , x}, {TW_SIN, v+6 , x}, {TW_SIN, v+7 , x}, \
{TW_SIN, v+8 , x}, {TW_SIN, v+9 , x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}
#else /* !FFTW_SINGLE */
# define VTWS(v,x) \
{TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v , x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#endif /* FFTW_SINGLE */
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,335 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* Double-precision support added by Romain Dolbeau.
* Romain Dolbeau hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if !defined(FFTW_SINGLE) && !defined(__aarch64__)
#error "NEON only works in single precision on 32-bit ARM"
#endif
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "NEON only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _f32
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _f64
#endif
/* define these unconditionally, because they are used by
taint.c which is compiled without neon */
#define SIMD_SUFFIX _neon /* for renaming */
#define VL DS(1,2) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__ARM_NEON__) && !defined(__ARM_NEON)
#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
#endif
#include <arm_neon.h>
/* FIXME: I am not sure whether this code assumes little-endian
ordering. VLIT may or may not be wrong for big-endian systems. */
typedef DS(float64x2_t, float32x4_t) V;
#ifdef FFTW_SINGLE
# define VLIT(x0, x1) {x0, x1, x0, x1}
#else
# define VLIT(x0, x1) {x0, x1}
#endif
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val)
/* NEON has FMA, but a three-operand FMA is not too useful
for FFT purposes. We normally compute
t0=a+b*c
t1=a-b*c
In a three-operand instruction set this translates into
t0=a
t0+=b*c
t1=a
t1-=b*c
At least one move is needed, negating the advantage of the FMA in the
first place. At least some versions of gcc generate both moves, so we
are better off generating t=b*c; t0=a+t; t1=a-t. */
#if ARCH_PREFERS_FMA
#warning "--enable-fma on NEON is probably a bad idea (see source code)"
#endif
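/* A scalar sketch of the trade-off described above (illustrative, kept
under #if 0): computing b*c once avoids any extra register copy of a. */
#if 0
static void butterfly_ref(double a, double b, double c, double *t0, double *t1)
{
    double t = b * c;   /* one multiply shared by both outputs */
    *t0 = a + t;
    *t1 = a - t;
}
#endif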
#define VADD(a, b) SUFF(vaddq)(a, b)
#define VSUB(a, b) SUFF(vsubq)(a, b)
#define VMUL(a, b) SUFF(vmulq)(a, b)
#define VFMA(a, b, c) SUFF(vmlaq)(c, a, b) /* a*b+c */
#define VFNMS(a, b, c) SUFF(vmlsq)(c, a, b) /* FNMS=-(a*b-c) in powerpc terminology; MLS=c-a*b
in ARM terminology */
#define VFMS(a, b, c) VSUB(VMUL(a, b), c) /* FMS=a*b-c in powerpc terminology; no equivalent
arm instruction (?) */
#define STOREH(a, v) SUFF(vst1)((a), SUFF(vget_high)(v))
#define STOREL(a, v) SUFF(vst1)((a), SUFF(vget_low)(v))
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
return SUFF(vld1q)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
SUFF(vst1q)(x, v);
}
#ifdef FFTW_SINGLE
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
return SUFF(vcombine)(SUFF(vld1)(x), SUFF(vld1)((x + ivs)));
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon store-low occurring
after store-high */
STOREH(x + ovs, v);
STOREL(x,v);
}
#else /* !FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
/* 2x2 complex transpose and store */
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
/* store and 4x4 real transpose */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
SUFF(vst1_lane)((x) , SUFF(vget_low)(v), 0);
SUFF(vst1_lane)((x + ovs), SUFF(vget_low)(v), 1);
SUFF(vst1_lane)((x + 2 * ovs), SUFF(vget_high)(v), 0);
SUFF(vst1_lane)((x + 3 * ovs), SUFF(vget_high)(v), 1);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* use STM4 */
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
STOREL(x, v);
STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
#ifdef FFTW_SINGLE
#define FLIP_RI(x) SUFF(vrev64q)(x)
#else
/* FIXME */
#define FLIP_RI(x) SUFF(vcombine)(SUFF(vget_high)(x), SUFF(vget_low)(x))
#endif
static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
#else
static const uint64x2_t pm = {0, 0x8000000000000000ull};
/* Gcc-4.9.2 still does not include vreinterpretq_f64_u64, but simple
* casts generate the correct assembly.
*/
return (float64x2_t)(veorq_u64((uint64x2_t)(x), (uint64x2_t)(pm)));
#endif
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
static inline V VFMAI(V b, V c)
{
const V mp = VLIT(-1.0, 1.0);
return VFMA(FLIP_RI(b), mp, c);
}
static inline V VFNMSI(V b, V c)
{
const V mp = VLIT(-1.0, 1.0);
return VFNMS(FLIP_RI(b), mp, c);
}
static inline V VFMACONJ(V b, V c)
{
const V pm = VLIT(1.0, -1.0);
return VFMA(b, pm, c);
}
static inline V VFNMSCONJ(V b, V c)
{
const V pm = VLIT(1.0, -1.0);
return VFNMS(b, pm, c);
}
static inline V VFMSCONJ(V b, V c)
{
return VSUB(VCONJ(b), c);
}
#ifdef FFTW_SINGLE
#if 1
#define VEXTRACT_REIM(tr, ti, tx) \
{ \
tr = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 0), \
SUFF(vdup_lane)(SUFF(vget_high)(tx), 0)); \
ti = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 1), \
SUFF(vdup_lane)(SUFF(vget_high)(tx), 1)); \
}
#else
/* this alternative might be faster in an ideal world, but gcc likes
to spill VVV onto the stack */
#define VEXTRACT_REIM(tr, ti, tx) \
{ \
float32x4x2_t vvv = SUFF(vtrnq)(tx, tx); \
tr = vvv.val[0]; \
ti = vvv.val[1]; \
}
#endif
#else
#define VEXTRACT_REIM(tr, ti, tx) \
{ \
tr = SUFF(vtrn1q)(tx, tx); \
ti = SUFF(vtrn2q)(tx, tx); \
}
#endif
static inline V VZMUL(V tx, V sr)
{
V tr, ti;
VEXTRACT_REIM(tr, ti, tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr, ti;
VEXTRACT_REIM(tr, ti, tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr, ti;
VEXTRACT_REIM(tr, ti, tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr, ti;
VEXTRACT_REIM(tr, ti, tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
#define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
#define VTW1(v,x) {TW_CEXP, v, x}
#endif
#define TWVL1 VL
static inline V BYTW1(const R *t, V sr)
{
V tx = LDA(t, 2, 0);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LDA(t, 2, 0);
return VZMULJ(tx, sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
V si = FLIP_RI(sr);
V tr = LDA(t, 2, 0), ti = LDA(t+2*VL, 2, 0);
return VFMA(ti, si, VMUL(tr, sr));
}
static inline V BYTWJ2(const R *t, V sr)
{
V si = FLIP_RI(sr);
V tr = LDA(t, 2, 0), ti = LDA(t+2*VL, 2, 0);
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
# define VTW3(v,x) {TW_CEXP, v, x}
#endif
# define TWVL3 (VL)
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,380 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "SSE/SSE2 only works in single/double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _sse2 /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(FFTW_SINGLE) && !defined(__SSE2__)
# error "compiling simd-sse2.h in double precision without -msse2"
#elif defined(__GNUC__) && defined(FFTW_SINGLE) && !defined(__SSE__)
# error "compiling simd-sse2.h in single precision without -msse"
#endif
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
/* some versions of glibc's sys/cdefs.h define __inline to be empty,
which is wrong because emmintrin.h defines several inline
procedures */
#ifndef _MSC_VER
#undef __inline
#endif
#ifdef FFTW_SINGLE
# include <xmmintrin.h>
#else
# include <emmintrin.h>
#endif
typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(UNPCKL(x, x), SHUF(x, x, SHUFVALS(0, 0, 2, 2)))
#define VDUPH(x) DS(UNPCKH(x, x), SHUF(x, x, SHUFVALS(1, 1, 3, 3)))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))
#ifdef __GNUC__
/*
* gcc-3.3 generates slow code for mm_set_ps (write all elements to
* the stack and load __m128 from the stack).
*
* gcc-3.[34] generates slow code for mm_set_ps1 (load into low element
* and shuffle).
*
* This hack forces gcc to generate a constant __m128 at compile time.
*/
union rvec {
R r[DS(2,4)];
V v;
};
# ifdef FFTW_SINGLE
# define DVK(var, val) V var = __extension__ ({ \
static const union rvec _var = { {val,val,val,val} }; _var.v; })
# else
# define DVK(var, val) V var = __extension__ ({ \
static const union rvec _var = { {val,val} }; _var.v; })
# endif
# define LDK(x) x
#else
# define DVK(var, val) const R var = K(val)
# define LDK(x) DS(_mm_set1_pd,_mm_set_ps1)(x)
#endif
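/* Illustrative use of the DVK/LDK pair defined above, in the style of
the generated codelets (a sketch kept under #if 0; the constant name
follows the codelet convention for 0.5 but is written by hand here): */
#if 0
static inline V scale_by_half_example(V x)
{
    DVK(KP500000000, 0.5);
    return VMUL(LDK(KP500000000), x);
}
#endif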
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
#ifdef FFTW_SINGLE
# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
'name' used" and the runtime checks for using a variable before it is
defined, which are erroneously triggered by the LOADL0 / LOADH macros
because each of them modifies only part of VAL. */
# ifndef __INTEL_COMPILER
# pragma warning(disable : 4700)
# pragma runtime_checks("u", off)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(disable : 592)
# endif
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
(void)aligned_like; /* UNUSED */
# ifdef __GNUC__
/* We use inline asm because gcc-3.x generates slow code for
_mm_loadh_pi(). gcc-3.x insists upon having an existing variable for
VAL, which is however never used. Thus, it generates code to move
values in and out the variable. Worse still, gcc-4.0 stores VAL on
the stack, causing valgrind to complain about uninitialized reads. */
__asm__("movlps %1, %0\n\tmovhps %2, %0"
: "=x"(var) : "m"(x[0]), "m"(x[ivs]));
# else
# define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
# define LOADL0(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
var = LOADL0(x, var);
var = LOADH(x + ivs, var);
# endif
return var;
}
# ifdef _MSC_VER
# ifndef __INTEL_COMPILER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(default : 592)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + ovs, v);
STOREL(x, v);
}
#else /* ! FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." 3 parameters ought to be enough
for anybody. */
# define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = UNPCKL(v0, v2); \
xxx1 = UNPCKH(v0, v2); \
xxx2 = UNPCKL(v1, v3); \
xxx3 = UNPCKH(v1, v3); \
STA(x, UNPCKL(xxx0, xxx2), 0, 0); \
STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0); \
STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \
STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
STOREL(x, v);
STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
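/* A scalar picture of the single-precision STN4 above (an illustrative
sketch, kept under #if 0): it performs a 4x4 real transpose, so output
row i receives element i of each input vector. */
#if 0
static void STN4_scalar_ref(float *x, const float v[4][4], INT ovs)
{
    int i, j;
    for (i = 0; i < 4; ++i)
        for (j = 0; j < 4; ++j)
            x[i * ovs + j] = v[j][i];
}
#endif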
static inline V FLIP_RI(V x)
{
return SHUF(x, x, DS(1, SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* This will produce -0.0f (or -0.0d) even on broken
compilers that do not distinguish +0.0 from -0.0.
I bet some are still around. */
union uvec {
unsigned u[4];
V v;
};
/* it looks like gcc-3.3.5 produces slow code unless PM is
declared static. */
static const union uvec pm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pm.v, x);
}
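/* The scalar idea behind the XOR above (an illustrative sketch, kept
under #if 0): toggling the sign bit negates a value with pure integer
operations, so the -0.0 mask works even if the compiler cannot tell
+0.0 from -0.0. */
#if 0
#include <stdint.h>
#include <string.h>
static double negate_via_signbit(double x)
{
    uint64_t u;
    memcpy(&u, &x, sizeof u);
    u ^= UINT64_C(0x8000000000000000);  /* flip bit 63, the sign bit */
    memcpy(&x, &u, sizeof u);
    return x;
}
#endif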
static inline V VBYI(V x)
{
x = VCONJ(x);
x = FLIP_RI(x);
return x;
}
/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V BYTWJ1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,299 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* VSX SIMD implementation added 2015 Erik Lindahl.
* Erik Lindahl places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "VSX only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _vsx /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#include <altivec.h>
#include <stdio.h>
typedef DS(vector double,vector float) V;
#define VADD(a,b) vec_add(a,b)
#define VSUB(a,b) vec_sub(a,b)
#define VMUL(a,b) vec_mul(a,b)
#define VXOR(a,b) vec_xor(a,b)
#define UNPCKL(a,b) vec_mergel(a,b)
#define UNPCKH(a,b) vec_mergeh(a,b)
#ifdef FFTW_SINGLE
# define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11}; vec_perm(a,a,perm); })
# define VDUPH(a) ({ const vector unsigned char perm = {4,5,6,7,4,5,6,7,12,13,14,15,12,13,14,15}; vec_perm(a,a,perm); })
#else
# define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}; vec_perm(a,a,perm); })
# define VDUPH(a) ({ const vector unsigned char perm = {8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15}; vec_perm(a,a,perm); })
#endif
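/* The net effect of the permutations above, in scalar form (an
illustrative sketch, kept under #if 0), single precision: VDUPL
broadcasts the real part of each complex number, VDUPH the imaginary
part. */
#if 0
static void VDUPL_scalar_ref(float dst[4], const float a[4])
{
    dst[0] = a[0]; dst[1] = a[0];   /* re of first complex, duplicated */
    dst[2] = a[2]; dst[3] = a[2];   /* re of second complex, duplicated */
}
#endif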
static inline V LDK(R f) { return vec_splats(f); }
#define DVK(var, val) const R var = K(val)
static inline V VCONJ(V x)
{
const V pmpm = vec_mergel(vec_splats((R)0.0),-(vec_splats((R)0.0)));
return vec_xor(x, pmpm);
}
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
#ifdef __ibmxl__
return vec_xl(0,(DS(double,float) *)x);
#else
return (*(const V *)(x));
#endif
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
#ifdef __ibmxl__
vec_xst(v,0,x);
#else
*(V *)x = v;
#endif
}
static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
const vector unsigned char perm = { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11 };
#else
const vector unsigned char perm = { 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 };
#endif
return vec_perm(x,x,perm);
}
#ifdef FFTW_SINGLE
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
const vector unsigned char perm = {0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
return vec_perm((vector float)vec_splats(*(double *)(x)),
(vector float)vec_splats(*(double *)(x+ivs)),perm);
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
*(double *)(x+ovs) = vec_extract( (vector double)v, 1 );
*(double *)x = vec_extract( (vector double)v, 0 );
}
#else
/* DOUBLE */
# define LD LDA
# define ST STA
#endif
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
V xxx0, xxx1, xxx2, xxx3;
xxx0 = vec_mergeh(v0,v1);
xxx1 = vec_mergel(v0,v1);
xxx2 = vec_mergeh(v2,v3);
xxx3 = vec_mergel(v2,v3);
*(double *)x = vec_extract( (vector double)xxx0, 0 );
*(double *)(x+ovs) = vec_extract( (vector double)xxx0, 1 );
*(double *)(x+2*ovs) = vec_extract( (vector double)xxx1, 0 );
*(double *)(x+3*ovs) = vec_extract( (vector double)xxx1, 1 );
*(double *)(x+2) = vec_extract( (vector double)xxx2, 0 );
*(double *)(x+ovs+2) = vec_extract( (vector double)xxx2, 1 );
*(double *)(x+2*ovs+2) = vec_extract( (vector double)xxx3, 0 );
*(double *)(x+3*ovs+2) = vec_extract( (vector double)xxx3, 1 );
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
x[0] = vec_extract(v,0);
x[ovs] = vec_extract(v,1);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
static inline V VBYI(V x)
{
/* FIXME [matteof 2017-09-21] It is possible to use vpermxor(),
but gcc and xlc treat the permutation bits differently, and
gcc-6 seems to generate incorrect code when using
__builtin_crypto_vpermxor() (i.e., VBYI() works for a small
test case but fails in the large).
Punt on vpermxor() for now and do the simple thing.
*/
return FLIP_RI(VCONJ(x));
}
/* FMA support */
#define VFMA(a, b, c) vec_madd(a,b,c)
#define VFNMS(a, b, c) vec_nmsub(a,b,c)
#define VFMS(a, b, c) vec_msub(a,b,c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LDA(t,0,t);
V tr = UNPCKH(tx, tx);
V ti = UNPCKL(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LDA(t,0,t);
V tr = UNPCKH(tx, tx);
V ti = UNPCKL(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
V si = FLIP_RI(sr);
V ti = LDA(t+2*VL,0,t);
V tt = VMUL(ti, si);
V tr = LDA(t,0,t);
return VFMA(tr, sr, tt);
}
static inline V BYTWJ2(const R *t, V sr)
{
V si = FLIP_RI(sr);
V tr = LDA(t,0,t);
V tt = VMUL(tr, sr);
V ti = LDA(t+2*VL,0,t);
return VFNMS(ti, si, tt);
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,89 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
#else
# define DS(d,s) d /* double-precision option */
#endif
#if HAVE_SSE2
# if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
int X(have_simd_sse2)(void)
{
return 1;
}
# else /* !x86_64 */
# include <signal.h>
# include <setjmp.h>
# include "x86-cpuid.h"
static jmp_buf jb;
static void sighandler(int x)
{
UNUSED(x);
longjmp(jb, 1);
}
static int sse2_works(void)
{
void (*oldsig)(int);
oldsig = signal(SIGILL, sighandler);
if (setjmp(jb)) {
signal(SIGILL, oldsig);
return 0;
} else {
# ifdef _MSC_VER
_asm { DS(xorpd,xorps) xmm0,xmm0 }
# else
/* asm volatile ("xorpd/s %xmm0, %xmm0"); */
asm volatile(DS(".byte 0x66; .byte 0x0f; .byte 0x57; .byte 0xc0",
".byte 0x0f; .byte 0x57; .byte 0xc0"));
# endif
signal(SIGILL, oldsig);
return 1;
}
}
int X(have_simd_sse2)(void)
{
static int init = 0, res;
if (!init) {
res = !is_386()
&& has_cpuid()
&& (cpuid_edx(1) & (1 << DS(26,25)))
&& sse2_works();
init = 1;
}
return res;
}
# endif
#endif
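/* For reference (illustrative, kept under #if 0): the feature bits
selected by DS(26,25) above are the CPUID leaf-1 EDX bits for SSE2 and
SSE respectively. */
#if 0
#define CPUID_EDX_SSE_BIT  25   /* single precision needs SSE  */
#define CPUID_EDX_SSE2_BIT 26   /* double precision needs SSE2 */
#endif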

View File

@@ -0,0 +1,43 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#include "simd-common.h"
#if HAVE_SIMD
R *X(taint)(R *p, INT s)
{
if (((unsigned)s * sizeof(R)) % ALIGNMENT)
p = (R *) (PTRINT(p) | TAINT_BIT);
if (((unsigned)s * sizeof(R)) % ALIGNMENTA)
p = (R *) (PTRINT(p) | TAINT_BITA);
return p;
}
/* join the taint of two pointers that are supposed to be
the same modulo the taint */
R *X(join_taint)(R *p1, R *p2)
{
A(UNTAINT(p1) == UNTAINT(p2));
return (R *)(PTRINT(p1) | PTRINT(p2));
}
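/* An illustrative sketch of the low-bit tainting idea (kept under
#if 0; it assumes TAINT_BIT and TAINT_BITA are the two low-order
pointer bits, which genuinely aligned arrays never have set): */
#if 0
#include <stdint.h>
static double *untaint_sketch(double *p)
{
    return (double *)((uintptr_t)p & ~(uintptr_t)3);  /* clear both flag bits */
}
static int is_tainted_sketch(const double *p)
{
    return ((uintptr_t)p & 3) != 0;                   /* either flag set? */
}
#endif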
#endif

View File

@@ -0,0 +1,69 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* VSX SIMD implementation added 2015 Erik Lindahl.
* Erik Lindahl places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_VSX
#if HAVE_SYS_SYSCTL_H
# include <sys/sysctl.h>
#endif
#include <signal.h>
#include <setjmp.h>
static jmp_buf jb;
static void sighandler(int x)
{
longjmp(jb, 1);
}
static int really_have_vsx(void)
{
void (*oldsig)(int);
oldsig = signal(SIGILL, sighandler);
if (setjmp(jb)) {
signal(SIGILL, oldsig);
return 0;
} else {
float mem[2];
__asm__ __volatile__ ("stxsdx 0,0,%0" :: "r" (mem) : "memory" );
signal(SIGILL, oldsig);
return 1;
}
return 0;
}
int X(have_simd_vsx)(void)
{
static int init = 0, res;
if (!init) {
res = really_have_vsx();
init = 1;
}
return res;
}
#endif

View File

@@ -0,0 +1,212 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* this code was kindly donated by Eric J. Korpela */
#ifdef _MSC_VER
#include <intrin.h>
#ifndef inline
#define inline __inline
#endif
#endif
static inline int is_386()
{
#ifdef _MSC_VER
unsigned int result,tst;
_asm {
pushfd
pop eax
mov edx,eax
xor eax,40000h
push eax
popfd
pushfd
pop eax
push edx
popfd
mov tst,edx
mov result,eax
}
#else
register unsigned int result,tst;
__asm__ (
"pushfl\n\t"
"popl %0\n\t"
"movl %0,%1\n\t"
"xorl $0x40000,%0\n\t"
"pushl %0\n\t"
"popfl\n\t"
"pushfl\n\t"
"popl %0\n\t"
"pushl %1\n\t"
"popfl"
: "=r" (result), "=r" (tst) /* output */
: /* no inputs */
);
#endif
return (result == tst);
}
static inline int has_cpuid()
{
#ifdef _MSC_VER
unsigned int result,tst;
_asm {
pushfd
pop eax
mov edx,eax
xor eax,200000h
push eax
popfd
pushfd
pop eax
push edx
popfd
mov tst,edx
mov result,eax
}
#else
register unsigned int result,tst;
__asm__ (
"pushfl\n\t"
"pop %0\n\t"
"movl %0,%1\n\t"
"xorl $0x200000,%0\n\t"
"pushl %0\n\t"
"popfl\n\t"
"pushfl\n\t"
"popl %0\n\t"
"pushl %1\n\t"
"popfl"
: "=r" (result), "=r" (tst) /* output */
: /* no inputs */
);
#endif
return (result != tst);
}
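/* For reference (illustrative, kept under #if 0): the two EFLAGS masks
toggled above. */
#if 0
#define EFLAGS_AC 0x040000   /* bit 18, alignment check: not writable on a 386 */
#define EFLAGS_ID 0x200000   /* bit 21, writable only when CPUID is available */
#endif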
/* cpuid version to get all registers. Donated by Erik Lindahl from Gromacs. */
static inline void
cpuid_all(int level, int ecxval, int *eax, int *ebx, int *ecx, int *edx)
{
#if (defined _MSC_VER)
int CPUInfo[4];
# if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
/* MSVC 9.0 SP1 or later */
__cpuidex(CPUInfo, level, ecxval);
# else
__cpuid(CPUInfo, level);
/* Without __cpuidex we cannot honor a non-zero ecxval here; the requested subleaf is ignored */
# endif
*eax = CPUInfo[0];
*ebx = CPUInfo[1];
*ecx = CPUInfo[2];
*edx = CPUInfo[3];
#else
/* Not MSVC */
*eax = level;
*ecx = ecxval;
*ebx = 0;
*edx = 0;
/* Avoid clobbering global offset table in 32-bit pic code (ebx) */
# if defined(__PIC__)
__asm__ ("xchgl %%ebx, %1 \n\t"
"cpuid \n\t"
"xchgl %%ebx, %1 \n\t"
: "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx));
# else
/* No need to save ebx if we are not in pic mode */
__asm__ ("cpuid \n\t"
: "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
# endif
#endif
}
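/* An illustrative use of cpuid_all() (a sketch kept under #if 0): query
the structured extended feature leaf 7, subleaf 0, whose EBX bit 5
reports AVX2. */
#if 0
static inline int cpu_reports_avx2(void)
{
    int eax, ebx, ecx, edx;
    cpuid_all(7, 0, &eax, &ebx, &ecx, &edx);
    return (ebx >> 5) & 1;
}
#endif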
static inline int cpuid_edx(int op)
{
# ifdef _MSC_VER
int result;
_asm {
push ebx
mov eax,op
cpuid
mov result,edx
pop ebx
}
return result;
# else
int eax, ecx, edx;
__asm__("push %%ebx\n\tcpuid\n\tpop %%ebx"
: "=a" (eax), "=c" (ecx), "=d" (edx)
: "a" (op));
return edx;
# endif
}
static inline int cpuid_ecx(int op)
{
# ifdef _MSC_VER
int result;
_asm {
push ebx
mov eax,op
cpuid
mov result,ecx
pop ebx
}
return result;
# else
int eax, ecx, edx;
__asm__("push %%ebx\n\tcpuid\n\tpop %%ebx"
: "=a" (eax), "=c" (ecx), "=d" (edx)
: "a" (op));
return ecx;
# endif
}
static inline int xgetbv_eax(int op)
{
# ifdef _MSC_VER
int veax, vedx;
_asm {
mov ecx,op
# if defined(__INTEL_COMPILER) || (_MSC_VER >= 1600)
xgetbv
# else
__emit 15
__emit 1
__emit 208
# endif
mov veax,eax
mov vedx,edx
}
return veax;
# else
int eax, edx;
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (op));
return eax;
#endif
}
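/* An illustrative combination of cpuid_ecx() and xgetbv_eax() (a sketch
kept under #if 0), the usual AVX availability check: besides the CPU
flags, the OS must have enabled XMM and YMM state saving (XCR0 bits 1
and 2). */
#if 0
static inline int os_supports_avx_sketch(void)
{
    return (cpuid_ecx(1) & (1 << 27))       /* OSXSAVE */
        && (cpuid_ecx(1) & (1 << 28))       /* AVX */
        && ((xgetbv_eax(0) & 0x6) == 0x6);  /* XMM + YMM state enabled */
}
#endif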