2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions

View File: simd-support/Makefile.am

@@ -0,0 +1,15 @@
AM_CPPFLAGS = -I $(top_srcdir)
noinst_LTLIBRARIES = libsimd_support.la
libsimd_support_la_SOURCES = taint.c simd-common.h \
x86-cpuid.h amd64-cpuid.h \
simd-sse2.h sse2.c \
avx.c simd-avx.h \
avx-128-fma.c simd-avx-128-fma.h \
avx2.c simd-avx2.h simd-avx2-128.h \
avx512.c simd-avx512.h \
kcvi.c simd-kcvi.h \
altivec.c simd-altivec.h vsx.c simd-vsx.h \
neon.c simd-neon.h \
simd-generic128.h simd-generic256.h

View File: simd-support/Makefile.in

@@ -0,0 +1,680 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
VPATH = @srcdir@
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
subdir = simd-support
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
libsimd_support_la_LIBADD =
am_libsimd_support_la_OBJECTS = taint.lo sse2.lo avx.lo avx-128-fma.lo \
avx2.lo avx512.lo kcvi.lo altivec.lo vsx.lo neon.lo
libsimd_support_la_OBJECTS = $(am_libsimd_support_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__maybe_remake_depfiles = depfiles
am__depfiles_remade = ./$(DEPDIR)/altivec.Plo \
./$(DEPDIR)/avx-128-fma.Plo ./$(DEPDIR)/avx.Plo \
./$(DEPDIR)/avx2.Plo ./$(DEPDIR)/avx512.Plo \
./$(DEPDIR)/kcvi.Plo ./$(DEPDIR)/neon.Plo ./$(DEPDIR)/sse2.Plo \
./$(DEPDIR)/taint.Plo ./$(DEPDIR)/vsx.Plo
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
SOURCES = $(libsimd_support_la_SOURCES)
DIST_SOURCES = $(libsimd_support_la_SOURCES)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
AM_CPPFLAGS = -I $(top_srcdir)
noinst_LTLIBRARIES = libsimd_support.la
libsimd_support_la_SOURCES = taint.c simd-common.h \
x86-cpuid.h amd64-cpuid.h \
simd-sse2.h sse2.c \
avx.c simd-avx.h \
avx-128-fma.c simd-avx-128-fma.h \
avx2.c simd-avx2.h simd-avx2-128.h \
avx512.c simd-avx512.h \
kcvi.c simd-kcvi.h \
altivec.c simd-altivec.h vsx.c simd-vsx.h \
neon.c simd-neon.h \
simd-generic128.h simd-generic256.h
all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu simd-support/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu simd-support/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
esac;
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
clean-noinstLTLIBRARIES:
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
@list='$(noinst_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
libsimd_support.la: $(libsimd_support_la_OBJECTS) $(libsimd_support_la_DEPENDENCIES) $(EXTRA_libsimd_support_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) $(libsimd_support_la_OBJECTS) $(libsimd_support_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/altivec.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx-128-fma.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/avx512.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kcvi.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/neon.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sse2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/taint.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vsx.Plo@am__quote@ # am--include-marker
$(am__depfiles_remade):
@$(MKDIR_P) $(@D)
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
am--depfiles: $(am__depfiles_remade)
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-am
TAGS: tags
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-am
CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-am
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) distdir-am
distdir-am: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
check-am: all-am
check: check-am
all-am: Makefile $(LTLIBRARIES)
installdirs:
install: install-am
install-exec: install-exec-am
install-data: install-data-am
uninstall: uninstall-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-am
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
mostlyclean-generic:
clean-generic:
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
clean: clean-am
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
mostlyclean-am
distclean: distclean-am
-rm -f ./$(DEPDIR)/altivec.Plo
-rm -f ./$(DEPDIR)/avx-128-fma.Plo
-rm -f ./$(DEPDIR)/avx.Plo
-rm -f ./$(DEPDIR)/avx2.Plo
-rm -f ./$(DEPDIR)/avx512.Plo
-rm -f ./$(DEPDIR)/kcvi.Plo
-rm -f ./$(DEPDIR)/neon.Plo
-rm -f ./$(DEPDIR)/sse2.Plo
-rm -f ./$(DEPDIR)/taint.Plo
-rm -f ./$(DEPDIR)/vsx.Plo
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
dvi: dvi-am
dvi-am:
html: html-am
html-am:
info: info-am
info-am:
install-data-am:
install-dvi: install-dvi-am
install-dvi-am:
install-exec-am:
install-html: install-html-am
install-html-am:
install-info: install-info-am
install-info-am:
install-man:
install-pdf: install-pdf-am
install-pdf-am:
install-ps: install-ps-am
install-ps-am:
installcheck-am:
maintainer-clean: maintainer-clean-am
-rm -f ./$(DEPDIR)/altivec.Plo
-rm -f ./$(DEPDIR)/avx-128-fma.Plo
-rm -f ./$(DEPDIR)/avx.Plo
-rm -f ./$(DEPDIR)/avx2.Plo
-rm -f ./$(DEPDIR)/avx512.Plo
-rm -f ./$(DEPDIR)/kcvi.Plo
-rm -f ./$(DEPDIR)/neon.Plo
-rm -f ./$(DEPDIR)/sse2.Plo
-rm -f ./$(DEPDIR)/taint.Plo
-rm -f ./$(DEPDIR)/vsx.Plo
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-am
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-am
pdf-am:
ps: ps-am
ps-am:
uninstall-am:
.MAKE: install-am install-strip
.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
clean-generic clean-libtool clean-noinstLTLIBRARIES \
cscopelist-am ctags ctags-am distclean distclean-compile \
distclean-generic distclean-libtool distclean-tags distdir dvi \
dvi-am html html-am info info-am install install-am \
install-data install-data-am install-dvi install-dvi-am \
install-exec install-exec-am install-html install-html-am \
install-info install-info-am install-man install-pdf \
install-pdf-am install-ps install-ps-am install-strip \
installcheck installcheck-am installdirs maintainer-clean \
maintainer-clean-generic mostlyclean mostlyclean-compile \
mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
tags tags-am uninstall uninstall-am
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

View File: simd-support/altivec.c

@@ -0,0 +1,80 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_ALTIVEC
#if HAVE_SYS_SYSCTL_H
# include <sys/sysctl.h>
#endif
#if HAVE_SYS_SYSCTL_H && HAVE_SYSCTL && defined(CTL_HW) && defined(HW_VECTORUNIT)
/* code for darwin */
static int really_have_altivec(void)
{
int mib[2], altivecp;
size_t len;
mib[0] = CTL_HW;
mib[1] = HW_VECTORUNIT;
len = sizeof(altivecp);
sysctl(mib, 2, &altivecp, &len, NULL, 0);
return altivecp;
}
#else /* GNU/Linux and other non-Darwin systems (!HAVE_SYS_SYSCTL_H etc.) */
#include <signal.h>
#include <setjmp.h>
static jmp_buf jb;
static void sighandler(int x)
{
longjmp(jb, 1);
}
static int really_have_altivec(void)
{
void (*oldsig)(int);
oldsig = signal(SIGILL, sighandler);
if (setjmp(jb)) {
signal(SIGILL, oldsig);
return 0;
} else {
__asm__ __volatile__ (".long 0x10000484"); /* vor 0,0,0 */
signal(SIGILL, oldsig);
return 1;
}
return 0;
}
#endif
int X(have_simd_altivec)(void)
{
static int init = 0, res;
if (!init) {
res = really_have_altivec();
init = 1;
}
return res;
}
#endif
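
The non-Darwin branch above probes for AltiVec by executing a vector instruction under a temporary SIGILL handler and recovering with longjmp if the CPU traps. A standalone sketch of that trap-and-recover pattern (illustration only, not FFTW code; the probed instruction here is a plain nop, so it never actually faults):

/* Standalone sketch of the SIGILL-probe pattern used above (not FFTW code). */
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static jmp_buf probe_jb;

static void probe_sigill(int sig)
{
    (void)sig;
    longjmp(probe_jb, 1);
}

/* Run a candidate instruction; report 0 if it raised SIGILL, 1 otherwise. */
static int can_execute(void (*candidate)(void))
{
    void (*old)(int) = signal(SIGILL, probe_sigill);
    int ok;
    if (setjmp(probe_jb)) {
        ok = 0;                /* SIGILL was delivered: not supported */
    } else {
        candidate();           /* may fault on CPUs lacking the feature */
        ok = 1;
    }
    signal(SIGILL, old);
    return ok;
}

static void try_insn(void) { __asm__ __volatile__ ("nop"); }

int main(void)
{
    printf("probe result: %d\n", can_execute(try_insn));
    return 0;
}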

View File: simd-support/amd64-cpuid.h

@@ -0,0 +1,148 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#ifdef _MSC_VER
#include <intrin.h>
#if (_MSC_VER >= 1600) && !defined(__INTEL_COMPILER)
#include <immintrin.h>
#endif
#endif
/* cpuid version to get all registers. Donated by Erik Lindahl from Gromacs. */
static inline void
cpuid_all(int level, int ecxval, int *eax, int *ebx, int *ecx, int *edx)
{
# ifdef _MSC_VER
int CPUInfo[4];
#if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
/* MSVC 9.0 SP1 or later */
__cpuidex(CPUInfo, level, ecxval);
#else
__cpuid(CPUInfo, level);
#endif
*eax = CPUInfo[0];
*ebx = CPUInfo[1];
*ecx = CPUInfo[2];
*edx = CPUInfo[3];
# else
/* Not MSVC */
*eax = level;
*ecx = ecxval;
*ebx = 0;
*edx = 0;
/* No need to save ebx if we are not in pic mode */
__asm__ ("cpuid \n\t"
: "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
# endif
}
static inline int cpuid_ecx(int op)
{
# ifdef _MSC_VER
# ifdef __INTEL_COMPILER
int result;
_asm {
push rbx
mov eax,op
cpuid
mov result,ecx
pop rbx
}
return result;
# else
int cpu_info[4];
__cpuid(cpu_info,op);
return cpu_info[2];
# endif
# else
int eax, ecx = 0, edx;
__asm__("pushq %%rbx\n\tcpuid\n\tpopq %%rbx"
: "=a" (eax), "+c" (ecx), "=d" (edx)
: "a" (op));
return ecx;
# endif
}
static inline int cpuid_ebx(int op)
{
# ifdef _MSC_VER
# ifdef __INTEL_COMPILER
int result;
_asm {
push rbx
mov eax,op
cpuid
mov result,ebx
pop rbx
}
return result;
# else
int cpu_info[4];
__cpuid(cpu_info,op);
return cpu_info[1];
# endif
# else
int eax, ecx = 0, edx;
__asm__("pushq %%rbx\n\tcpuid\nmov %%ebx,%%ecx\n\tpopq %%rbx"
: "=a" (eax), "+c" (ecx), "=d" (edx)
: "a" (op));
return ecx;
# endif
}
static inline int xgetbv_eax(int op)
{
# ifdef _MSC_VER
# ifdef __INTEL_COMPILER
int veax, vedx;
_asm {
mov ecx,op
xgetbv
mov veax,eax
mov vedx,edx
}
return veax;
# else
# if defined(_MSC_VER) && (_MSC_VER >= 1600)
unsigned __int64 result;
result = _xgetbv(op);
return (int)result;
# else
# error "Need at least Visual Studio 10 SP1 for AVX support"
# endif
# endif
# else
int eax, edx;
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (op));
return eax;
#endif
}
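
The header above only wraps the raw CPUID/XGETBV instructions; the detection files that follow combine them. A hedged usage sketch (GCC or Clang on x86-64 assumed; the include name assumes this header is amd64-cpuid.h) that prints the vendor string returned by leaf 0:

/* Usage sketch for the helpers above (illustration only).
   CPUID leaf 0 returns the vendor string split across ebx, edx, ecx. */
#include <stdio.h>
#include <string.h>
#include "amd64-cpuid.h"       /* the header shown above (assumed name) */

int main(void)
{
    int eax, ebx, ecx, edx;
    char vendor[13];

    cpuid_all(0, 0, &eax, &ebx, &ecx, &edx);
    memcpy(vendor + 0, &ebx, 4);   /* "Genu" or "Auth" */
    memcpy(vendor + 4, &edx, 4);   /* "ineI" or "enti" */
    memcpy(vendor + 8, &ecx, 4);   /* "ntel" or "cAMD" */
    vendor[12] = '\0';
    printf("vendor: %s, max standard leaf: %d\n", vendor, eax);
    return 0;
}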

View File: simd-support/avx-128-fma.c

@@ -0,0 +1,57 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_AVX_128_FMA
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif
int X(have_simd_avx_128_fma)(void)
{
static int init = 0, res = 0;
int eax,ebx,ecx,edx;
if (!init)
{
/* Check if this is an AMD CPU */
cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
/* ebx = 0x68747541 ("Auth"), edx = 0x69746e65 ("enti"), ecx = 0x444d4163 ("cAMD") */
if (ebx==0x68747541 && ecx==0x444d4163 && edx==0x69746e65)
{
/* OK, this is an AMD CPU. Check if we support FMA4 */
cpuid_all(0x80000001,0,&eax,&ebx,&ecx,&edx);
if(ecx & (1<<16))
{
res = 1;
}
}
init = 1;
}
return res;
}
#endif

View File: simd-support/avx.c

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_AVX
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif
int X(have_simd_avx)(void)
{
static int init = 0, res = 0;
int max_stdfn, eax, ebx, ecx, edx;
if (!init) {
cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
max_stdfn = eax;
if (max_stdfn >= 0x1) {
/* have AVX and OSXSAVE? (implies XGETBV exists) */
cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
if ((ecx & 0x18000000) == 0x18000000) {
/* have OS support for XMM, YMM? */
res = ((xgetbv_eax(0) & 0x6) == 0x6);
}
}
init = 1;
}
return res;
}
#endif
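
The constants above encode two separate requirements: CPUID leaf 1 ECX bit 28 (AVX) plus bit 27 (OSXSAVE) form 0x18000000, and XCR0 bits 1 and 2 (XMM and YMM state saved by the OS) form 0x6. The same test spelled out with named bits (sketch only, leaf-count check omitted; relies on cpuid_all and xgetbv_eax from the cpuid header above):

/* The AVX test above restated with named bit positions (sketch only). */
#include "amd64-cpuid.h"               /* assumed name of the header above */

#define CPUID1_ECX_OSXSAVE (1 << 27)   /* OS uses XSAVE, so XGETBV is usable */
#define CPUID1_ECX_AVX     (1 << 28)   /* CPU implements AVX */
#define XCR0_SSE_STATE     (1 << 1)    /* OS saves XMM registers */
#define XCR0_AVX_STATE     (1 << 2)    /* OS saves YMM registers */

static int have_avx_spelled_out(void)
{
    int eax, ebx, ecx, edx;
    int need  = CPUID1_ECX_OSXSAVE | CPUID1_ECX_AVX;   /* 0x18000000 */
    int state = XCR0_SSE_STATE | XCR0_AVX_STATE;       /* 0x6 */

    cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
    if ((ecx & need) != need)
        return 0;
    return (xgetbv_eax(0) & state) == state;
}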

View File: simd-support/avx2.c

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_AVX2
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
# include "amd64-cpuid.h"
#else
# include "x86-cpuid.h"
#endif
int X(have_simd_avx2_128)(void)
{
static int init = 0, res;
int max_stdfn, eax, ebx, ecx, edx;
if (!init) {
cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
max_stdfn = eax;
if (max_stdfn >= 0x1) {
/* have AVX and OSXSAVE? (implies XGETBV exists) */
cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
if ((ecx & 0x18000000) == 0x18000000) {
/* have AVX2? */
cpuid_all(7,0,&eax,&ebx,&ecx,&edx);
if (ebx & (1 << 5)) {
/* have OS support for XMM, YMM? */
res = ((xgetbv_eax(0) & 0x6) == 0x6);
}
}
}
init = 1;
}
return res;
}
int X(have_simd_avx2)(void)
{
/*
* For now 256-bit AVX2 support is identical to 128-bit.
* This might change in the future if AMD releases AVX2-capable
* chips that work better with the 128-bit flavor, but since AMD
* might just as well implement 256-bit AVX2 efficiently by then,
* we don't want to disable it before we know.
*/
return X(have_simd_avx2_128)();
}
#endif

View File: simd-support/avx512.c

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
* Copyright (c) 2012-2013 Romain Dolbeau
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#include "kernel/ifftw.h"
#if HAVE_AVX512
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
#include "amd64-cpuid.h"
int X(have_simd_avx512)(void)
{
static int init = 0, res;
int max_stdfn, eax, ebx, ecx, edx;
/* NOTE: this code is a total guess. I don't have an avx512
machine available. The code contributed by Erik Lindahl would
crash on a machine without XGETBV, so I had to guess a fix. */
if (!init) {
cpuid_all(0,0,&eax,&ebx,&ecx,&edx);
max_stdfn = eax;
if (max_stdfn >= 0x1) {
/* have OSXSAVE? (implies XGETBV exists) */
cpuid_all(0x1, 0, &eax, &ebx, &ecx, &edx);
if ((ecx & 0x08000000) == 0x08000000) {
/* have AVX512? */
cpuid_all(7,0,&eax,&ebx,&ecx,&edx);
if (ebx & (1 << 16)) {
/* have OS support for XMM, YMM, ZMM */
int zmm_ymm_xmm = (7 << 5) | (1 << 2) | (1 << 1);
res = ((xgetbv_eax(0) & zmm_ymm_xmm) == zmm_ymm_xmm);
}
}
}
init = 1;
}
return res;
}
#else /* 32-bit code */
#error "Avx512 is 64 bits only"
#endif
#endif
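
The wider XCR0 mask above adds the AVX-512 state components on top of the XMM/YMM bits: (7 << 5) covers bits 5-7 (opmask registers k0-k7, the upper 256 bits of ZMM0-15, and ZMM16-31), while (1 << 2) | (1 << 1) are the usual YMM/XMM bits, giving 0xE6 in total. Restated with names (sketch only):

/* Composition of the zmm_ymm_xmm mask used above (illustration only). */
enum {
    XCR0_SSE       = 1 << 1,   /* XMM state */
    XCR0_AVX       = 1 << 2,   /* YMM state */
    XCR0_OPMASK    = 1 << 5,   /* AVX-512 opmask registers k0-k7 */
    XCR0_ZMM_HI256 = 1 << 6,   /* upper 256 bits of ZMM0-ZMM15 */
    XCR0_HI16_ZMM  = 1 << 7    /* ZMM16-ZMM31 */
};
/* (7 << 5) | (1 << 2) | (1 << 1) == 0xE6
   == XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM | XCR0_AVX | XCR0_SSE */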

View File: simd-support/kcvi.c

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
* Copyright (c) 2012-2013 Romain Dolbeau
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#include "kernel/ifftw.h"
#if HAVE_KCVI
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
#include "amd64-cpuid.h"
int X(have_simd_kcvi)(void)
{
static int init = 0, res;
if (!init) {
res = 1;
init = 1;
}
return res;
}
#else /* 32-bit code */
#error "KCvi is 64 bits only"
#endif
#endif

View File: simd-support/neon.c

@@ -0,0 +1,76 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_NEON
/* check for an environment where signals are known to work */
#if defined(unix) || defined(linux)
# include <signal.h>
# include <setjmp.h>
static jmp_buf jb;
static void sighandler(int x)
{
UNUSED(x);
longjmp(jb, 1);
}
static int really_have_neon(void)
{
void (*oldsig)(int);
oldsig = signal(SIGILL, sighandler);
if (setjmp(jb)) {
signal(SIGILL, oldsig);
return 0;
} else {
/* paranoia: encode the instruction in binary because the
assembler may not recognize it without -mfpu=neon */
/*asm volatile ("vand q0, q0, q0");*/
asm volatile (".long 0xf2000150");
signal(SIGILL, oldsig);
return 1;
}
}
int X(have_simd_neon)(void)
{
static int init = 0, res;
if (!init) {
res = really_have_neon();
init = 1;
}
return res;
}
#else
/* don't know how to autodetect NEON; assume it is present */
int X(have_simd_neon)(void)
{
return 1;
}
#endif
#endif

View File: simd-support/simd-altivec.h

@@ -0,0 +1,297 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef FFTW_SINGLE
#error "ALTIVEC only works in single precision"
#endif
/* define these unconditionally, because they are used by
taint.c which is compiled without altivec */
#define SIMD_SUFFIX _altivec /* for renaming */
#define VL 2 /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OKA
#if !defined(__VEC__) && !defined(FAKE__VEC__)
# error "compiling simd-altivec.h requires -maltivec or equivalent"
#endif
#ifdef HAVE_ALTIVEC_H
# include <altivec.h>
#endif
typedef vector float V;
#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val, val, val)
static inline V VADD(V a, V b) { return vec_add(a, b); }
static inline V VSUB(V a, V b) { return vec_sub(a, b); }
static inline V VFMA(V a, V b, V c) { return vec_madd(a, b, c); }
static inline V VFNMS(V a, V b, V c) { return vec_nmsub(a, b, c); }
static inline V VMUL(V a, V b)
{
DVK(zero, -0.0);
return VFMA(a, b, zero);
}
static inline V VFMS(V a, V b, V c) { return VSUB(VMUL(a, b), c); }
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
UNUSED(ivs);
UNUSED(aligned_like);
return vec_ld(0, x);
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
/* common subexpressions */
const INT fivs = sizeof(R) * ivs;
/* you are not expected to understand this: */
const vector unsigned int perm = VLIT(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
vector unsigned char ml = vec_lvsr(fivs + 8, aligned_like);
vector unsigned char mh = vec_lvsl(0, aligned_like);
vector unsigned char msk =
(vector unsigned char)vec_sel((V)mh, (V)ml, perm);
/* end of common subexpressions */
return vec_perm(vec_ld(0, x), vec_ld(fivs, x), msk);
}
/* store lower half */
static inline void STH(R *x, V v, R *aligned_like)
{
v = vec_perm(v, v, vec_lvsr(0, aligned_like));
vec_ste(v, 0, x);
vec_ste(v, sizeof(R), x);
}
static inline void STL(R *x, V v, INT ovs, R *aligned_like)
{
const INT fovs = sizeof(R) * ovs;
v = vec_perm(v, v, vec_lvsr(fovs + 8, aligned_like));
vec_ste(v, fovs, x);
vec_ste(v, sizeof(R) + fovs, x);
}
static inline void STA(R *x, V v, INT ovs, R *aligned_like)
{
UNUSED(ovs);
UNUSED(aligned_like);
vec_st(v, 0, x);
}
static inline void ST(R *x, V v, INT ovs, R *aligned_like)
{
/* WARNING: the extra_iter hack depends upon STH occurring after
STL */
STL(x, v, ovs, aligned_like);
STH(x, v, aligned_like);
}
#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
const INT fovs = sizeof(R) * ovs;
const vector unsigned int even =
VLIT(0x00010203, 0x04050607, 0x10111213, 0x14151617);
const vector unsigned int odd =
VLIT(0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f);
vec_st(vec_perm(v0, v1, (vector unsigned char)even), 0, x);
vec_st(vec_perm(v0, v1, (vector unsigned char)odd), fovs, x);
}
#define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
const INT fovs = sizeof(R) * ovs;
V x0 = vec_mergeh(v0, v2);
V x1 = vec_mergel(v0, v2);
V x2 = vec_mergeh(v1, v3);
V x3 = vec_mergel(v1, v3);
V y0 = vec_mergeh(x0, x2);
V y1 = vec_mergel(x0, x2);
V y2 = vec_mergeh(x1, x3);
V y3 = vec_mergel(x1, x3);
vec_st(y0, 0, x);
vec_st(y1, fovs, x);
vec_st(y2, 2 * fovs, x);
vec_st(y3, 3 * fovs, x);
}
static inline V FLIP_RI(V x)
{
const vector unsigned int perm =
VLIT(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
return vec_perm(x, x, (vector unsigned char)perm);
}
static inline V VCONJ(V x)
{
const V pmpm = VLIT(0.0, -0.0, 0.0, -0.0);
return vec_xor(x, pmpm);
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
static inline V VFMAI(V b, V c)
{
const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
return VFMA(FLIP_RI(b), mpmp, c);
}
static inline V VFNMSI(V b, V c)
{
const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
return VFNMS(FLIP_RI(b), mpmp, c);
}
static inline V VFMACONJ(V b, V c)
{
const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
return VFMA(b, pmpm, c);
}
static inline V VFNMSCONJ(V b, V c)
{
const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
return VFNMS(b, pmpm, c);
}
static inline V VFMSCONJ(V b, V c)
{
return VSUB(VCONJ(b), c);
}
static inline V VZMUL(V tx, V sr)
{
const vector unsigned int real =
VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
const vector unsigned int imag =
VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
V si = VBYI(sr);
V tr = vec_perm(tx, tx, (vector unsigned char)real);
V ti = vec_perm(tx, tx, (vector unsigned char)imag);
return VFMA(ti, si, VMUL(tr, sr));
}
static inline V VZMULJ(V tx, V sr)
{
const vector unsigned int real =
VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
const vector unsigned int imag =
VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
V si = VBYI(sr);
V tr = vec_perm(tx, tx, (vector unsigned char)real);
V ti = vec_perm(tx, tx, (vector unsigned char)imag);
return VFNMS(ti, si, VMUL(tr, sr));
}
static inline V VZMULI(V tx, V si)
{
const vector unsigned int real =
VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
const vector unsigned int imag =
VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
V sr = VBYI(si);
V tr = vec_perm(tx, tx, (vector unsigned char)real);
V ti = vec_perm(tx, tx, (vector unsigned char)imag);
return VFNMS(ti, si, VMUL(tr, sr));
}
static inline V VZMULIJ(V tx, V si)
{
const vector unsigned int real =
VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
const vector unsigned int imag =
VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
V sr = VBYI(si);
V tr = vec_perm(tx, tx, (vector unsigned char)real);
V ti = vec_perm(tx, tx, (vector unsigned char)imag);
return VFMA(ti, si, VMUL(tr, sr));
}
/* twiddle storage #1: compact, slower */
#define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = VBYI(sr);
V tx = twp[0];
V tr = vec_mergeh(tx, tx);
V ti = vec_mergel(tx, tx);
return VFMA(ti, si, VMUL(tr, sr));
}
static inline V BYTWJ1(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = VBYI(sr);
V tx = twp[0];
V tr = vec_mergeh(tx, tx);
V ti = vec_mergel(tx, tx);
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(ti, si, VMUL(tr, sr));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL3 (VL)
/* twiddle storage for split arrays */
#define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File: simd-support/simd-avx-128-fma.h

@@ -0,0 +1,332 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* 128-bit AVX support by Erik Lindahl, 2015.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _avx_128_fma /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in term of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#include <immintrin.h>
#ifdef _MSC_VER
# include <intrin.h>
#elif defined (__GNUC__)
# include <x86intrin.h>
#endif
#if !(defined(__AVX__) && defined(__FMA4__)) /* sanity check */
#error "compiling simd-avx-128-fma.h without -mavx or -mfma4"
#endif
typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define VPERM1 SUFF(_mm_permute_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(_mm_permute_pd(x,0), _mm_moveldup_ps(x))
#define VDUPH(x) DS(_mm_permute_pd(x,3), _mm_movehdup_ps(x))
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))
#define VLIT(x0, x1) DS(_mm_set_pd(x0, x1), _mm_set_ps(x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
#ifdef FFTW_SINGLE
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
var = LOADL(x, SUFF(_mm_undefined_p)());
var = LOADH(x + ivs, var);
#else
var = LOADL(x, var);
var = LOADH(x + ivs, var);
#endif
return var;
}
# ifdef _MSC_VER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + ovs, v);
STOREL(x, v);
}
#else /* ! FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." 3 parameters ought to be enough
for anybody. */
# define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = UNPCKL(v0, v2); \
xxx1 = UNPCKH(v0, v2); \
xxx2 = UNPCKL(v1, v3); \
xxx3 = UNPCKH(v1, v3); \
STA(x, UNPCKL(xxx0, xxx2), 0, 0); \
STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0); \
STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \
STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
STOREL(x, v);
STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
static inline V FLIP_RI(V x)
{
return VPERM1(x, DS(1, SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* Produce a SIMD vector[VL] of (0 + -0i).
We really want to write this:
V pmpm = VLIT(-0.0, 0.0);
but historically some compilers have ignored the distinction
between +0 and -0. It looks like 'gcc-8 -ffast-math' treats -0
as 0 too.
*/
union uvec {
unsigned u[4];
V v;
};
static const union uvec pmpm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pmpm.v, x);
}
static inline V VBYI(V x)
{
x = VCONJ(x);
x = FLIP_RI(x);
return x;
}
/* FMA support */
#define VFMA(a, b, c) SUFF(_mm_macc_p)(a,b,c)
#define VFNMS(a, b, c) SUFF(_mm_nmacc_p)(a,b,c)
#define VFMS(a, b, c) SUFF(_mm_msub_p)(a,b,c)
#define VFMAI(b, c) SUFF(_mm_addsub_p)(c,FLIP_RI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm_addsub_p)(c,b)
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(tr, sr);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_addsub_p)(tr,ti);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
tr = VMUL(tr, FLIP_RI(sr));
return SUFF(_mm_addsub_p)(ti,tr);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
tr = VMUL(tr, sr);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_addsub_p)(tr,ti);
}
static inline V BYTWJ1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File: simd-support/simd-avx.h

@@ -0,0 +1,404 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _avx /* for renaming */
#define VL DS(2, 4) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__AVX__) /* sanity check */
#error "compiling simd-avx.h without -mavx"
#endif
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#include <immintrin.h>
typedef DS(__m256d, __m256) V;
#define VADD SUFF(_mm256_add_p)
#define VSUB SUFF(_mm256_sub_p)
#define VMUL SUFF(_mm256_mul_p)
#define VXOR SUFF(_mm256_xor_p)
#define VSHUF SUFF(_mm256_shuffle_p)
#define SHUFVALD(fp0,fp1) \
(((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(_mm256_unpacklo_pd(x, x), VSHUF(x, x, SHUFVALS(0, 0, 2, 2)))
#define VDUPH(x) DS(_mm256_unpackhi_pd(x, x), VSHUF(x, x, SHUFVALS(1, 1, 3, 3)))
#define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return SUFF(_mm256_loadu_p)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
SUFF(_mm256_storeu_p)(x, v);
}
#if FFTW_SINGLE
# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
'name' used" and the runtime checks for use of a variable before it is
defined, which are erroneously triggered by the LOADL / LOADH macros
because each of them only initializes part of the value. */
# ifndef __INTEL_COMPILER
# pragma warning(disable : 4700)
# pragma runtime_checks("u", off)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(disable : 592)
# endif
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)
/* it seems like the only AVX way to store 4 complex floats is to
extract two pairs of complex floats into two __m128 registers, and
then use SSE-like half-stores. Similarly, to load 4 complex
floats, we load two pairs of complex floats into two __m128
registers, and then pack the two __m128 registers into one __m256
value. */
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
__m128 l, h;
V v;
(void)aligned_like; /* UNUSED */
l = LOADL(x, l);
l = LOADH(x + ivs, l);
h = LOADL(x + 2*ivs, h);
h = LOADH(x + 3*ivs, h);
v = _mm256_castps128_ps256(l);
v = _mm256_insertf128_ps(v, h, 1);
return v;
}
# ifdef _MSC_VER
# ifndef __INTEL_COMPILER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(default : 592)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
__m128 h = _mm256_extractf128_ps(v, 1);
__m128 l = _mm256_castps256_ps128(v);
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + 3*ovs, h);
STOREL(x + 2*ovs, h);
STOREH(x + ovs, l);
STOREL(x, l);
}
#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
__m128 h0 = _mm256_extractf128_ps(x0, 1);
__m128 l0 = _mm256_castps256_ps128(x0);
__m128 h1 = _mm256_extractf128_ps(x1, 1);
__m128 l1 = _mm256_castps256_ps128(x1);
*(__m128 *)(x + 3*ovs) = h1;
*(__m128 *)(x + 2*ovs) = h0;
*(__m128 *)(x + 1*ovs) = l1;
*(__m128 *)(x + 0*ovs) = l0;
}
#define STM4(x, v, ovs, aligned_like) /* no-op */
#define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
V yyy0, yyy1, yyy2, yyy3; \
xxx0 = _mm256_unpacklo_ps(v0, v2); \
xxx1 = _mm256_unpackhi_ps(v0, v2); \
xxx2 = _mm256_unpacklo_ps(v1, v3); \
xxx3 = _mm256_unpackhi_ps(v1, v3); \
yyy0 = _mm256_unpacklo_ps(xxx0, xxx2); \
yyy1 = _mm256_unpackhi_ps(xxx0, xxx2); \
yyy2 = _mm256_unpacklo_ps(xxx1, xxx3); \
yyy3 = _mm256_unpackhi_ps(xxx1, xxx3); \
*(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0); \
*(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1); \
*(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1); \
*(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1); \
*(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2); \
*(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1); \
*(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3); \
*(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1); \
}
#else
static inline __m128d VMOVAPD_LD(const R *x)
{
/* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
into a 256-bit vmovapd, which requires 32-byte alignment instead of
16-byte alignment.
Force the use of vmovapd via asm until compilers stabilize.
*/
#if defined(__GNUC__)
__m128d var;
__asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
return var;
#else
return *(const __m128d *)x;
#endif
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
(void)aligned_like; /* UNUSED */
var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
return var;
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon the store of the low
part occurring after the store of the high part */
*(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
*(__m128d *)x = _mm256_castpd256_pd128(v);
}
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." Even though the comment
was made about __m128 parameters, it appears to apply to __m256
parameters as well. */
#define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = _mm256_unpacklo_pd(v0, v1); \
xxx1 = _mm256_unpackhi_pd(v0, v1); \
xxx2 = _mm256_unpacklo_pd(v2, v3); \
xxx3 = _mm256_unpackhi_pd(v2, v3); \
STA(x, _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0); \
STA(x + ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0); \
STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
}
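/* Note: the 0x20 / 0x31 selectors of _mm256_permute2f128_pd pick
(low128(a), low128(b)) and (high128(a), high128(b)) respectively,
recombining the unpacked halves into the four output rows. */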
#endif
static inline V FLIP_RI(V x)
{
return VSHUF(x, x,
DS(SHUFVALD(1, 0),
SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* Produce a SIMD vector[VL] of (0 + -0i).
We really want to write this:
V pmpm = VLIT(-0.0, 0.0);
but historically some compilers have ignored the distinction
between +0 and -0. It looks like 'gcc-8 -ffast-math' treats -0
as 0 too.
*/
union uvec {
unsigned u[8];
V v;
};
static const union uvec pmpm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000,
0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pmpm.v, x);
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
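/* The VZMUL* helpers below compute the complex product
t*s = (t.re*s.re - t.im*s.im) + i*(t.re*s.im + t.im*s.re):
VDUPL/VDUPH broadcast t.re and t.im within each complex slot,
VBYI(s) yields i*s, and the final FMA combines the two partial
products. */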
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)
/* Use VZEROUPPER to avoid the penalty of switching from AVX to SSE.
See Intel Optimization Manual (April 2011, version 248966), Section
11.3 */
#define VLEAVE _mm256_zeroupper
#include "simd-common.h"

View File

@@ -0,0 +1,342 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* 128-bit AVX2 support by Erik Lindahl, 2015.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX2 only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _avx2_128 /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__AVX2__) /* sanity check */
#error "compiling simd-avx2-128.h without avx2 support"
#endif
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#include <immintrin.h>
typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define VPERM1 SUFF(_mm_permute_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(_mm_permute_pd(x,0), _mm_moveldup_ps(x))
#define VDUPH(x) DS(_mm_permute_pd(x,3), _mm_movehdup_ps(x))
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))
#define VLIT(x0, x1) DS(_mm_set_pd(x0, x1), _mm_set_ps(x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
#ifdef FFTW_SINGLE
# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
'name' used" and the runtime checks for use of a variable before it
is defined, which are erroneously triggered by the LOADL / LOADH
macros since each of them modifies only part of the value. */
# ifndef __INTEL_COMPILER
# pragma warning(disable : 4700)
# pragma runtime_checks("u", off)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(disable : 592)
# endif
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
__m128 l0, l1;
(void)aligned_like; /* UNUSED */
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
l0 = LOADL(x, SUFF(_mm_undefined_p)());
l1 = LOADL(x + ivs, SUFF(_mm_undefined_p)());
#else
l0 = LOADL(x, l0);
l1 = LOADL(x + ivs, l1);
#endif
return SUFF(_mm_movelh_p)(l0,l1);
}
# ifdef _MSC_VER
# ifndef __INTEL_COMPILER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(default : 592)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + ovs, v);
STOREL(x, v);
}
#else /* ! FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." 3 parameters ought to be enough
for anybody. */
# define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = UNPCKL(v0, v2); \
xxx1 = UNPCKH(v0, v2); \
xxx2 = UNPCKL(v1, v3); \
xxx3 = UNPCKH(v1, v3); \
STA(x, UNPCKL(xxx0, xxx2), 0, 0); \
STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0); \
STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \
STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
STOREL(x, v);
STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
static inline V FLIP_RI(V x)
{
return VPERM1(x, DS(1, SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* Produce a SIMD vector[VL] of (0 + -0i).
We really want to write this:
V pmpm = VLIT(-0.0, 0.0);
but historically some compilers have ignored the distinction
between +0 and -0. It looks like 'gcc-8 -ffast-math' treats -0
as 0 too.
*/
union uvec {
unsigned u[4];
V v;
};
static const union uvec pmpm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pmpm.v, x);
}
static inline V VBYI(V x)
{
x = VCONJ(x);
x = FLIP_RI(x);
return x;
}
/* FMA support */
#define VFMA(a, b, c) SUFF(_mm_fmadd_p)(a,b,c)
#define VFNMS(a, b, c) SUFF(_mm_fnmadd_p)(a,b,c)
#define VFMS(a, b, c) SUFF(_mm_fmsub_p)(a,b,c)
#define VFMAI(b, c) SUFF(_mm_addsub_p)(c,FLIP_RI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm_addsub_p)(c,b)
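/* Note on the addsub forms above: SUFF(_mm_addsub_p) subtracts in the
even (real) lanes and adds in the odd (imaginary) lanes, so
SUFF(_mm_addsub_p)(c, FLIP_RI(b)) == c + i*b (VFMAI) and
SUFF(_mm_addsub_p)(c, b) == c - conj(b) (VFNMSCONJ). */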
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_fmaddsub_p)(tr,sr,ti);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_fmsubadd_p)(tr,sr,ti);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(tr, FLIP_RI(sr));
return SUFF(_mm_fmaddsub_p)(ti,sr,tr);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_fmaddsub_p)(tr,sr,ti);
}
static inline V BYTWJ1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
ti = VMUL(ti, FLIP_RI(sr));
return SUFF(_mm_fmsubadd_p)(tr,sr,ti);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,414 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* Modifications by Romain Dolbeau & Erik Lindahl, derived from simd-avx.h
* Romain Dolbeau hereby places his modifications in the public domain.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX2 only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _avx2 /* for renaming */
#define VL DS(2, 4) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__AVX2__) /* sanity check */
#error "compiling simd-avx2.h without avx2 support"
#endif
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
#include <immintrin.h>
typedef DS(__m256d, __m256) V;
#define VADD SUFF(_mm256_add_p)
#define VSUB SUFF(_mm256_sub_p)
#define VMUL SUFF(_mm256_mul_p)
#define VXOR SUFF(_mm256_xor_p)
#define VSHUF SUFF(_mm256_shuffle_p)
#define VPERM1 SUFF(_mm256_permute_p)
#define SHUFVALD(fp0,fp1) \
(((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(_mm256_movedup_pd(x), _mm256_moveldup_ps(x))
#define VDUPH(x) DS(_mm256_permute_pd(x,SHUFVALD(1,1)), _mm256_movehdup_ps(x))
#define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return SUFF(_mm256_loadu_p)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
SUFF(_mm256_storeu_p)(x, v);
}
#if FFTW_SINGLE
# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
'name' used" and the runtime checks for use of a variable before it
is defined, which are erroneously triggered by the LOADL / LOADH
macros since each of them modifies only part of the value. */
# ifndef __INTEL_COMPILER
# pragma warning(disable : 4700)
# pragma runtime_checks("u", off)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(disable : 592)
# endif
#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
__m128 l0, l1, h0, h1;
(void)aligned_like; /* UNUSED */
#if defined(__ICC) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)
l0 = LOADL(x, SUFF(_mm_undefined_p)());
l1 = LOADL(x + ivs, SUFF(_mm_undefined_p)());
h0 = LOADL(x + 2*ivs, SUFF(_mm_undefined_p)());
h1 = LOADL(x + 3*ivs, SUFF(_mm_undefined_p)());
#else
l0 = LOADL(x, l0);
l1 = LOADL(x + ivs, l1);
h0 = LOADL(x + 2*ivs, h0);
h1 = LOADL(x + 3*ivs, h1);
#endif
l0 = SUFF(_mm_movelh_p)(l0,l1);
h0 = SUFF(_mm_movelh_p)(h0,h1);
return _mm256_insertf128_ps(_mm256_castps128_ps256(l0), h0, 1);
}
# ifdef _MSC_VER
# ifndef __INTEL_COMPILER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(default : 592)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
__m128 h = _mm256_extractf128_ps(v, 1);
__m128 l = _mm256_castps256_ps128(v);
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + 3*ovs, h);
STOREL(x + 2*ovs, h);
STOREH(x + ovs, l);
STOREL(x, l);
}
#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
__m128 h0 = _mm256_extractf128_ps(x0, 1);
__m128 l0 = _mm256_castps256_ps128(x0);
__m128 h1 = _mm256_extractf128_ps(x1, 1);
__m128 l1 = _mm256_castps256_ps128(x1);
*(__m128 *)(x + 3*ovs) = h1;
*(__m128 *)(x + 2*ovs) = h0;
*(__m128 *)(x + 1*ovs) = l1;
*(__m128 *)(x + 0*ovs) = l0;
}
#define STM4(x, v, ovs, aligned_like) /* no-op */
#define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
V yyy0, yyy1, yyy2, yyy3; \
xxx0 = _mm256_unpacklo_ps(v0, v2); \
xxx1 = _mm256_unpackhi_ps(v0, v2); \
xxx2 = _mm256_unpacklo_ps(v1, v3); \
xxx3 = _mm256_unpackhi_ps(v1, v3); \
yyy0 = _mm256_unpacklo_ps(xxx0, xxx2); \
yyy1 = _mm256_unpackhi_ps(xxx0, xxx2); \
yyy2 = _mm256_unpacklo_ps(xxx1, xxx3); \
yyy3 = _mm256_unpackhi_ps(xxx1, xxx3); \
*(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0); \
*(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1); \
*(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1); \
*(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1); \
*(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2); \
*(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1); \
*(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3); \
*(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1); \
}
#else
static inline __m128d VMOVAPD_LD(const R *x)
{
/* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
into a 256-bit vmovapd, which requires 32-byte alignment instead of
16-byte alignment.
Force the use of vmovapd via asm until compilers stabilize.
*/
#if defined(__GNUC__)
__m128d var;
__asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
return var;
#else
return *(const __m128d *)x;
#endif
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
(void)aligned_like; /* UNUSED */
var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
return var;
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon the store of the low
part occurring after the store of the high part */
*(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
*(__m128d *)x = _mm256_castpd256_pd128(v);
}
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." Even though the comment
was made about __m128 parameters, it appears to apply to __m256
parameters as well. */
#define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = _mm256_unpacklo_pd(v0, v1); \
xxx1 = _mm256_unpackhi_pd(v0, v1); \
xxx2 = _mm256_unpacklo_pd(v2, v3); \
xxx3 = _mm256_unpackhi_pd(v2, v3); \
STA(x, _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0); \
STA(x + ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0); \
STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
}
#endif
static inline V FLIP_RI(V x)
{
return VPERM1(x, DS(SHUFVALD(1, 0), SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* Produce a SIMD vector[VL] of (0 + -0i).
We really want to write this:
V pmpm = VLIT(-0.0, 0.0);
but historically some compilers have ignored the distinction
between +0 and -0. It looks like 'gcc-8 -ffast-math' treats -0
as 0 too.
*/
union uvec {
unsigned u[8];
V v;
};
static const union uvec pmpm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000,
0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pmpm.v, x);
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
/* FMA support */
#define VFMA SUFF(_mm256_fmadd_p)
#define VFNMS SUFF(_mm256_fnmadd_p)
#define VFMS SUFF(_mm256_fmsub_p)
#define VFMAI(b, c) SUFF(_mm256_addsub_p)(c, FLIP_RI(b)) /* VADD(c, VBYI(b)) */
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) SUFF(_mm256_addsub_p)(c, b) /* VSUB(c, VCONJ(b)) */
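/* The fused forms below exploit SUFF(_mm256_fmaddsub_p), which computes
a*b - c in even (real) lanes and a*b + c in odd (imaginary) lanes.
In VZMUL, a = sr, b = VDUPL(tx) and c = VMUL(FLIP_RI(sr), VDUPH(tx)),
giving (sr.re*tx.re - sr.im*tx.im, sr.im*tx.re + sr.re*tx.im), i.e.
the complex product tx*sr; VZMULJ uses the complementary fmsubadd
form to obtain conj(tx)*sr. */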
static inline V VZMUL(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* tr = VMUL(sr, tr); */
/* sr = VBYI(sr); */
/* return VFMA(ti, sr, tr); */
return SUFF(_mm256_fmaddsub_p)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}
static inline V VZMULJ(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* tr = VMUL(sr, tr); */
/* sr = VBYI(sr); */
/* return VFNMS(ti, sr, tr); */
return SUFF(_mm256_fmsubadd_p)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
/*
* Keep the old version
* (2 permute, 1 shuffle, 1 constant load (L1), 1 xor, 2 fp), since the below FMA one
* would be (2 permute, 1 shuffle, 1 xor (setzero), 3 fp), but with a longer pipeline.
*
* Alternative new fma version:
* return SUFF(_mm256_addsub_p)(SUFF(_mm256_fnmadd_p)(sr, VDUPH(tx), SUFF(_mm256_setzero_p)()),
* VMUL(FLIP_RI(sr), VDUPL(tx)));
*/
}
static inline V VZMULIJ(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* ti = VMUL(ti, sr); */
/* sr = VBYI(sr); */
/* return VFMA(tr, sr, ti); */
return SUFF(_mm256_fmaddsub_p)(sr, VDUPH(tx), VMUL(FLIP_RI(sr), VDUPL(tx)));
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE _mm256_zeroupper
#include "simd-common.h"

View File

@@ -0,0 +1,316 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
*
* AVX-512 support implemented by Romain Dolbeau.
* Romain Dolbeau hereby places his modifications in the public domain.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX-512 vector instructions only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _ps
# define SCAL(x) x ## f
#else /* !FFTW_SINGLE */
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _pd
# define SCAL(x) x
#endif /* FFTW_SINGLE */
#define SIMD_SUFFIX _avx512 /* for renaming */
#define VL DS(4, 8) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__AVX512F__) /* sanity check */
#error "compiling simd-avx512.h without avx-512f support"
#endif
#if !defined(HAVE_AVX2)
#warning "You should probably enable AVX2 with --enable-avx2 for AVX-512"
#endif
#include <immintrin.h>
typedef DS(__m512d, __m512) V;
#define VLIT(re, im) DS(SUFF(_mm512_setr)(im, re, im, re, im, re, im, re),SUFF(_mm512_setr)(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re))
#define VLIT1(val) SUFF(_mm512_set1)(val)
#define LDK(x) x
#define DVK(var, val) V var = VLIT1(val)
#define VZERO SUFF(_mm512_setzero)()
#define VDUPL(x) DS(_mm512_movedup_pd(x),_mm512_moveldup_ps(x))
#define VDUPH(x) DS(_mm512_unpackhi_pd(x, x),_mm512_movehdup_ps(x))
#define FLIP_RI(x) SUFF(_mm512_shuffle)(x, x, DS(0x55,0xB1))
#define VCONJ(x) SUFF(_mm512_fmsubadd)(VZERO, VZERO, x)
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
#define VADD(a,b) SUFF(_mm512_add)(a,b)
#define VSUB(a,b) SUFF(_mm512_sub)(a,b)
#define VMUL(a,b) SUFF(_mm512_mul)(a,b)
#define VFMA(a, b, c) SUFF(_mm512_fmadd)(a, b, c)
#define VFMS(a, b, c) SUFF(_mm512_fmsub)(a, b, c)
#define VFNMS(a, b, c) SUFF(_mm512_fnmadd)(a, b, c)
#define VFMAI(b, c) SUFF(_mm512_fmaddsub)(VLIT1(1.), c, FLIP_RI(b))
#define VFNMSI(b, c) SUFF(_mm512_fmsubadd)(VLIT1(1.), c, FLIP_RI(b))
#define VFMACONJ(b,c) SUFF(_mm512_fmsubadd)(VLIT1(1.), c, b)
#define VFMSCONJ(b,c) SUFF(_mm512_fmsubadd)(VLIT1(-1.), c, b)
#define VFNMSCONJ(b,c) SUFF(_mm512_fmaddsub)(VLIT1(1.), c, b)
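/* Sign conventions used above: SUFF(_mm512_fmaddsub) subtracts its third
operand in even (real) lanes and adds it in odd (imaginary) lanes;
SUFF(_mm512_fmsubadd) does the opposite.  Hence
VCONJ(x)    = fmsubadd(0, 0, x)          = (x.re, -x.im), and
VFMAI(b, c) = fmaddsub(1, c, FLIP_RI(b)) = c + i*b. */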
static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return SUFF(_mm512_loadu)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
SUFF(_mm512_storeu)(x, v);
}
#if FFTW_SINGLE
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
6 * ivs + 1, 6 * ivs,
5 * ivs + 1, 5 * ivs,
4 * ivs + 1, 4 * ivs,
3 * ivs + 1, 3 * ivs,
2 * ivs + 1, 2 * ivs,
1 * ivs + 1, 1 * ivs,
0 * ivs + 1, 0 * ivs);
return _mm512_i32gather_ps(index, x, 4);
}
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
6 * ovs + 1, 6 * ovs,
5 * ovs + 1, 5 * ovs,
4 * ovs + 1, 4 * ovs,
3 * ovs + 1, 3 * ovs,
2 * ovs + 1, 2 * ovs,
1 * ovs + 1, 1 * ovs,
0 * ovs + 1, 0 * ovs);
_mm512_i32scatter_ps(x, index, v, 4);
}
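/* Illustrative special case: with ivs == ovs == 2 (a contiguous array of
complex floats) the gather/scatter indices above are simply 0..15, so
LDu and STu reduce to a contiguous unaligned load/store. */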
#else /* !FFTW_SINGLE */
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m256i index = _mm256_set_epi32(3 * ivs + 1, 3 * ivs,
2 * ivs + 1, 2 * ivs,
1 * ivs + 1, 1 * ivs,
0 * ivs + 1, 0 * ivs);
return _mm512_i32gather_pd(index, x, 8);
}
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m256i index = _mm256_set_epi32(3 * ovs + 1, 3 * ovs,
2 * ovs + 1, 2 * ovs,
1 * ovs + 1, 1 * ovs,
0 * ovs + 1, 0 * ovs);
_mm512_i32scatter_pd(x, index, v, 8);
}
#endif /* FFTW_SINGLE */
#define LD LDu
#define ST STu
#ifdef FFTW_SINGLE
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(15 * ovs, 14 * ovs,
13 * ovs, 12 * ovs,
11 * ovs, 10 * ovs,
9 * ovs, 8 * ovs,
7 * ovs, 6 * ovs,
5 * ovs, 4 * ovs,
3 * ovs, 2 * ovs,
1 * ovs, 0 * ovs);
_mm512_i32scatter_ps(x, index, v, 4);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#else /* !FFTW_SINGLE */
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m256i index = _mm256_set_epi32(7 * ovs, 6 * ovs,
5 * ovs, 4 * ovs,
3 * ovs, 2 * ovs,
1 * ovs, 0 * ovs);
_mm512_i32scatter_pd(x, index, v, 8);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#endif /* FFTW_SINGLE */
static inline V VZMUL(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* tr = VMUL(sr, tr); */
/* sr = VBYI(sr); */
/* return VFMA(ti, sr, tr); */
return SUFF(_mm512_fmaddsub)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}
static inline V VZMULJ(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* tr = VMUL(sr, tr); */
/* sr = VBYI(sr); */
/* return VFNMS(ti, sr, tr); */
return SUFF(_mm512_fmsubadd)(sr, VDUPL(tx), VMUL(FLIP_RI(sr), VDUPH(tx)));
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
/* return SUFF(_mm512_addsub)(SUFF(_mm512_fnmadd)(sr, VDUPH(tx), VZERO), VMUL(FLIP_RI(sr), VDUPL(tx))); */
}
static inline V VZMULIJ(V tx, V sr)
{
/* V tr = VDUPL(tx); */
/* V ti = VDUPH(tx); */
/* ti = VMUL(ti, sr); */
/* sr = VBYI(sr); */
/* return VFMA(tr, sr, ti); */
return SUFF(_mm512_fmaddsub)(sr, VDUPH(tx), VMUL(FLIP_RI(sr), VDUPL(tx)));
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v , x}, {TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
{TW_SIN, v , -x}, {TW_SIN, v , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v , x}, {TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v , -x}, {TW_SIN, v , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
/* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
/* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3(v,x) VTW1(v,x)
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v , x}, {TW_COS, v+1 , x}, {TW_COS, v+2 , x}, {TW_COS, v+3 , x}, \
{TW_COS, v+4 , x}, {TW_COS, v+5 , x}, {TW_COS, v+6 , x}, {TW_COS, v+7 , x}, \
{TW_COS, v+8 , x}, {TW_COS, v+9 , x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
{TW_SIN, v , x}, {TW_SIN, v+1 , x}, {TW_SIN, v+2 , x}, {TW_SIN, v+3 , x}, \
{TW_SIN, v+4 , x}, {TW_SIN, v+5 , x}, {TW_SIN, v+6 , x}, {TW_SIN, v+7 , x}, \
{TW_SIN, v+8 , x}, {TW_SIN, v+9 , x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}
#else /* !FFTW_SINGLE */
# define VTWS(v,x) \
{TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v , x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#endif /* FFTW_SINGLE */
#define TWVLS (2 * VL)
#define VLEAVE _mm256_zeroupper
#include "simd-common.h"

View File

@@ -0,0 +1,98 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* detection of alignment. This is complicated because a machine may
support multiple SIMD extensions (e.g. SSE2 and AVX) but only one
set of alignment constraints. So this alignment stuff cannot be
defined in the SIMD header files. Rather than defining a separate
set of "machine" header files, we just do this ugly ifdef here. */
#if defined(HAVE_SSE2) || defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX_128_FMA) || defined(HAVE_AVX512)
# if defined(FFTW_SINGLE)
# define ALIGNMENT 8 /* Alignment for the LD/ST macros */
# define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */
# else
# define ALIGNMENT 16 /* Alignment for the LD/ST macros */
# define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */
# endif
#elif defined(HAVE_ALTIVEC)
# define ALIGNMENT 8 /* Alignment for the LD/ST macros */
# define ALIGNMENTA 16 /* Alignment for the LDA/STA macros */
#elif defined(HAVE_NEON) || defined(HAVE_VSX)
# define ALIGNMENT 8 /* Alignment for the LD/ST macros */
# define ALIGNMENTA 8 /* Alignment for the LDA/STA macros */
#elif defined(HAVE_KCVI)
# if defined(FFTW_SINGLE)
# define ALIGNMENT 8 /* Alignment for the LD/ST macros */
# else
# define ALIGNMENT 16 /* Alignment for the LD/ST macros */
# endif
# define ALIGNMENTA 64 /* Alignment for the LDA/STA macros */
#elif defined(HAVE_GENERIC_SIMD256)
# if defined(FFTW_SINGLE)
# define ALIGNMENT 8
# define ALIGNMENTA 32
# else
# define ALIGNMENT 16
# define ALIGNMENTA 32
# endif
#elif defined(HAVE_GENERIC_SIMD128)
# if defined(FFTW_SINGLE)
# define ALIGNMENT 8
# define ALIGNMENTA 16
# else
# define ALIGNMENT 16
# define ALIGNMENTA 16
# endif
#endif
#if HAVE_SIMD
# ifndef ALIGNMENT
# error "ALIGNMENT not defined"
# endif
# ifndef ALIGNMENTA
# error "ALIGNMENTA not defined"
# endif
#endif
/* rename for precision and for SIMD extensions */
#define XSIMD0(name, suffix) CONCAT(name, suffix)
#define XSIMD(name) XSIMD0(X(name), SIMD_SUFFIX)
#define XSIMD_STRING(x) x STRINGIZE(SIMD_SUFFIX)
/* TAINT_BIT is set if pointers are not guaranteed to be multiples of
ALIGNMENT */
#define TAINT_BIT 1
/* TAINT_BITA is set if pointers are not guaranteed to be multiples of
ALIGNMENTA */
#define TAINT_BITA 2
#define PTRINT(p) ((uintptr_t)(p))
#define ALIGNED(p) \
(((PTRINT(UNTAINT(p)) % ALIGNMENT) == 0) && !(PTRINT(p) & TAINT_BIT))
#define ALIGNEDA(p) \
(((PTRINT(UNTAINT(p)) % ALIGNMENTA) == 0) && !(PTRINT(p) & TAINT_BITA))
#define SIMD_STRIDE_OK(x) (!(((x) * sizeof(R)) % ALIGNMENT))
#define SIMD_STRIDE_OKA(x) (!(((x) * sizeof(R)) % ALIGNMENTA))
#define SIMD_VSTRIDE_OK SIMD_STRIDE_OK
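/* Worked example: in double precision on x86, ALIGNMENT == 16 and
sizeof(R) == 8, so SIMD_STRIDE_OK(x) holds exactly for even strides
(x * 8 bytes is then a multiple of 16); ALIGNED(p) additionally
requires that the pointer carry no TAINT_BIT. */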

View File

@@ -0,0 +1,288 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* Generic128d added by Romain Dolbeau, and turned into simd-generic128.h
* with single & double precision by Erik Lindahl.
* Romain Dolbeau hereby places his modifications in the public domain.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "Generic simd128 only works in single or double precision"
#endif
#define SIMD_SUFFIX _generic_simd128 /* for renaming */
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define VDUPL(x) (V){x[0],x[0],x[2],x[2]}
# define VDUPH(x) (V){x[1],x[1],x[3],x[3]}
# define DVK(var, val) V var = {val,val,val,val}
#else
# define DS(d,s) d /* double-precision option */
# define VDUPL(x) (V){x[0],x[0]}
# define VDUPH(x) (V){x[1],x[1]}
# define DVK(var, val) V var = {val, val}
#endif
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
typedef DS(double,float) V __attribute__ ((vector_size(16)));
#define VADD(a,b) ((a)+(b))
#define VSUB(a,b) ((a)-(b))
#define VMUL(a,b) ((a)*(b))
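/* V is a GCC/Clang vector_size type, so the operators above act
element-wise: in double precision V holds one complex number {re, im}
and VMUL({a,b}, {c,d}) == {a*c, b*d}; in single precision V holds two
complex numbers. */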
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
V res;
res[0] = x[0];
res[1] = x[1];
#ifdef FFTW_SINGLE
res[2] = x[ivs];
res[3] = x[ivs+1];
#endif
return res;
}
#ifdef FFTW_SINGLE
/* ST has to be separate due to the storage hack requiring reverse order */
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(x + ovs ) = v[2];
*(x + ovs + 1) = v[3];
*(x ) = v[0];
*(x + 1) = v[1];
}
#else
/* FFTW_DOUBLE */
# define ST STA
#endif
#ifdef FFTW_SINGLE
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
*(x ) = v0[0];
*(x + 1) = v1[0];
*(x + 2) = v2[0];
*(x + 3) = v3[0];
*(x + ovs ) = v0[1];
*(x + ovs + 1) = v1[1];
*(x + ovs + 2) = v2[1];
*(x + ovs + 3) = v3[1];
*(x + 2 * ovs ) = v0[2];
*(x + 2 * ovs + 1) = v1[2];
*(x + 2 * ovs + 2) = v2[2];
*(x + 2 * ovs + 3) = v3[2];
*(x + 3 * ovs ) = v0[3];
*(x + 3 * ovs + 1) = v1[3];
*(x + 3 * ovs + 2) = v2[3];
*(x + 3 * ovs + 3) = v3[3];
}
#define STM4(x, v, ovs, aligned_like) /* no-op */
#else
/* FFTW_DOUBLE */
#define STM2 STA
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
*(x) = v[0];
*(x+ovs) = v[1];
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
return (V){x[1],x[0],x[3],x[2]};
#else
return (V){x[1],x[0]};
#endif
}
static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
return (V){x[0],-x[1],x[2],-x[3]};
#else
return (V){x[0],-x[1]};
#endif
}
static inline V VBYI(V x)
{
x = VCONJ(x);
x = FLIP_RI(x);
return x;
}
/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_CEXP, v, x}, {TW_CEXP, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,333 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
*
* Generic256d added by Romain Dolbeau, and turned into simd-generic256.h
* with single & double precision by Erik Lindahl.
* Romain Dolbeau hereby places his modifications in the public domain.
* Erik Lindahl hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "Generic simd256 only works in single or double precision"
#endif
#define SIMD_SUFFIX _generic_simd256 /* for renaming */
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define VDUPL(x) {x[0],x[0],x[2],x[2],x[4],x[4],x[6],x[6]}
# define VDUPH(x) {x[1],x[1],x[3],x[3],x[5],x[5],x[7],x[7]}
# define DVK(var, val) V var = {val,val,val,val,val,val,val,val}
#else
# define DS(d,s) d /* double-precision option */
# define VDUPL(x) {x[0],x[0],x[2],x[2]}
# define VDUPH(x) {x[1],x[1],x[3],x[3]}
# define DVK(var, val) V var = {val, val, val, val}
#endif
#define VL DS(2,4) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
typedef DS(double,float) V __attribute__ ((vector_size(32)));
#define VADD(a,b) ((a)+(b))
#define VSUB(a,b) ((a)-(b))
#define VMUL(a,b) ((a)*(b))
#define LDK(x) x
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
(void)aligned_like; /* UNUSED */
var[0] = x[0];
var[1] = x[1];
var[2] = x[ivs];
var[3] = x[ivs+1];
#ifdef FFTW_SINGLE
var[4] = x[2*ivs];
var[5] = x[2*ivs+1];
var[6] = x[3*ivs];
var[7] = x[3*ivs+1];
#endif
return var;
}
/* ST has to be separate due to the storage hack requiring reverse order */
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
#ifdef FFTW_SINGLE
*(x + 3*ovs ) = v[6];
*(x + 3*ovs + 1) = v[7];
*(x + 2*ovs ) = v[4];
*(x + 2*ovs + 1) = v[5];
*(x + ovs ) = v[2];
*(x + ovs + 1) = v[3];
*(x ) = v[0];
*(x + 1) = v[1];
#else
*(x + ovs ) = v[2];
*(x + ovs + 1) = v[3];
*(x ) = v[0];
*(x + 1) = v[1];
#endif
}
#ifdef FFTW_SINGLE
#define STM2(x, v, ovs, a) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
x[ 0] = v0[0];
x[ 1] = v0[1];
x[ 2] = v1[0];
x[ 3] = v1[1];
x[ ovs ] = v0[2];
x[ ovs + 1] = v0[3];
x[ ovs + 2] = v1[2];
x[ ovs + 3] = v1[3];
x[2*ovs ] = v0[4];
x[2*ovs + 1] = v0[5];
x[2*ovs + 2] = v1[4];
x[2*ovs + 3] = v1[5];
x[3*ovs ] = v0[6];
x[3*ovs + 1] = v0[7];
x[3*ovs + 2] = v1[6];
x[3*ovs + 3] = v1[7];
}
# define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
*(x ) = v0[0];
*(x + 1) = v1[0];
*(x + 2) = v2[0];
*(x + 3) = v3[0];
*(x + ovs ) = v0[1];
*(x + ovs + 1) = v1[1];
*(x + ovs + 2) = v2[1];
*(x + ovs + 3) = v3[1];
*(x + 2 * ovs ) = v0[2];
*(x + 2 * ovs + 1) = v1[2];
*(x + 2 * ovs + 2) = v2[2];
*(x + 2 * ovs + 3) = v3[2];
*(x + 3 * ovs ) = v0[3];
*(x + 3 * ovs + 1) = v1[3];
*(x + 3 * ovs + 2) = v2[3];
*(x + 3 * ovs + 3) = v3[3];
*(x + 4 * ovs ) = v0[4];
*(x + 4 * ovs + 1) = v1[4];
*(x + 4 * ovs + 2) = v2[4];
*(x + 4 * ovs + 3) = v3[4];
*(x + 5 * ovs ) = v0[5];
*(x + 5 * ovs + 1) = v1[5];
*(x + 5 * ovs + 2) = v2[5];
*(x + 5 * ovs + 3) = v3[5];
*(x + 6 * ovs ) = v0[6];
*(x + 6 * ovs + 1) = v1[6];
*(x + 6 * ovs + 2) = v2[6];
*(x + 6 * ovs + 3) = v3[6];
*(x + 7 * ovs ) = v0[7];
*(x + 7 * ovs + 1) = v1[7];
*(x + 7 * ovs + 2) = v2[7];
*(x + 7 * ovs + 3) = v3[7];
}
#else
/* FFTW_DOUBLE */
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs) {
*(x ) = v0[0];
*(x + 1) = v1[0];
*(x + 2) = v2[0];
*(x + 3) = v3[0];
*(x + ovs ) = v0[1];
*(x + ovs + 1) = v1[1];
*(x + ovs + 2) = v2[1];
*(x + ovs + 3) = v3[1];
*(x + 2 * ovs ) = v0[2];
*(x + 2 * ovs + 1) = v1[2];
*(x + 2 * ovs + 2) = v2[2];
*(x + 2 * ovs + 3) = v3[2];
*(x + 3 * ovs ) = v0[3];
*(x + 3 * ovs + 1) = v1[3];
*(x + 3 * ovs + 2) = v2[3];
*(x + 3 * ovs + 3) = v3[3];
}
#endif
static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
return (V){x[1],x[0],x[3],x[2],x[5],x[4],x[7],x[6]};
#else
return (V){x[1],x[0],x[3],x[2]};
#endif
}
static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
return (x * (V){1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0});
#else
return (x * (V){1.0,-1.0,1.0,-1.0});
#endif
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,461 @@
/*
* Copyright (c) 2003, 2007-11 Matteo Frigo
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
*
* Knights Corner Vector Instruction support added by Romain Dolbeau.
* Romain Dolbeau hereby places his modifications in the public domain.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "Knights Corner vector instructions only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _ps
# define SCAL(x) x ## f
#else /* !FFTW_SINGLE */
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _pd
# define SCAL(x) x
#endif /* FFTW_SINGLE */
#define SIMD_SUFFIX _kcvi /* for renaming */
#define VL DS(4, 8) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
/* configuration; KNF 0 0 0 1 0 1 */
#define KCVI_VBYI_SINGLE_USE_MUL 0
#define KCVI_VBYI_DOUBLE_USE_MUL 0
#define KCVI_LD_DOUBLE_USE_UNPACK 1
#define KCVI_ST_DOUBLE_USE_PACK 1
#define KCVI_ST2_DOUBLE_USE_STN2 0
#define KCVI_MULZ_USE_SWIZZLE 1
#include <immintrin.h>
typedef DS(__m512d, __m512) V;
#define VADD(a,b) SUFF(_mm512_add)(a,b)
#define VSUB(a,b) SUFF(_mm512_sub)(a,b)
#define VMUL(a,b) SUFF(_mm512_mul)(a,b)
#define VFMA(a, b, c) SUFF(_mm512_fmadd)(a, b, c) //VADD(c, VMUL(a, b))
#define VFMS(a, b, c) SUFF(_mm512_fmsub)(a, b, c) //VSUB(VMUL(a, b), c)
#define VFNMS(a, b, c) SUFF(_mm512_fnmadd)(a, b, c) //VSUB(c, VMUL(a, b))
#define LDK(x) x
#define VLIT(re, im) SUFF(_mm512_setr4)(im, re, im, re)
#define DVK(var, val) V var = SUFF(_mm512_set1)(val)
static inline V LDA(const R *x, INT ivs, const R *aligned_like) {
return SUFF(_mm512_load)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
SUFF(_mm512_store)(x, v);
}
#ifdef FFTW_SINGLE
#define VXOR(a,b) _mm512_xor_epi32(a,b)
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
6 * ivs + 1, 6 * ivs,
5 * ivs + 1, 5 * ivs,
4 * ivs + 1, 4 * ivs,
3 * ivs + 1, 3 * ivs,
2 * ivs + 1, 2 * ivs,
1 * ivs + 1, 1 * ivs,
0 * ivs + 1, 0 * ivs);
return _mm512_i32gather_ps(index, x, _MM_SCALE_4);
}
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
6 * ovs + 1, 6 * ovs,
5 * ovs + 1, 5 * ovs,
4 * ovs + 1, 4 * ovs,
3 * ovs + 1, 3 * ovs,
2 * ovs + 1, 2 * ovs,
1 * ovs + 1, 1 * ovs,
0 * ovs + 1, 0 * ovs);
_mm512_i32scatter_ps(x, index, v, _MM_SCALE_4);
}
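/* A scalar picture of the strided gather above (an illustrative sketch,
kept under #if 0; the _ref name is invented): complex element k is read
from x[k*ivs] and x[k*ivs + 1] for k = 0..7, and STu scatters the same
pattern back with ovs. */
#if 0
static void LDu_scalar_ref(float v[16], const float *x, INT ivs)
{
    int k;
    for (k = 0; k < 8; ++k) {
        v[2 * k]     = x[k * ivs];      /* real part */
        v[2 * k + 1] = x[k * ivs + 1];  /* imaginary part */
    }
}
#endif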
static inline V FLIP_RI(V x)
{
return (V)_mm512_shuffle_epi32((__m512i)x, _MM_PERM_CDAB);
}
#define VDUPH(a) ((V)_mm512_shuffle_epi32((__m512i)(a), _MM_PERM_DDBB))
#define VDUPL(a) ((V)_mm512_shuffle_epi32((__m512i)(a), _MM_PERM_CCAA))
#else /* !FFTW_SINGLE */
#define VXOR(a,b) _mm512_xor_epi64(a,b)
#if defined (KCVI_LD_DOUBLE_USE_UNPACK) && KCVI_LD_DOUBLE_USE_UNPACK
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
V temp;
/* no need for hq here */
temp = _mm512_mask_loadunpacklo_pd(temp, 0x0003, x + (0 * ivs));
temp = _mm512_mask_loadunpacklo_pd(temp, 0x000c, x + (1 * ivs));
temp = _mm512_mask_loadunpacklo_pd(temp, 0x0030, x + (2 * ivs));
temp = _mm512_mask_loadunpacklo_pd(temp, 0x00c0, x + (3 * ivs));
return temp;
}
#else
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__declspec(align(64)) R temp[8];
int i;
for (i = 0 ; i < 4 ; i++) {
temp[i*2] = x[i * ivs];
temp[i*2+1] = x[i * ivs + 1];
}
return _mm512_load_pd(temp);
}
#endif
#if defined(KCVI_ST_DOUBLE_USE_PACK) && KCVI_ST_DOUBLE_USE_PACK
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* no need for hq here */
_mm512_mask_packstorelo_pd(x + (0 * ovs), 0x0003, v);
_mm512_mask_packstorelo_pd(x + (1 * ovs), 0x000c, v);
_mm512_mask_packstorelo_pd(x + (2 * ovs), 0x0030, v);
_mm512_mask_packstorelo_pd(x + (3 * ovs), 0x00c0, v);
}
#else
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__declspec(align(64)) R temp[8];
int i;
_mm512_store_pd(temp, v);
for (i = 0 ; i < 4 ; i++) {
x[i * ovs] = temp[i*2];
x[i * ovs + 1] = temp[i*2+1];
}
}
#endif
static inline V FLIP_RI(V x)
{
return (V)_mm512_shuffle_epi32((__m512i)x, _MM_PERM_BADC);
}
#define VDUPH(a) ((V)_mm512_shuffle_epi32((__m512i)(a), _MM_PERM_DCDC))
#define VDUPL(a) ((V)_mm512_shuffle_epi32((__m512i)(a), _MM_PERM_BABA))
#endif /* FFTW_SINGLE */
#define LD LDu
#define ST STu
#ifdef FFTW_SINGLE
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(15 * ovs, 14 * ovs,
13 * ovs, 12 * ovs,
11 * ovs, 10 * ovs,
9 * ovs, 8 * ovs,
7 * ovs, 6 * ovs,
5 * ovs, 4 * ovs,
3 * ovs, 2 * ovs,
1 * ovs, 0 * ovs);
_mm512_i32scatter_ps(x, index, v, _MM_SCALE_4);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#else /* !FFTW_SINGLE */
#if defined(KCVI_ST2_DOUBLE_USE_STN2) && KCVI_ST2_DOUBLE_USE_STN2
#define STM2(x, v, ovs, a) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs) {
/* we start
AB CD EF GH -> *x (2 DBL), ovs between complex
IJ KL MN OP -> *(x+2) (2DBL), ovs between complex
and we want
ABIJ EFMN -> *x (4 DBL), 2 * ovs between complex pairs
CDKL GHOP -> *(x+ovs) (4DBL), 2 * ovs between complex pairs
*/
V x00 = (V)_mm512_mask_permute4f128_epi32((__m512i)v0, 0xF0F0, (__m512i)v1, _MM_PERM_CDAB);
V x01 = (V)_mm512_mask_permute4f128_epi32((__m512i)v1, 0x0F0F, (__m512i)v0, _MM_PERM_CDAB);
_mm512_mask_packstorelo_pd(x + (0 * ovs) + 0, 0x000F, x00);
/* _mm512_mask_packstorehi_pd(x + (0 * ovs) + 8, 0x000F, x00); */
_mm512_mask_packstorelo_pd(x + (2 * ovs) + 0, 0x00F0, x00);
/* _mm512_mask_packstorehi_pd(x + (2 * ovs) + 8, 0x00F0, x00); */
_mm512_mask_packstorelo_pd(x + (1 * ovs) + 0, 0x000F, x01);
/* _mm512_mask_packstorehi_pd(x + (1 * ovs) + 8, 0x000F, x01); */
_mm512_mask_packstorelo_pd(x + (3 * ovs) + 0, 0x00F0, x01);
/* _mm512_mask_packstorehi_pd(x + (3 * ovs) + 8, 0x00F0, x01); */
}
#else
#define STM2(x, v, ovs, a) ST(x, v, ovs, a)
#define STN2(x, v0, v1, ovs) /* nop */
#endif
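/* A scalar picture of the packed STN2 variant above (an illustrative
sketch, kept under #if 0): complex k of v0 and complex k of v1 land
next to each other at x + k*ovs, k = 0..3. */
#if 0
static void STN2_scalar_ref(double *x, const double v0[8], const double v1[8], INT ovs)
{
    int k;
    for (k = 0; k < 4; ++k) {
        x[k * ovs + 0] = v0[2 * k];      /* re from v0 */
        x[k * ovs + 1] = v0[2 * k + 1];  /* im from v0 */
        x[k * ovs + 2] = v1[2 * k];      /* re from v1 */
        x[k * ovs + 3] = v1[2 * k + 1];  /* im from v1 */
    }
}
#endif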
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
__m512i index = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0,
7 * ovs, 6 * ovs,
5 * ovs, 4 * ovs,
3 * ovs, 2 * ovs,
1 * ovs, 0 * ovs);
_mm512_i32loscatter_pd(x, index, v, _MM_SCALE_8);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* no-op */
#endif /* FFTW_SINGLE */
static inline V VFMAI(V b, V c) {
V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
return SUFF(_mm512_fmadd)(mpmp, SUFF(_mm512_swizzle)(b, _MM_SWIZ_REG_CDAB), c);
}
static inline V VFNMSI(V b, V c) {
V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
return SUFF(_mm512_fnmadd)(mpmp, SUFF(_mm512_swizzle)(b, _MM_SWIZ_REG_CDAB), c);
}
static inline V VFMACONJ(V b, V c) {
V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
return SUFF(_mm512_fmadd)(pmpm, b, c);
}
static inline V VFMSCONJ(V b, V c) {
V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
return SUFF(_mm512_fmsub)(pmpm, b, c);
}
static inline V VFNMSCONJ(V b, V c) {
V pmpm = VLIT(SCAL(-1.0), SCAL(1.0));
return SUFF(_mm512_fnmadd)(pmpm, b, c);
}
static inline V VCONJ(V x)
{
V pmpm = VLIT(SCAL(-0.0), SCAL(0.0));
return (V)VXOR((__m512i)pmpm, (__m512i)x);
}
#ifdef FFTW_SINGLE
#if defined(KCVI_VBYI_SINGLE_USE_MUL) && KCVI_VBYI_SINGLE_USE_MUL
/* untested */
static inline V VBYI(V x)
{
V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
return _mm512_mul_ps(mpmp, _mm512_swizzle_ps(x, _MM_SWIZ_REG_CDAB));
}
#else
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
#endif
#else /* !FFTW_SINGLE */
#if defined(KCVI_VBYI_DOUBLE_USE_MUL) && KCVI_VBYI_DOUBLE_USE_MUL
/* on KNF, using mul_pd is slower than shuf128x32 + xor */
static inline V VBYI(V x)
{
V mpmp = VLIT(SCAL(1.0), SCAL(-1.0));
return _mm512_mul_pd(mpmp, _mm512_swizzle_pd(x, _MM_SWIZ_REG_CDAB));
}
#else
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
#endif
#endif /* FFTW_SINGLE */
#if defined(KCVI_MULZ_USE_SWIZZLE) && KCVI_MULZ_USE_SWIZZLE
static inline V VZMUL(V tx, V sr) /* (a,b) (c,d) */
{
V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
V ad = SUFF(_mm512_mul)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB)); /* (a*d,b*c) */
V acmbd = SUFF(_mm512_sub)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (a*c-b*d, b*d-a*c) */
V res = SUFF(_mm512_mask_add)(acmbd, DS(0x00aa,0xaaaa), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* ([a*c+b*c] a*c-b*d, b*c+a*d) */
return res;
}
static inline V VZMULJ(V tx, V sr) /* (a,b) (c,d) */
{
V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
V ad = SUFF(_mm512_mul)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB)); /* (a*d,b*c) */
V acmbd = SUFF(_mm512_add)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (a*c+b*d, b*d+a*c) */
V res = SUFF(_mm512_mask_subr)(acmbd, DS(0x00aa,0xaaaa), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* ([a*c+b*c] a*c+b*d, a*d-b*c) */
return res;
}
static inline V VZMULI(V tx, V sr) /* (a,b) (c,d) */
{
DVK(zero, SCAL(0.0));
V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
V ad = SUFF(_mm512_fnmadd)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB), zero); /* (-a*d,-b*c) */
V acmbd = SUFF(_mm512_subr)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (b*d-a*c, a*c-b*d) */
V res = SUFF(_mm512_mask_add)(acmbd, DS(0x0055,0x5555), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* (-a*d-b*c, a*c-b*d) */
return res;
}
static inline V VZMULIJ(V tx, V sr) /* (a,b) (c,d) */
{
DVK(zero, SCAL(0.0));
V ac = SUFF(_mm512_mul)(tx, sr); /* (a*c,b*d) */
V ad = SUFF(_mm512_fnmadd)(tx, SUFF(_mm512_swizzle)(sr, _MM_SWIZ_REG_CDAB), zero); /* (-a*d,-b*c) */
V acmbd = SUFF(_mm512_add)(ac, SUFF(_mm512_swizzle)(ac, _MM_SWIZ_REG_CDAB)); /* (b*d+a*c, a*c+b*d) */
V res = SUFF(_mm512_mask_sub)(acmbd, DS(0x0055,0x5555), ad, SUFF(_mm512_swizzle)(ad, _MM_SWIZ_REG_CDAB)); /* (-a*d+b*c, a*c-b*d) */
return res;
}
#else
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
#endif
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}, {TW_CEXP, v+4, x}, {TW_CEXP, v+5, x}, {TW_CEXP, v+6, x}, {TW_CEXP, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL1 (VL)
static inline V BYTW1(const R *t, V sr)
{
return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
return VZMULJ(LDA(t, 2, t), sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v , x}, {TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+5, x}, \
{TW_COS, v+6, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, {TW_COS, v+7, x}, \
{TW_SIN, v , -x}, {TW_SIN, v , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, -x}, {TW_SIN, v+4, x}, {TW_SIN, v+5, -x}, {TW_SIN, v+5, x}, \
{TW_SIN, v+6, -x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, -x}, {TW_SIN, v+7, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v , x}, {TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
{TW_SIN, v , -x}, {TW_SIN, v , x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
{TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#endif /* FFTW_SINGLE */
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
/* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
/* V tr = LD(t, 2, t), ti = LD(t + VL, 2, t + VL); */
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#define VTW3(v,x) VTW1(v,x)
#define TWVL3 TWVL1
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v , x}, {TW_COS, v+1 , x}, {TW_COS, v+2 , x}, {TW_COS, v+3 , x}, \
{TW_COS, v+4 , x}, {TW_COS, v+5 , x}, {TW_COS, v+6 , x}, {TW_COS, v+7 , x}, \
{TW_COS, v+8 , x}, {TW_COS, v+9 , x}, {TW_COS, v+10, x}, {TW_COS, v+11, x}, \
{TW_COS, v+12, x}, {TW_COS, v+13, x}, {TW_COS, v+14, x}, {TW_COS, v+15, x}, \
{TW_SIN, v , x}, {TW_SIN, v+1 , x}, {TW_SIN, v+2 , x}, {TW_SIN, v+3 , x}, \
{TW_SIN, v+4 , x}, {TW_SIN, v+5 , x}, {TW_SIN, v+6 , x}, {TW_SIN, v+7 , x}, \
{TW_SIN, v+8 , x}, {TW_SIN, v+9 , x}, {TW_SIN, v+10, x}, {TW_SIN, v+11, x}, \
{TW_SIN, v+12, x}, {TW_SIN, v+13, x}, {TW_SIN, v+14, x}, {TW_SIN, v+15, x}
#else /* !FFTW_SINGLE */
# define VTWS(v,x) \
{TW_COS, v , x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
{TW_SIN, v , x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
{TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#endif /* FFTW_SINGLE */
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,335 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* Double-precision support added by Romain Dolbeau.
* Romain Dolbeau hereby places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if !defined(FFTW_SINGLE) && !defined(__aarch64__)
#error "NEON only works in single precision on 32-bit ARM"
#endif
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "NEON only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _f32
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _f64
#endif
/* define these unconditionally, because they are used by
taint.c which is compiled without neon */
#define SIMD_SUFFIX _neon /* for renaming */
#define VL DS(1,2) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(__ARM_NEON__) && !defined(__ARM_NEON)
#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
#endif
#include <arm_neon.h>
/* FIXME: I am not sure whether this code assumes little-endian
ordering. VLIT may or may not be wrong for big-endian systems. */
typedef DS(float64x2_t, float32x4_t) V;
#ifdef FFTW_SINGLE
# define VLIT(x0, x1) {x0, x1, x0, x1}
#else
# define VLIT(x0, x1) {x0, x1}
#endif
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val)
/* NEON has FMA, but a three-operand FMA is not too useful
for FFT purposes. We normally compute
t0=a+b*c
t1=a-b*c
In a three-operand instruction set this translates into
t0=a
t0+=b*c
t1=a
t1-=b*c
At least one move is needed, negating the advantage of the FMA in the
first place. At least some versions of gcc generate both moves, so we
are better off generating t=b*c; t0=a+t; t1=a-t. */
#if ARCH_PREFERS_FMA
#warning "--enable-fma on NEON is probably a bad idea (see source code)"
#endif
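/* A scalar sketch of the trade-off described above (illustrative, kept
under #if 0): computing b*c once avoids any extra register copy of a. */
#if 0
static void butterfly_ref(double a, double b, double c, double *t0, double *t1)
{
    double t = b * c;   /* one multiply shared by both outputs */
    *t0 = a + t;
    *t1 = a - t;
}
#endif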
#define VADD(a, b) SUFF(vaddq)(a, b)
#define VSUB(a, b) SUFF(vsubq)(a, b)
#define VMUL(a, b) SUFF(vmulq)(a, b)
#define VFMA(a, b, c) SUFF(vmlaq)(c, a, b) /* a*b+c */
#define VFNMS(a, b, c) SUFF(vmlsq)(c, a, b) /* FNMS=-(a*b-c) in powerpc terminology; MLS=c-a*b
in ARM terminology */
#define VFMS(a, b, c) VSUB(VMUL(a, b), c) /* FMS=a*b-c in powerpc terminology; no equivalent
arm instruction (?) */
#define STOREH(a, v) SUFF(vst1)((a), SUFF(vget_high)(v))
#define STOREL(a, v) SUFF(vst1)((a), SUFF(vget_low)(v))
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
return SUFF(vld1q)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
SUFF(vst1q)(x, v);
}
#ifdef FFTW_SINGLE
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
return SUFF(vcombine)(SUFF(vld1)(x), SUFF(vld1)((x + ivs)));
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon store-low occurring
after store-high */
STOREH(x + ovs, v);
STOREL(x,v);
}
#else /* !FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
/* 2x2 complex transpose and store */
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
/* store and 4x4 real transpose */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void) aligned_like; /* UNUSED */
SUFF(vst1_lane)((x) , SUFF(vget_low)(v), 0);
SUFF(vst1_lane)((x + ovs), SUFF(vget_low)(v), 1);
SUFF(vst1_lane)((x + 2 * ovs), SUFF(vget_high)(v), 0);
SUFF(vst1_lane)((x + 3 * ovs), SUFF(vget_high)(v), 1);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* use STM4 */
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
STOREL(x, v);
STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
#ifdef FFTW_SINGLE
#define FLIP_RI(x) SUFF(vrev64q)(x)
#else
/* FIXME */
#define FLIP_RI(x) SUFF(vcombine)(SUFF(vget_high)(x), SUFF(vget_low)(x))
#endif
static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
#else
static const uint64x2_t pm = {0, 0x8000000000000000ull};
/* Gcc-4.9.2 still does not include vreinterpretq_f64_u64, but simple
* casts generate the correct assembly.
*/
return (float64x2_t)(veorq_u64((uint64x2_t)(x), (uint64x2_t)(pm)));
#endif
}
static inline V VBYI(V x)
{
return FLIP_RI(VCONJ(x));
}
static inline V VFMAI(V b, V c)
{
const V mp = VLIT(-1.0, 1.0);
return VFMA(FLIP_RI(b), mp, c);
}
static inline V VFNMSI(V b, V c)
{
const V mp = VLIT(-1.0, 1.0);
return VFNMS(FLIP_RI(b), mp, c);
}
static inline V VFMACONJ(V b, V c)
{
const V pm = VLIT(1.0, -1.0);
return VFMA(b, pm, c);
}
static inline V VFNMSCONJ(V b, V c)
{
const V pm = VLIT(1.0, -1.0);
return VFNMS(b, pm, c);
}
static inline V VFMSCONJ(V b, V c)
{
return VSUB(VCONJ(b), c);
}
#ifdef FFTW_SINGLE
#if 1
#define VEXTRACT_REIM(tr, ti, tx) \
{ \
tr = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 0), \
SUFF(vdup_lane)(SUFF(vget_high)(tx), 0)); \
ti = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 1), \
SUFF(vdup_lane)(SUFF(vget_high)(tx), 1)); \
}
#else
/* this alternative might be faster in an ideal world, but gcc likes
to spill VVV onto the stack */
#define VEXTRACT_REIM(tr, ti, tx) \
{ \
float32x4x2_t vvv = SUFF(vtrnq)(tx, tx); \
tr = vvv.val[0]; \
ti = vvv.val[1]; \
}
#endif
#else
#define VEXTRACT_REIM(tr, ti, tx) \
{ \
tr = SUFF(vtrn1q)(tx, tx); \
ti = SUFF(vtrn2q)(tx, tx); \
}
#endif
static inline V VZMUL(V tx, V sr)
{
V tr, ti;
VEXTRACT_REIM(tr, ti, tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr, ti;
VEXTRACT_REIM(tr, ti, tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr, ti;
VEXTRACT_REIM(tr, ti, tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr, ti;
VEXTRACT_REIM(tr, ti, tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
#define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
#define VTW1(v,x) {TW_CEXP, v, x}
#endif
#define TWVL1 VL
static inline V BYTW1(const R *t, V sr)
{
V tx = LDA(t, 2, 0);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LDA(t, 2, 0);
return VZMULJ(tx, sr);
}
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
V si = FLIP_RI(sr);
V tr = LDA(t, 2, 0), ti = LDA(t+2*VL, 2, 0);
return VFMA(ti, si, VMUL(tr, sr));
}
static inline V BYTWJ2(const R *t, V sr)
{
V si = FLIP_RI(sr);
V tr = LDA(t, 2, 0), ti = LDA(t+2*VL, 2, 0);
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
# define VTW3(v,x) {TW_CEXP, v, x}
#endif
# define TWVL3 (VL)
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,380 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "SSE/SSE2 only works in single/double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _sse2 /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#if defined(__GNUC__) && !defined(FFTW_SINGLE) && !defined(__SSE2__)
# error "compiling simd-sse2.h in double precision without -msse2"
#elif defined(__GNUC__) && defined(FFTW_SINGLE) && !defined(__SSE__)
# error "compiling simd-sse2.h in single precision without -msse"
#endif
#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif
/* some versions of glibc's sys/cdefs.h define __inline to be empty,
which is wrong because emmintrin.h defines several inline
procedures */
#ifndef _MSC_VER
#undef __inline
#endif
#ifdef FFTW_SINGLE
# include <xmmintrin.h>
#else
# include <emmintrin.h>
#endif
typedef DS(__m128d,__m128) V;
#define VADD SUFF(_mm_add_p)
#define VSUB SUFF(_mm_sub_p)
#define VMUL SUFF(_mm_mul_p)
#define VXOR SUFF(_mm_xor_p)
#define SHUF SUFF(_mm_shuffle_p)
#define UNPCKL SUFF(_mm_unpacklo_p)
#define UNPCKH SUFF(_mm_unpackhi_p)
#define SHUFVALS(fp0,fp1,fp2,fp3) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define VDUPL(x) DS(UNPCKL(x, x), SHUF(x, x, SHUFVALS(0, 0, 2, 2)))
#define VDUPH(x) DS(UNPCKH(x, x), SHUF(x, x, SHUFVALS(1, 1, 3, 3)))
#define STOREH(a, v) DS(_mm_storeh_pd(a, v), _mm_storeh_pi((__m64 *)(a), v))
#define STOREL(a, v) DS(_mm_storel_pd(a, v), _mm_storel_pi((__m64 *)(a), v))
#ifdef __GNUC__
/*
* gcc-3.3 generates slow code for mm_set_ps (write all elements to
* the stack and load __m128 from the stack).
*
* gcc-3.[34] generates slow code for mm_set_ps1 (load into low element
* and shuffle).
*
* This hack forces gcc to generate a constant __m128 at compile time.
*/
union rvec {
R r[DS(2,4)];
V v;
};
# ifdef FFTW_SINGLE
# define DVK(var, val) V var = __extension__ ({ \
static const union rvec _var = { {val,val,val,val} }; _var.v; })
# else
# define DVK(var, val) V var = __extension__ ({ \
static const union rvec _var = { {val,val} }; _var.v; })
# endif
# define LDK(x) x
#else
# define DVK(var, val) const R var = K(val)
# define LDK(x) DS(_mm_set1_pd,_mm_set_ps1)(x)
#endif
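/* Illustrative use of the DVK/LDK pair defined above, in the style of
the generated codelets (a sketch kept under #if 0; the constant name
follows the codelet convention for 0.5 but is written by hand here): */
#if 0
static inline V scale_by_half_example(V x)
{
    DVK(KP500000000, 0.5);
    return VMUL(LDK(KP500000000), x);
}
#endif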
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ivs; /* UNUSED */
return *(const V *)x;
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
(void)ovs; /* UNUSED */
*(V *)x = v;
}
#ifdef FFTW_SINGLE
# ifdef _MSC_VER
/* Temporarily disable the warning "uninitialized local variable
'name' used" and the runtime checks for using a variable before it is
defined, which are erroneously triggered by the LOADL0 / LOADH macros
because each of them modifies only part of VAL. */
# ifndef __INTEL_COMPILER
# pragma warning(disable : 4700)
# pragma runtime_checks("u", off)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(disable : 592)
# endif
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
V var;
(void)aligned_like; /* UNUSED */
# ifdef __GNUC__
/* We use inline asm because gcc-3.x generates slow code for
_mm_loadh_pi(). gcc-3.x insists upon having an existing variable for
VAL, which is however never used. Thus, it generates code to move
values in and out the variable. Worse still, gcc-4.0 stores VAL on
the stack, causing valgrind to complain about uninitialized reads. */
__asm__("movlps %1, %0\n\tmovhps %2, %0"
: "=x"(var) : "m"(x[0]), "m"(x[ivs]));
# else
# define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
# define LOADL0(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
var = LOADL0(x, var);
var = LOADH(x + ivs, var);
# endif
return var;
}
# ifdef _MSC_VER
# ifndef __INTEL_COMPILER
# pragma warning(default : 4700)
# pragma runtime_checks("u", restore)
# endif
# endif
# ifdef __INTEL_COMPILER
# pragma warning(default : 592)
# endif
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
/* WARNING: the extra_iter hack depends upon STOREL occurring
after STOREH */
STOREH(x + ovs, v);
STOREL(x, v);
}
#else /* ! FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
/* STN4 is a macro, not a function, thanks to Visual C++ developers
deciding "it would be infrequent that people would want to pass more
than 3 [__m128 parameters] by value." 3 parameters ought to be enough
for anybody. */
# define STN4(x, v0, v1, v2, v3, ovs) \
{ \
V xxx0, xxx1, xxx2, xxx3; \
xxx0 = UNPCKL(v0, v2); \
xxx1 = UNPCKH(v0, v2); \
xxx2 = UNPCKL(v1, v3); \
xxx3 = UNPCKH(v1, v3); \
STA(x, UNPCKL(xxx0, xxx2), 0, 0); \
STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0); \
STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0); \
STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0); \
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
STOREL(x, v);
STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
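/* A scalar picture of the single-precision STN4 above (an illustrative
sketch, kept under #if 0): it performs a 4x4 real transpose, so output
row i receives element i of each input vector. */
#if 0
static void STN4_scalar_ref(float *x, const float v[4][4], INT ovs)
{
    int i, j;
    for (i = 0; i < 4; ++i)
        for (j = 0; j < 4; ++j)
            x[i * ovs + j] = v[j][i];
}
#endif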
static inline V FLIP_RI(V x)
{
return SHUF(x, x, DS(1, SHUFVALS(1, 0, 3, 2)));
}
static inline V VCONJ(V x)
{
/* This will produce -0.0f (or -0.0d) even on broken
compilers that do not distinguish +0.0 from -0.0.
I bet some are still around. */
union uvec {
unsigned u[4];
V v;
};
/* it looks like gcc-3.3.5 produces slow code unless PM is
declared static. */
static const union uvec pm = {
#ifdef FFTW_SINGLE
{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
{ 0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
};
return VXOR(pm.v, x);
}
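/* The scalar idea behind the XOR above (an illustrative sketch, kept
under #if 0): toggling the sign bit negates a value with pure integer
operations, so the -0.0 mask works even if the compiler cannot tell
+0.0 from -0.0. */
#if 0
#include <stdint.h>
#include <string.h>
static double negate_via_signbit(double x)
{
    uint64_t u;
    memcpy(&u, &x, sizeof u);
    u ^= UINT64_C(0x8000000000000000);  /* flip bit 63, the sign bit */
    memcpy(&x, &u, sizeof u);
    return x;
}
#endif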
static inline V VBYI(V x)
{
x = VCONJ(x);
x = FLIP_RI(x);
return x;
}
/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V BYTWJ1(const R *t, V sr)
{
const V *twp = (const V *)t;
V tx = twp[0];
V tr = UNPCKL(tx, tx);
V ti = UNPCKH(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
const V *twp = (const V *)t;
V si = FLIP_RI(sr);
V tr = twp[0], ti = twp[1];
return VFNMS(ti, si, VMUL(tr, sr));
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,299 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* VSX SIMD implementation added 2015 Erik Lindahl.
* Erik Lindahl places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "VSX only works in single or double precision"
#endif
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif
#define SIMD_SUFFIX _vsx /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(SIMD_STRIDE_OKA(x),((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
#include <altivec.h>
#include <stdio.h>
typedef DS(vector double,vector float) V;
#define VADD(a,b) vec_add(a,b)
#define VSUB(a,b) vec_sub(a,b)
#define VMUL(a,b) vec_mul(a,b)
#define VXOR(a,b) vec_xor(a,b)
#define UNPCKL(a,b) vec_mergel(a,b)
#define UNPCKH(a,b) vec_mergeh(a,b)
#ifdef FFTW_SINGLE
# define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11}; vec_perm(a,a,perm); })
# define VDUPH(a) ({ const vector unsigned char perm = {4,5,6,7,4,5,6,7,12,13,14,15,12,13,14,15}; vec_perm(a,a,perm); })
#else
# define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}; vec_perm(a,a,perm); })
# define VDUPH(a) ({ const vector unsigned char perm = {8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15}; vec_perm(a,a,perm); })
#endif
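/* The net effect of the permutations above, in scalar form (an
illustrative sketch, kept under #if 0), single precision: VDUPL
broadcasts the real part of each complex number, VDUPH the imaginary
part. */
#if 0
static void VDUPL_scalar_ref(float dst[4], const float a[4])
{
    dst[0] = a[0]; dst[1] = a[0];   /* re of first complex, duplicated */
    dst[2] = a[2]; dst[3] = a[2];   /* re of second complex, duplicated */
}
#endif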
static inline V LDK(R f) { return vec_splats(f); }
#define DVK(var, val) const R var = K(val)
static inline V VCONJ(V x)
{
const V pmpm = vec_mergel(vec_splats((R)0.0),-(vec_splats((R)0.0)));
return vec_xor(x, pmpm);
}
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
#ifdef __ibmxl__
return vec_xl(0,(DS(double,float) *)x);
#else
return (*(const V *)(x));
#endif
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
#ifdef __ibmxl__
vec_xst(v,0,x);
#else
*(V *)x = v;
#endif
}
static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
const vector unsigned char perm = { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11 };
#else
const vector unsigned char perm = { 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 };
#endif
return vec_perm(x,x,perm);
}
#ifdef FFTW_SINGLE
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
const vector unsigned char perm = {0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
return vec_perm((vector float)vec_splats(*(double *)(x)),
(vector float)vec_splats(*(double *)(x+ivs)),perm);
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
*(double *)(x+ovs) = vec_extract( (vector double)v, 1 );
*(double *)x = vec_extract( (vector double)v, 0 );
}
#else
/* DOUBLE */
# define LD LDA
# define ST STA
#endif
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */
#ifdef FFTW_SINGLE
# define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
V xxx0, xxx1, xxx2, xxx3;
xxx0 = vec_mergeh(v0,v1);
xxx1 = vec_mergel(v0,v1);
xxx2 = vec_mergeh(v2,v3);
xxx3 = vec_mergel(v2,v3);
*(double *)x = vec_extract( (vector double)xxx0, 0 );
*(double *)(x+ovs) = vec_extract( (vector double)xxx0, 1 );
*(double *)(x+2*ovs) = vec_extract( (vector double)xxx1, 0 );
*(double *)(x+3*ovs) = vec_extract( (vector double)xxx1, 1 );
*(double *)(x+2) = vec_extract( (vector double)xxx2, 0 );
*(double *)(x+ovs+2) = vec_extract( (vector double)xxx2, 1 );
*(double *)(x+2*ovs+2) = vec_extract( (vector double)xxx3, 0 );
*(double *)(x+3*ovs+2) = vec_extract( (vector double)xxx3, 1 );
}
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
(void)aligned_like; /* UNUSED */
x[0] = vec_extract(v,0);
x[ovs] = vec_extract(v,1);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
static inline V VBYI(V x)
{
/* FIXME [matteof 2017-09-21] It is possible to use vpermxor(),
but gcc and xlc treat the permutation bits differently, and
gcc-6 seems to generate incorrect code when using
__builtin_crypto_vpermxor() (i.e., VBYI() works for a small
test case but fails in the large).
Punt on vpermxor() for now and do the simple thing.
*/
return FLIP_RI(VCONJ(x));
}
/* FMA support */
#define VFMA(a, b, c) vec_madd(a,b,c)
#define VFNMS(a, b, c) vec_nmsub(a,b,c)
#define VFMS(a, b, c) vec_msub(a,b,c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
static inline V VZMUL(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V VZMULJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
tr = VMUL(sr, tr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
static inline V VZMULI(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMS(tr, sr, ti);
}
static inline V VZMULIJ(V tx, V sr)
{
V tr = VDUPL(tx);
V ti = VDUPH(tx);
ti = VMUL(ti, sr);
sr = VBYI(sr);
return VFMA(tr, sr, ti);
}
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LDA(t,0,t);
V tr = UNPCKH(tx, tx);
V ti = UNPCKL(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFMA(ti, sr, tr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LDA(t,0,t);
V tr = UNPCKH(tx, tx);
V ti = UNPCKL(tx, tx);
tr = VMUL(tr, sr);
sr = VBYI(sr);
return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
V tx = LD(t, 1, t);
return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
{TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
{TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
V si = FLIP_RI(sr);
V ti = LDA(t+2*VL,0,t);
V tt = VMUL(ti, si);
V tr = LDA(t,0,t);
return VFMA(tr, sr, tt);
}
static inline V BYTWJ2(const R *t, V sr)
{
V si = FLIP_RI(sr);
V tr = LDA(t,0,t);
V tt = VMUL(tr, sr);
V ti = LDA(t+2*VL,0,t);
return VFNMS(ti, si, tt);
}
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif
/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
{TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
{TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)
#define VLEAVE() /* nothing */
#include "simd-common.h"

View File

@@ -0,0 +1,89 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
#else
# define DS(d,s) d /* double-precision option */
#endif
#if HAVE_SSE2
# if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
int X(have_simd_sse2)(void)
{
return 1;
}
# else /* !x86_64 */
# include <signal.h>
# include <setjmp.h>
# include "x86-cpuid.h"
static jmp_buf jb;
static void sighandler(int x)
{
UNUSED(x);
longjmp(jb, 1);
}
static int sse2_works(void)
{
void (*oldsig)(int);
oldsig = signal(SIGILL, sighandler);
if (setjmp(jb)) {
signal(SIGILL, oldsig);
return 0;
} else {
# ifdef _MSC_VER
_asm { DS(xorpd,xorps) xmm0,xmm0 }
# else
/* asm volatile ("xorpd/s %xmm0, %xmm0"); */
asm volatile(DS(".byte 0x66; .byte 0x0f; .byte 0x57; .byte 0xc0",
".byte 0x0f; .byte 0x57; .byte 0xc0"));
# endif
signal(SIGILL, oldsig);
return 1;
}
}
int X(have_simd_sse2)(void)
{
static int init = 0, res;
if (!init) {
res = !is_386()
&& has_cpuid()
&& (cpuid_edx(1) & (1 << DS(26,25)))
&& sse2_works();
init = 1;
}
return res;
}
# endif
#endif
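/* For reference (illustrative, kept under #if 0): the feature bits
selected by DS(26,25) above are the CPUID leaf-1 EDX bits for SSE2 and
SSE respectively. */
#if 0
#define CPUID_EDX_SSE_BIT  25   /* single precision needs SSE  */
#define CPUID_EDX_SSE2_BIT 26   /* double precision needs SSE2 */
#endif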

View File

@@ -0,0 +1,43 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#include "simd-common.h"
#if HAVE_SIMD
R *X(taint)(R *p, INT s)
{
if (((unsigned)s * sizeof(R)) % ALIGNMENT)
p = (R *) (PTRINT(p) | TAINT_BIT);
if (((unsigned)s * sizeof(R)) % ALIGNMENTA)
p = (R *) (PTRINT(p) | TAINT_BITA);
return p;
}
/* join the taint of two pointers that are supposed to be
the same modulo the taint */
R *X(join_taint)(R *p1, R *p2)
{
A(UNTAINT(p1) == UNTAINT(p2));
return (R *)(PTRINT(p1) | PTRINT(p2));
}
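/* An illustrative sketch of the low-bit tainting idea (kept under
#if 0; it assumes TAINT_BIT and TAINT_BITA are the two low-order
pointer bits, which genuinely aligned arrays never have set): */
#if 0
#include <stdint.h>
static double *untaint_sketch(double *p)
{
    return (double *)((uintptr_t)p & ~(uintptr_t)3);  /* clear both flag bits */
}
static int is_tainted_sketch(const double *p)
{
    return ((uintptr_t)p & 3) != 0;                   /* either flag set? */
}
#endif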
#endif

View File

@@ -0,0 +1,69 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* VSX SIMD implementation added 2015 Erik Lindahl.
* Erik Lindahl places his modifications in the public domain.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "kernel/ifftw.h"
#if HAVE_VSX
#if HAVE_SYS_SYSCTL_H
# include <sys/sysctl.h>
#endif
#include <signal.h>
#include <setjmp.h>
static jmp_buf jb;
static void sighandler(int x)
{
longjmp(jb, 1);
}
static int really_have_vsx(void)
{
void (*oldsig)(int);
oldsig = signal(SIGILL, sighandler);
if (setjmp(jb)) {
signal(SIGILL, oldsig);
return 0;
} else {
float mem[2];
__asm__ __volatile__ ("stxsdx 0,0,%0" :: "r" (mem) : "memory" );
signal(SIGILL, oldsig);
return 1;
}
return 0;
}
int X(have_simd_vsx)(void)
{
static int init = 0, res;
if (!init) {
res = really_have_vsx();
init = 1;
}
return res;
}
#endif

View File

@@ -0,0 +1,212 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* this code was kindly donated by Eric J. Korpela */
#ifdef _MSC_VER
#include <intrin.h>
#ifndef inline
#define inline __inline
#endif
#endif
static inline int is_386()
{
#ifdef _MSC_VER
unsigned int result,tst;
_asm {
pushfd
pop eax
mov edx,eax
xor eax,40000h
push eax
popfd
pushfd
pop eax
push edx
popfd
mov tst,edx
mov result,eax
}
#else
register unsigned int result,tst;
__asm__ (
"pushfl\n\t"
"popl %0\n\t"
"movl %0,%1\n\t"
"xorl $0x40000,%0\n\t"
"pushl %0\n\t"
"popfl\n\t"
"pushfl\n\t"
"popl %0\n\t"
"pushl %1\n\t"
"popfl"
: "=r" (result), "=r" (tst) /* output */
: /* no inputs */
);
#endif
return (result == tst);
}
static inline int has_cpuid()
{
#ifdef _MSC_VER
unsigned int result,tst;
_asm {
pushfd
pop eax
mov edx,eax
xor eax,200000h
push eax
popfd
pushfd
pop eax
push edx
popfd
mov tst,edx
mov result,eax
}
#else
register unsigned int result,tst;
__asm__ (
"pushfl\n\t"
"pop %0\n\t"
"movl %0,%1\n\t"
"xorl $0x200000,%0\n\t"
"pushl %0\n\t"
"popfl\n\t"
"pushfl\n\t"
"popl %0\n\t"
"pushl %1\n\t"
"popfl"
: "=r" (result), "=r" (tst) /* output */
: /* no inputs */
);
#endif
return (result != tst);
}
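/* For reference (illustrative, kept under #if 0): the two EFLAGS masks
toggled above. */
#if 0
#define EFLAGS_AC 0x040000   /* bit 18, alignment check: not writable on a 386 */
#define EFLAGS_ID 0x200000   /* bit 21, writable only when CPUID is available */
#endif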
/* cpuid version to get all registers. Donated by Erik Lindahl from Gromacs. */
static inline void
cpuid_all(int level, int ecxval, int *eax, int *ebx, int *ecx, int *edx)
{
#if (defined _MSC_VER)
int CPUInfo[4];
# if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
/* MSVC 9.0 SP1 or later */
__cpuidex(CPUInfo, level, ecxval);
# else
__cpuid(CPUInfo, level);
/* Without __cpuidex we cannot honor a non-zero ecxval here; the requested subleaf is ignored */
# endif
*eax = CPUInfo[0];
*ebx = CPUInfo[1];
*ecx = CPUInfo[2];
*edx = CPUInfo[3];
#else
/* Not MSVC */
*eax = level;
*ecx = ecxval;
*ebx = 0;
*edx = 0;
/* Avoid clobbering global offset table in 32-bit pic code (ebx) */
# if defined(__PIC__)
__asm__ ("xchgl %%ebx, %1 \n\t"
"cpuid \n\t"
"xchgl %%ebx, %1 \n\t"
: "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx));
# else
/* No need to save ebx if we are not in pic mode */
__asm__ ("cpuid \n\t"
: "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
# endif
#endif
}
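/* An illustrative use of cpuid_all() (a sketch kept under #if 0): query
the structured extended feature leaf 7, subleaf 0, whose EBX bit 5
reports AVX2. */
#if 0
static inline int cpu_reports_avx2(void)
{
    int eax, ebx, ecx, edx;
    cpuid_all(7, 0, &eax, &ebx, &ecx, &edx);
    return (ebx >> 5) & 1;
}
#endif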
static inline int cpuid_edx(int op)
{
# ifdef _MSC_VER
int result;
_asm {
push ebx
mov eax,op
cpuid
mov result,edx
pop ebx
}
return result;
# else
int eax, ecx, edx;
__asm__("push %%ebx\n\tcpuid\n\tpop %%ebx"
: "=a" (eax), "=c" (ecx), "=d" (edx)
: "a" (op));
return edx;
# endif
}
static inline int cpuid_ecx(int op)
{
# ifdef _MSC_VER
int result;
_asm {
push ebx
mov eax,op
cpuid
mov result,ecx
pop ebx
}
return result;
# else
int eax, ecx, edx;
__asm__("push %%ebx\n\tcpuid\n\tpop %%ebx"
: "=a" (eax), "=c" (ecx), "=d" (edx)
: "a" (op));
return ecx;
# endif
}
static inline int xgetbv_eax(int op)
{
# ifdef _MSC_VER
int veax, vedx;
_asm {
mov ecx,op
# if defined(__INTEL_COMPILER) || (_MSC_VER >= 1600)
xgetbv
# else
__emit 15
__emit 1
__emit 208
# endif
mov veax,eax
mov vedx,edx
}
return veax;
# else
int eax, edx;
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (op));
return eax;
#endif
}
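/* An illustrative combination of cpuid_ecx() and xgetbv_eax() (a sketch
kept under #if 0), the usual AVX availability check: besides the CPU
flags, the OS must have enabled XMM and YMM state saving (XCR0 bits 1
and 2). */
#if 0
static inline int os_supports_avx_sketch(void)
{
    return (cpuid_ecx(1) & (1 << 27))       /* OSXSAVE */
        && (cpuid_ecx(1) & (1 << 28))       /* AVX */
        && ((xgetbv_eax(0) & 0x6) == 0x6);  /* XMM + YMM state enabled */
}
#endif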