Updates

2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions
--- a/fftw-3.3.10/libbench2/Makefile.am
+++ b/fftw-3.3.10/libbench2/Makefile.am
@@ -0,0 +1,18 @@
+AM_CPPFLAGS = -I $(top_srcdir)
+noinst_LIBRARIES=libbench2.a
+## Sources of the noinst library libbench2.a; its headers are listed here as well.
+libbench2_a_SOURCES=after-ccopy-from.c after-ccopy-to.c			\
+after-hccopy-from.c after-hccopy-to.c after-rcopy-from.c		\
+after-rcopy-to.c allocate.c aset.c bench-cost-postprocess.c		\
+bench-exit.c bench-main.c can-do.c caset.c dotens2.c info.c main.c	\
+mflops.c mp.c ovtpvt.c pow2.c problem.c report.c speed.c tensor.c	\
+timer.c useropt.c util.c verify-dft.c verify-lib.c verify-r2r.c		\
+verify-rdft2.c verify.c zero.c bench-user.h bench.h verify.h		\
+my-getopt.c my-getopt.h
+# Placeholder target: builds "all", then only prints that there is nothing to benchmark.
+benchmark: all
+	@echo "nothing to benchmark"
+# Same placeholder for "accuracy"; NOTE(review): the message still says "benchmark".
+accuracy: all
+	@echo "nothing to benchmark"
+
--- a/fftw-3.3.10/libbench2/Makefile.in
+++ b/fftw-3.3.10/libbench2/Makefile.in
@@ -0,0 +1,778 @@
+# Makefile.in generated by automake 1.16.3 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2020 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = libbench2
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
+	$(top_srcdir)/m4/acx_pthread.m4 \
+	$(top_srcdir)/m4/ax_cc_maxopt.m4 \
+	$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
+	$(top_srcdir)/m4/ax_compiler_vendor.m4 \
+	$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
+	$(top_srcdir)/m4/ax_gcc_version.m4 \
+	$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
+	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LIBRARIES = $(noinst_LIBRARIES)
+ARFLAGS = cru
+AM_V_AR = $(am__v_AR_@AM_V@)
+am__v_AR_ = $(am__v_AR_@AM_DEFAULT_V@)
+am__v_AR_0 = @echo "  AR      " $@;
+am__v_AR_1 = 
+libbench2_a_AR = $(AR) $(ARFLAGS)
+libbench2_a_LIBADD =
+am_libbench2_a_OBJECTS = after-ccopy-from.$(OBJEXT) \
+	after-ccopy-to.$(OBJEXT) after-hccopy-from.$(OBJEXT) \
+	after-hccopy-to.$(OBJEXT) after-rcopy-from.$(OBJEXT) \
+	after-rcopy-to.$(OBJEXT) allocate.$(OBJEXT) aset.$(OBJEXT) \
+	bench-cost-postprocess.$(OBJEXT) bench-exit.$(OBJEXT) \
+	bench-main.$(OBJEXT) can-do.$(OBJEXT) caset.$(OBJEXT) \
+	dotens2.$(OBJEXT) info.$(OBJEXT) main.$(OBJEXT) \
+	mflops.$(OBJEXT) mp.$(OBJEXT) ovtpvt.$(OBJEXT) pow2.$(OBJEXT) \
+	problem.$(OBJEXT) report.$(OBJEXT) speed.$(OBJEXT) \
+	tensor.$(OBJEXT) timer.$(OBJEXT) useropt.$(OBJEXT) \
+	util.$(OBJEXT) verify-dft.$(OBJEXT) verify-lib.$(OBJEXT) \
+	verify-r2r.$(OBJEXT) verify-rdft2.$(OBJEXT) verify.$(OBJEXT) \
+	zero.$(OBJEXT) my-getopt.$(OBJEXT)
+libbench2_a_OBJECTS = $(am_libbench2_a_OBJECTS)
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__maybe_remake_depfiles = depfiles
+am__depfiles_remade = ./$(DEPDIR)/after-ccopy-from.Po \
+	./$(DEPDIR)/after-ccopy-to.Po ./$(DEPDIR)/after-hccopy-from.Po \
+	./$(DEPDIR)/after-hccopy-to.Po ./$(DEPDIR)/after-rcopy-from.Po \
+	./$(DEPDIR)/after-rcopy-to.Po ./$(DEPDIR)/allocate.Po \
+	./$(DEPDIR)/aset.Po ./$(DEPDIR)/bench-cost-postprocess.Po \
+	./$(DEPDIR)/bench-exit.Po ./$(DEPDIR)/bench-main.Po \
+	./$(DEPDIR)/can-do.Po ./$(DEPDIR)/caset.Po \
+	./$(DEPDIR)/dotens2.Po ./$(DEPDIR)/info.Po ./$(DEPDIR)/main.Po \
+	./$(DEPDIR)/mflops.Po ./$(DEPDIR)/mp.Po \
+	./$(DEPDIR)/my-getopt.Po ./$(DEPDIR)/ovtpvt.Po \
+	./$(DEPDIR)/pow2.Po ./$(DEPDIR)/problem.Po \
+	./$(DEPDIR)/report.Po ./$(DEPDIR)/speed.Po \
+	./$(DEPDIR)/tensor.Po ./$(DEPDIR)/timer.Po \
+	./$(DEPDIR)/useropt.Po ./$(DEPDIR)/util.Po \
+	./$(DEPDIR)/verify-dft.Po ./$(DEPDIR)/verify-lib.Po \
+	./$(DEPDIR)/verify-r2r.Po ./$(DEPDIR)/verify-rdft2.Po \
+	./$(DEPDIR)/verify.Po ./$(DEPDIR)/zero.Po
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
+SOURCES = $(libbench2_a_SOURCES)
+DIST_SOURCES = $(libbench2_a_SOURCES)
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
+ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AVX2_CFLAGS = @AVX2_CFLAGS@
+AVX512_CFLAGS = @AVX512_CFLAGS@
+AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
+AVX_CFLAGS = @AVX_CFLAGS@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CHECK_PL_OPTS = @CHECK_PL_OPTS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
+C_MPI_FINT = @C_MPI_FINT@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+F77 = @F77@
+FFLAGS = @FFLAGS@
+FGREP = @FGREP@
+FLIBS = @FLIBS@
+GREP = @GREP@
+INDENT = @INDENT@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+KCVI_CFLAGS = @KCVI_CFLAGS@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBQUADMATH = @LIBQUADMATH@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPICC = @MPICC@
+MPILIBS = @MPILIBS@
+MPIRUN = @MPIRUN@
+NEON_CFLAGS = @NEON_CFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCAMLBUILD = @OCAMLBUILD@
+OPENMP_CFLAGS = @OPENMP_CFLAGS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+POW_LIB = @POW_LIB@
+PRECISION = @PRECISION@
+PREC_SUFFIX = @PREC_SUFFIX@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
+SHELL = @SHELL@
+SSE2_CFLAGS = @SSE2_CFLAGS@
+STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
+STRIP = @STRIP@
+THREADLIBS = @THREADLIBS@
+VERSION = @VERSION@
+VSX_CFLAGS = @VSX_CFLAGS@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+ac_ct_F77 = @ac_ct_F77@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+runstatedir = @runstatedir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -I $(top_srcdir)
+noinst_LIBRARIES = libbench2.a
+libbench2_a_SOURCES = after-ccopy-from.c after-ccopy-to.c			\
+after-hccopy-from.c after-hccopy-to.c after-rcopy-from.c		\
+after-rcopy-to.c allocate.c aset.c bench-cost-postprocess.c		\
+bench-exit.c bench-main.c can-do.c caset.c dotens2.c info.c main.c	\
+mflops.c mp.c ovtpvt.c pow2.c problem.c report.c speed.c tensor.c	\
+timer.c useropt.c util.c verify-dft.c verify-lib.c verify-r2r.c		\
+verify-rdft2.c verify.c zero.c bench-user.h bench.h verify.h		\
+my-getopt.c my-getopt.h
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libbench2/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu libbench2/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+# Delete the noinst library, unless $(noinst_LIBRARIES) is empty.
+clean-noinstLIBRARIES:
+	-test -z "$(noinst_LIBRARIES)" || rm -f $(noinst_LIBRARIES)
+# Recreate the archive from scratch: remove it, archive the objects, then run $(RANLIB).
+libbench2.a: $(libbench2_a_OBJECTS) $(libbench2_a_DEPENDENCIES) $(EXTRA_libbench2_a_DEPENDENCIES) 
+	$(AM_V_at)-rm -f libbench2.a
+	$(AM_V_AR)$(libbench2_a_AR) libbench2.a $(libbench2_a_OBJECTS) $(libbench2_a_LIBADD)
+	$(AM_V_at)$(RANLIB) libbench2.a
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-ccopy-from.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-ccopy-to.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-hccopy-from.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-hccopy-to.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-rcopy-from.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/after-rcopy-to.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/allocate.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/aset.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-cost-postprocess.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-exit.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bench-main.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/can-do.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/caset.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dotens2.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/info.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mflops.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mp.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/my-getopt.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ovtpvt.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pow2.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/report.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/speed.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tensor.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/timer.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/useropt.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/util.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify-dft.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify-lib.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify-r2r.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify-rdft2.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/verify.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/zero.Po@am__quote@ # am--include-marker
+# Create a missing dependency file as a '# dummy' stub (written to a -t file, then renamed).
+$(am__depfiles_remade):
+	@$(MKDIR_P) $(@D)
+	@echo '# dummy' >$@-t && $(am__mv) $@-t $@
+# Aggregate target: depends on every file named in $(am__depfiles_remade).
+am--depfiles: $(am__depfiles_remade)
+
+.c.o:
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	set x; \
+	here=`pwd`; \
+	$(am__define_uniq_tagged_files); \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
+	case "$(srcdir)" in \
+	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+	  *) sdir=$(subdir)/$(srcdir) ;; \
+	esac; \
+	for i in $$list; do \
+	  if test -f "$$i"; then \
+	    echo "$(subdir)/$$i"; \
+	  else \
+	    echo "$$sdir/$$i"; \
+	  fi; \
+	done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(BUILT_SOURCES)
+	$(MAKE) $(AM_MAKEFLAGS) distdir-am
+
+distdir-am: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LIBRARIES)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+		-rm -f ./$(DEPDIR)/after-ccopy-from.Po
+	-rm -f ./$(DEPDIR)/after-ccopy-to.Po
+	-rm -f ./$(DEPDIR)/after-hccopy-from.Po
+	-rm -f ./$(DEPDIR)/after-hccopy-to.Po
+	-rm -f ./$(DEPDIR)/after-rcopy-from.Po
+	-rm -f ./$(DEPDIR)/after-rcopy-to.Po
+	-rm -f ./$(DEPDIR)/allocate.Po
+	-rm -f ./$(DEPDIR)/aset.Po
+	-rm -f ./$(DEPDIR)/bench-cost-postprocess.Po
+	-rm -f ./$(DEPDIR)/bench-exit.Po
+	-rm -f ./$(DEPDIR)/bench-main.Po
+	-rm -f ./$(DEPDIR)/can-do.Po
+	-rm -f ./$(DEPDIR)/caset.Po
+	-rm -f ./$(DEPDIR)/dotens2.Po
+	-rm -f ./$(DEPDIR)/info.Po
+	-rm -f ./$(DEPDIR)/main.Po
+	-rm -f ./$(DEPDIR)/mflops.Po
+	-rm -f ./$(DEPDIR)/mp.Po
+	-rm -f ./$(DEPDIR)/my-getopt.Po
+	-rm -f ./$(DEPDIR)/ovtpvt.Po
+	-rm -f ./$(DEPDIR)/pow2.Po
+	-rm -f ./$(DEPDIR)/problem.Po
+	-rm -f ./$(DEPDIR)/report.Po
+	-rm -f ./$(DEPDIR)/speed.Po
+	-rm -f ./$(DEPDIR)/tensor.Po
+	-rm -f ./$(DEPDIR)/timer.Po
+	-rm -f ./$(DEPDIR)/useropt.Po
+	-rm -f ./$(DEPDIR)/util.Po
+	-rm -f ./$(DEPDIR)/verify-dft.Po
+	-rm -f ./$(DEPDIR)/verify-lib.Po
+	-rm -f ./$(DEPDIR)/verify-r2r.Po
+	-rm -f ./$(DEPDIR)/verify-rdft2.Po
+	-rm -f ./$(DEPDIR)/verify.Po
+	-rm -f ./$(DEPDIR)/zero.Po
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+		-rm -f ./$(DEPDIR)/after-ccopy-from.Po
+	-rm -f ./$(DEPDIR)/after-ccopy-to.Po
+	-rm -f ./$(DEPDIR)/after-hccopy-from.Po
+	-rm -f ./$(DEPDIR)/after-hccopy-to.Po
+	-rm -f ./$(DEPDIR)/after-rcopy-from.Po
+	-rm -f ./$(DEPDIR)/after-rcopy-to.Po
+	-rm -f ./$(DEPDIR)/allocate.Po
+	-rm -f ./$(DEPDIR)/aset.Po
+	-rm -f ./$(DEPDIR)/bench-cost-postprocess.Po
+	-rm -f ./$(DEPDIR)/bench-exit.Po
+	-rm -f ./$(DEPDIR)/bench-main.Po
+	-rm -f ./$(DEPDIR)/can-do.Po
+	-rm -f ./$(DEPDIR)/caset.Po
+	-rm -f ./$(DEPDIR)/dotens2.Po
+	-rm -f ./$(DEPDIR)/info.Po
+	-rm -f ./$(DEPDIR)/main.Po
+	-rm -f ./$(DEPDIR)/mflops.Po
+	-rm -f ./$(DEPDIR)/mp.Po
+	-rm -f ./$(DEPDIR)/my-getopt.Po
+	-rm -f ./$(DEPDIR)/ovtpvt.Po
+	-rm -f ./$(DEPDIR)/pow2.Po
+	-rm -f ./$(DEPDIR)/problem.Po
+	-rm -f ./$(DEPDIR)/report.Po
+	-rm -f ./$(DEPDIR)/speed.Po
+	-rm -f ./$(DEPDIR)/tensor.Po
+	-rm -f ./$(DEPDIR)/timer.Po
+	-rm -f ./$(DEPDIR)/useropt.Po
+	-rm -f ./$(DEPDIR)/util.Po
+	-rm -f ./$(DEPDIR)/verify-dft.Po
+	-rm -f ./$(DEPDIR)/verify-lib.Po
+	-rm -f ./$(DEPDIR)/verify-r2r.Po
+	-rm -f ./$(DEPDIR)/verify-rdft2.Po
+	-rm -f ./$(DEPDIR)/verify.Po
+	-rm -f ./$(DEPDIR)/zero.Po
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
+	clean-generic clean-libtool clean-noinstLIBRARIES \
+	cscopelist-am ctags ctags-am distclean distclean-compile \
+	distclean-generic distclean-libtool distclean-tags distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags tags-am uninstall uninstall-am
+
+.PRECIOUS: Makefile
+
+# Placeholder target: builds "all", then only prints that there is nothing to benchmark.
+benchmark: all
+	@echo "nothing to benchmark"
+# Same placeholder for "accuracy"; NOTE(review): the message still says "benchmark".
+accuracy: all
+	@echo "nothing to benchmark"
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
--- a/fftw-3.3.10/libbench2/after-ccopy-from.c
+++ b/fftw-3.3.10/libbench2/after-ccopy-from.c
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+#include "libbench2/bench.h"
+
+/* Default do-nothing version of this routine; the user can override it. */
+void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
+{
+     /* no work by default: each argument is only handed to UNUSED() */
+     UNUSED(ii); UNUSED(ri);
+     UNUSED(p);
+}
--- a/fftw-3.3.10/libbench2/after-ccopy-to.c
+++ b/fftw-3.3.10/libbench2/after-ccopy-to.c
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+#include "libbench2/bench.h"
+
+/* Default version with an empty job; the user can override it. */
+void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
+{
+     /* the three arguments are deliberately left untouched */
+     UNUSED(io); UNUSED(ro);
+     UNUSED(p);
+}
--- a/fftw-3.3.10/libbench2/after-hccopy-from.c
+++ b/fftw-3.3.10/libbench2/after-hccopy-from.c
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+#include "libbench2/bench.h"
+
+/* Default routine that performs no action; the user can override it. */
+void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
+{
+     /* nothing is read or written; the arguments only go through UNUSED() */
+     UNUSED(p); UNUSED(ii);
+     UNUSED(ri);
+}
--- a/fftw-3.3.10/libbench2/after-hccopy-to.c
+++ b/fftw-3.3.10/libbench2/after-hccopy-to.c
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+#include "libbench2/bench.h"
+
+/* Default no-op definition; the user can override it. */
+void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
+{
+     /* the default ignores the problem and both arrays */
+     UNUSED(p); UNUSED(io);
+     UNUSED(ro);
+}
--- a/fftw-3.3.10/libbench2/after-rcopy-from.c
+++ b/fftw-3.3.10/libbench2/after-rcopy-from.c
@@ -0,0 +1,9 @@
+/* not worth copyrighting */
+#include "libbench2/bench.h"
+
+/* Default do-nothing version of this routine; the user can override it. */
+void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
+{
+     /* no work by default: both arguments are only handed to UNUSED() */
+     UNUSED(ri); UNUSED(p);
+}
--- a/fftw-3.3.10/libbench2/after-rcopy-to.c
+++ b/fftw-3.3.10/libbench2/after-rcopy-to.c
@@ -0,0 +1,9 @@
+/* not worth copyrighting */
+#include "libbench2/bench.h"
+
+/* Default version with an empty job; the user can override it. */
+void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
+{
+     /* the problem and the array are deliberately left untouched */
+     UNUSED(ro); UNUSED(p);
+}
--- a/fftw-3.3.10/libbench2/allocate.c
+++ b/fftw-3.3.10/libbench2/allocate.c
@@ -0,0 +1,110 @@
+/* not worth copyrighting */
+
+
+#include "libbench2/bench.h"
+/* Report the input/output bounds of the tensor tensor_append(p->sz, p->vecsz). */
+static void bounds(bench_problem *p, int *ilb, int *iub, int *olb, int *oub)
+{
+     bench_tensor *joined;
+     joined = tensor_append(p->sz, p->vecsz);
+     tensor_ibounds(joined, ilb, iub); tensor_obounds(joined, olb, oub);
+     tensor_destroy(joined); /* the temporary is not needed past this point */
+}
+
+/*
+ * Allocate the I/O arrays for problem P.  Sets p->inphys/p->outphys (blocks
+ * from bench_malloc), p->iphyssz/p->ophyssz (their sizes in elements) and
+ * p->in/p->out (the block pointers shifted back by the lower bounds).
+ * This is the default routine; the user can override it in complicated cases.
+ */
+void problem_alloc(bench_problem *p)
+{
+     int ilb, iub, olb, oub;
+     int isz, osz;
+     /* element counts follow from the index bounds reported by bounds() */
+     bounds(p, &ilb, &iub, &olb, &oub);
+     isz = iub - ilb;
+     osz = oub - olb;
+     /* one branch per problem kind; PROBLEM_REAL is split further by p->sign */
+     if (p->kind == PROBLEM_COMPLEX) {
+	  bench_complex *in, *out;
+	  /* input block; p->in is shifted so that index ilb is its first element */
+	  p->iphyssz = isz;
+	  p->inphys = in = (bench_complex *) bench_malloc(isz * sizeof(bench_complex));
+	  p->in = in - ilb;
+	  /* in-place: the output aliases the input block; otherwise allocate it */
+	  if (p->in_place) {
+	       p->out = p->in;
+	       p->outphys = p->inphys;
+	       p->ophyssz = p->iphyssz;
+	  } else {
+	       p->ophyssz = osz;
+	       p->outphys = out = (bench_complex *) bench_malloc(osz * sizeof(bench_complex));
+	       p->out = out - olb;
+	  }
+     } else if (p->kind == PROBLEM_R2R) {
+	  bench_real *in, *out;
+	  /* same layout as the complex case, with bench_real elements */
+	  p->iphyssz = isz;
+	  p->inphys = in = (bench_real *) bench_malloc(isz * sizeof(bench_real));
+	  p->in = in - ilb;
+	  /* in-place: the output aliases the input block; otherwise allocate it */
+	  if (p->in_place) {
+	       p->out = p->in;
+	       p->outphys = p->inphys;
+	       p->ophyssz = p->iphyssz;
+	  } else {
+	       p->ophyssz = osz;
+	       p->outphys = out = (bench_real *) bench_malloc(osz * sizeof(bench_real));
+	       p->out = out - olb;
+	  }
+     } else if (p->kind == PROBLEM_REAL && p->sign < 0) { /* R2HC */
+	  bench_real *in;
+	  bench_complex *out;
+	  /* make the real input at least 2*osz reals long, in-place or not */
+	  isz = isz > osz*2 ? isz : osz*2;
+	  p->iphyssz = isz;
+	  p->inphys = in = (bench_real *) bench_malloc(p->iphyssz * sizeof(bench_real));
+	  p->in = in - ilb;
+	  /* in-place: the output shares the block and ophyssz is half of iphyssz */
+	  if (p->in_place) {
+	       p->out = p->in;
+	       p->outphys = p->inphys;
+	       p->ophyssz = p->iphyssz / 2;
+	  } else {
+	       p->ophyssz = osz;
+	       p->outphys = out = (bench_complex *) bench_malloc(osz * sizeof(bench_complex));
+	       p->out = out - olb;
+	  }
+     } else if (p->kind == PROBLEM_REAL && p->sign > 0) { /* HC2R */
+	  bench_real *out;
+	  bench_complex *in;
+	  /* mirror of R2HC: make the real output at least 2*isz reals long */
+	  osz = osz > isz*2 ? osz : isz*2;
+	  p->ophyssz = osz;
+	  p->outphys = out = (bench_real *) bench_malloc(p->ophyssz * sizeof(bench_real));
+	  p->out = out - olb;
+	  /* in-place: the input shares the block and iphyssz is half of ophyssz */
+	  if (p->in_place) {
+	       p->in = p->out;
+	       p->inphys = p->outphys;
+	       p->iphyssz = p->ophyssz / 2;
+	  } else {
+	       p->iphyssz = isz;
+	       p->inphys = in = (bench_complex *) bench_malloc(isz * sizeof(bench_complex));
+	       p->in = in - ilb;
+	  }
+     } else {
+	  BENCH_ASSERT(0); /* TODO: other kinds, or PROBLEM_REAL with sign 0, are not handled */
+     }
+}
+/* Free the I/O blocks of problem P, then destroy its sz and vecsz tensors. */
+void problem_free(bench_problem *p)
+{
+     /* when outphys equals inphys there is a single block: free it only once */
+     int separate_out = p->outphys != 0 && p->outphys != p->inphys;
+     if (separate_out) bench_free(p->outphys);
+     if (p->inphys != 0) bench_free(p->inphys);
+     tensor_destroy(p->sz);
+     tensor_destroy(p->vecsz);
+}
--- a/fftw-3.3.10/libbench2/aset.c
+++ b/fftw-3.3.10/libbench2/aset.c
@@ -0,0 +1,10 @@
+/* not worth copyrighting */
+
+#include "libbench2/bench.h"
+
+/* Store x into each of the first n elements of A; nothing is written when n <= 0. */
+void aset(bench_real *A, int n, bench_real x)
+{
+     bench_real *cur = A;
+     int left;
+
+     for (left = n; left > 0; --left)
+	  *cur++ = x;
+}
--- a/fftw-3.3.10/libbench2/bench-cost-postprocess.c
+++ b/fftw-3.3.10/libbench2/bench-cost-postprocess.c
@@ -0,0 +1,8 @@
+/* not worth copyrighting */
+#include "libbench2/bench.h"
+
+/*
+ * Default cost post-processing hook: returns cost unchanged.
+ * This is the default routine; it can be overridden by the user.
+ */
+double bench_cost_postprocess(double cost)
+{
+     return cost;
+}
--- a/fftw-3.3.10/libbench2/bench-exit.c
+++ b/fftw-3.3.10/libbench2/bench-exit.c
@@ -0,0 +1,8 @@
+/* not worth copyrighting */
+#include "libbench2/bench.h"
+
+/*
+ * Default exit hook: ends the process by calling exit() with status.
+ * This is the default routine; it can be overridden by the user.
+ */
+void bench_exit(int status)
+{
+     exit(status);
+}
--- a/fftw-3.3.10/libbench2/bench-main.c
+++ b/fftw-3.3.10/libbench2/bench-main.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "libbench2/bench.h"
+#include "my-getopt.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int verbose;	/* verbosity level; bench_main() treats a negative value as "output disabled" */
+/*
+ * Long options understood by bench_main().  The third field of each
+ * entry matches a case label in bench_main()'s switch: a character
+ * for the options that have one, a number in the 300-406 range for
+ * the others.
+ */
+static const struct my_option options[] =
+{
+  {"accuracy", REQARG, 'a'},
+  {"accuracy-rounds", REQARG, 405},
+  {"impulse-accuracy-rounds", REQARG, 406},
+  {"can-do", REQARG, 'd'},
+  {"help", NOARG, 'h'},
+  {"info", REQARG, 'i'},
+  {"info-all", NOARG, 'I'},
+  {"print-precision", NOARG, 402},
+  {"print-time-min", NOARG, 400},
+  {"random-seed", REQARG, 404},
+  {"report-benchmark", NOARG, 320},
+  {"report-mflops", NOARG, 300},
+  {"report-time", NOARG, 310},
+  {"report-verbose", NOARG, 330},
+  {"speed", REQARG, 's'},
+  {"setup-speed", REQARG, 'S'},
+  {"time-min", REQARG, 't'},
+  {"time-repeat", REQARG, 'r'},
+  {"user-option", REQARG, 'o'},
+  {"verbose", OPTARG, 'v'},
+  {"verify", REQARG, 'y'},
+  {"verify-rounds", REQARG, 401},
+  {"verify-tolerance", REQARG, 403},
+  {0, NOARG, 0}	/* end-of-table entry (presumably where my_getopt() stops -- confirm in my-getopt.c) */
+};
+
+/*
+ * Benchmark driver: parse the command line and act on every option as
+ * soon as it is read.
+ *
+ * Options are handled strictly left to right, so settings such as
+ * --time-min, --time-repeat, --verify-rounds, --verify-tolerance,
+ * --accuracy-rounds or --report-* only influence the actions (--speed,
+ * --verify, --accuracy, ...) that appear after them on the command
+ * line.  Arguments left over once option parsing stops are treated as
+ * problem descriptions and timed exactly like --speed.
+ *
+ * Returns 0 on success and 1 when my_getopt() reports an invalid
+ * option; cleanup() is called on both paths.  An option code that is
+ * not handled below aborts the program.
+ */
+int bench_main(int argc, char *argv[])
+{
+     double tmin = 0.0;
+     double tol;
+     int repeat = 0;
+     int rounds = 10;
+     int iarounds = 0;
+     int arounds = 1; /* this is too low for precise results */
+     int c;
+
+     report = report_verbose; /* default */
+     verbose = 0;
+
+     /* default verification tolerance, chosen from the working precision */
+     tol = SINGLE_PRECISION ? 1.0e-3 : (QUAD_PRECISION ? 1e-29 : 1.0e-10);
+
+     main_init(&argc, &argv);
+
+     bench_srand(1);
+
+     while ((c = my_getopt (argc, argv, options)) != -1) {
+	  switch (c) {
+	      case 't' :
+		   tmin = strtod(my_optarg, 0);
+		   break;
+	      case 'r':
+		   repeat = atoi(my_optarg);
+		   break;
+	      case 's':
+		   timer_init(tmin, repeat);
+		   speed(my_optarg, 0);
+		   break;
+	      case 'S':
+		   timer_init(tmin, repeat);
+		   speed(my_optarg, 1);
+		   break;
+	      case 'd':
+		   report_can_do(my_optarg);
+		   break;
+	      case 'o':
+		   useropt(my_optarg);
+		   break;
+	      case 'v':
+		   if (verbose >= 0) { /* verbose < 0 disables output */
+			if (my_optarg)
+			     verbose = atoi(my_optarg);
+			else
+			     ++verbose;
+		   }
+		   break;
+	      case 'y':
+		   verify(my_optarg, rounds, tol);
+		   break;
+	      case 'a':
+		   accuracy(my_optarg, arounds, iarounds);
+		   break;
+	      case 'i':
+		   report_info(my_optarg);
+		   break;
+	      case 'I':
+		   report_info_all();
+		   break;
+	      case 'h':
+		   if (verbose >= 0) my_usage(argv[0], options);
+		   break;
+
+	      case 300: /* --report-mflops */
+		   report = report_mflops;
+		   break;
+
+	      case 310: /* --report-time */
+		   report = report_time;
+		   break;
+
+	      case 320: /* --report-benchmark */
+		   report = report_benchmark;
+		   break;
+
+	      case 330: /* --report-verbose */
+		   report = report_verbose;
+		   break;
+
+	      case 400: /* --print-time-min */
+		   timer_init(tmin, repeat);
+		   ovtpvt("%g\n", time_min);
+		   break;
+
+	      case 401: /* --verify-rounds */
+		   rounds = atoi(my_optarg);
+		   break;
+
+	      case 402: /* --print-precision */
+		   if (SINGLE_PRECISION)
+			ovtpvt("single\n");
+		   else if (QUAD_PRECISION)
+			ovtpvt("quad\n");
+		   else if (LDOUBLE_PRECISION)
+			ovtpvt("long-double\n");
+		   else if (DOUBLE_PRECISION)
+			ovtpvt("double\n");
+		   else
+			/* sizeof yields a size_t, which must not be
+			   passed to a %d conversion; convert it to
+			   the int the format expects */
+			ovtpvt("unknown %d\n", (int) sizeof(bench_real));
+		   break;
+
+	      case 403: /* --verify-tolerance */
+		   tol = strtod(my_optarg, 0);
+		   break;
+
+	      case 404: /* --random-seed */
+		   bench_srand(atoi(my_optarg));
+		   break;
+
+	      case 405: /* --accuracy-rounds */
+		   arounds = atoi(my_optarg);
+		   break;
+
+	      case 406: /* --impulse-accuracy-rounds */
+		   iarounds = atoi(my_optarg);
+		   break;
+
+	      case '?':
+		   /* my_getopt() already printed an error message. */
+		   cleanup();
+		   return 1;
+
+	      default:
+		   abort ();
+	  }
+     }
+
+     /* assume that any remaining arguments are problems to be
+        benchmarked */
+     while (my_optind < argc) {
+	  timer_init(tmin, repeat);
+	  speed(argv[my_optind++], 0);
+     }
+
+     cleanup();
+     return 0;
+}
--- a/fftw-3.3.10/libbench2/bench-user.h
+++ b/fftw-3.3.10/libbench2/bench-user.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __BENCH_USER_H__
+#define __BENCH_USER_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif                          /* __cplusplus */
+
+/* benchmark program definitions for user code */
+#include "config.h"
+#include <limits.h>
+
+#if HAVE_STDDEF_H
+#include <stddef.h>
+#endif
+
+#if HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+/*
+ * Floating-point type used for all benchmark data.  It is selected at
+ * compile time by BENCHFFT_SINGLE, BENCHFFT_LDOUBLE or BENCHFFT_QUAD
+ * and is double when none of them is defined.
+ */
+#if defined(BENCHFFT_SINGLE)
+typedef float bench_real;
+#elif defined(BENCHFFT_LDOUBLE)
+typedef long double bench_real;
+#elif defined(BENCHFFT_QUAD)
+typedef __float128 bench_real;
+#else
+typedef double bench_real;
+#endif
+
+typedef bench_real bench_complex[2];	/* [0] is read by c_re(), [1] by c_im() */
+
+#define c_re(c)  ((c)[0])
+#define c_im(c)  ((c)[1])
+/*
+ * Precision predicates.  They are derived from sizeof(bench_real),
+ * not from the BENCHFFT_* macros, and are tested in the order
+ * double, single, long double, quad.
+ *
+ * NOTE(review): QUAD_PRECISION requires !LDOUBLE_PRECISION, so on a
+ * platform where sizeof(long double) == sizeof(__float128) a
+ * BENCHFFT_QUAD build evaluates LDOUBLE_PRECISION to 1 and
+ * QUAD_PRECISION to 0 -- confirm that this is intended.
+ */
+#undef DOUBLE_PRECISION
+#define DOUBLE_PRECISION (sizeof(bench_real) == sizeof(double))
+#undef SINGLE_PRECISION
+#define SINGLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(float))
+#undef LDOUBLE_PRECISION
+#define LDOUBLE_PRECISION (!DOUBLE_PRECISION && sizeof(bench_real) == sizeof(long double))
+
+#undef QUAD_PRECISION
+#ifdef BENCHFFT_QUAD
+#define QUAD_PRECISION (!LDOUBLE_PRECISION && sizeof(bench_real) == sizeof(__float128))
+#else
+#define QUAD_PRECISION 0
+#endif
+/*
+ * Kind of transform described by a bench_problem.  For PROBLEM_REAL
+ * the direction comes from bench_problem.sign: the allocation code in
+ * allocate.c treats sign < 0 as R2HC (real input, complex output) and
+ * sign > 0 as HC2R (complex input, real output).  PROBLEM_R2R uses
+ * real input and real output.
+ */
+typedef enum { PROBLEM_COMPLEX, PROBLEM_REAL, PROBLEM_R2R } problem_kind_t;
+
+typedef enum {	/* r2r transform kinds; NOTE(review): names presumably mirror FFTW's r2r kinds -- confirm */
+     R2R_R2HC, R2R_HC2R, R2R_DHT,
+     R2R_REDFT00, R2R_REDFT01, R2R_REDFT10, R2R_REDFT11,
+     R2R_RODFT00, R2R_RODFT01, R2R_RODFT10, R2R_RODFT11
+} r2r_kind_t;
+/* One dimension of a tensor: its length and the input/output strides. */
+typedef struct {
+     int n;
+     int is;			/* input stride */
+     int os;			/* output stride */
+} bench_iodim;
+
+typedef struct {
+     int rnk;			/* number of entries in dims, or BENCH_RNK_MINFTY (see below) */
+     bench_iodim *dims;
+} bench_tensor;
+
+bench_tensor *mktensor(int rnk);
+void tensor_destroy(bench_tensor *sz);
+size_t tensor_sz(const bench_tensor *sz);
+bench_tensor *tensor_compress(const bench_tensor *sz);
+int tensor_unitstridep(bench_tensor *t);
+int tensor_rowmajorp(bench_tensor *t);
+int tensor_real_rowmajorp(bench_tensor *t, int sign, int in_place);
+bench_tensor *tensor_append(const bench_tensor *a, const bench_tensor *b);
+bench_tensor *tensor_copy(const bench_tensor *sz);
+bench_tensor *tensor_copy_sub(const bench_tensor *sz, int start_dim, int rnk);
+bench_tensor *tensor_copy_swapio(const bench_tensor *sz);
+void tensor_ibounds(bench_tensor *t, int *lbp, int *ubp);	/* NOTE(review): presumably lower/upper index bounds over the input strides -- confirm in tensor.c */
+void tensor_obounds(bench_tensor *t, int *lbp, int *ubp);	/* NOTE(review): same for the output strides -- confirm */
+
+/*
+  Definition of rank -infinity.
+  This definition has the property that if you want rank 0 or 1,
+  you can simply test for rank <= 1.  This is a common case.
+  (The sentinel is INT_MAX, so it never satisfies rank <= 1.)
+ 
+  A tensor of rank -infinity has size 0.
+*/
+#define BENCH_RNK_MINFTY  INT_MAX
+#define BENCH_FINITE_RNK(rnk) ((rnk) != BENCH_RNK_MINFTY)
+/*
+ * Description of one benchmark problem together with its data
+ * buffers.  The buffer fields are filled in by the allocation code in
+ * allocate.c and released by problem_free().
+ */
+typedef struct {
+     problem_kind_t kind;
+     r2r_kind_t *k;
+     bench_tensor *sz;		/* mflops() uses tensor_sz(sz) as the transform size */
+     bench_tensor *vecsz;	/* mflops() uses tensor_sz(vecsz) as a multiplier */
+     int sign;			/* for PROBLEM_REAL: < 0 is R2HC, > 0 is HC2R */
+     int in_place;		/* nonzero: out/outphys are made aliases of in/inphys */
+     int destroy_input;
+     int split;
+     void *in, *out;		/* inphys/outphys shifted down by the tensor lower bound */
+     void *inphys, *outphys;	/* the blocks obtained from bench_malloc() */
+     int iphyssz, ophyssz;	/* element counts recorded for inphys/outphys */
+     char *pstring;
+     void *userinfo; /* user can store whatever */
+     int scrambled_in, scrambled_out; /* hack for MPI */
+
+     /* internal hack so that we can use verifier in FFTW test program */
+     void *ini, *outi; /* if nonzero, point to imag. parts for dft */
+
+     /* another internal hack to avoid passing around too many parameters */
+     double setup_time;
+} bench_problem;
+/* Globals and entry points declared for user code. */
+extern int verbose;	/* defined in bench-main.c, where a negative value disables output */
+
+extern int no_speed_allocation;
+
+extern int always_pad_real;
+
+#define LIBBENCH_TIMER 0
+#define USER_TIMER 1
+#define BENCH_NTIMERS 2
+extern void timer_start(int which_timer);	/* NOTE(review): which_timer is presumably LIBBENCH_TIMER or USER_TIMER -- confirm in timer.c */
+extern double timer_stop(int which_timer);
+/*
+ * Hooks used by the driver.  bench_main() calls main_init() before
+ * parsing options, cleanup() before returning, verify() for --verify
+ * and useropt() for --user-option; report_can_do() calls can_do().
+ * NOTE(review): setup/doit/done are presumably the per-problem
+ * prepare / execute / teardown hooks -- confirm in speed.c.
+ */
+extern int can_do(bench_problem *p);
+extern void setup(bench_problem *p);
+extern void doit(int iter, bench_problem *p);
+extern void done(bench_problem *p);
+extern void main_init(int *argc, char ***argv);
+extern void cleanup(void);
+extern void verify(const char *param, int rounds, double tol);
+extern void useropt(const char *arg);
+
+extern void verify_problem(bench_problem *p, int rounds, double tol);
+
+extern void problem_alloc(bench_problem *p);
+extern void problem_free(bench_problem *p);	/* frees inphys/outphys and destroys sz and vecsz (allocate.c) */
+extern void problem_zero(bench_problem *p);
+extern void problem_destroy(bench_problem *p);
+
+extern int power_of_two(int n);
+extern int log_2(int n);
+
+/* Copy the real and imaginary parts of the complex value in to out. */
+#define CASSIGN(out, in) (c_re(out) = c_re(in), c_im(out) = c_im(in))
+
+bench_tensor *verify_pack(const bench_tensor *sz, int s);
+
+typedef struct {	/* NOTE(review): presumably the linearity / impulse / shift test errors -- confirm in verify-lib.c */
+     double l;
+     double i;
+     double s;
+} errors;
+
+void verify_dft(bench_problem *p, int rounds, double tol, errors *e);
+void verify_rdft2(bench_problem *p, int rounds, double tol, errors *e);
+void verify_r2r(bench_problem *p, int rounds, double tol, errors *e);
+
+/**************************************************************/
+/* routines to override: bench-exit.c and bench-cost-postprocess.c
+   hold the defaults for bench_exit() and bench_cost_postprocess();
+   NOTE(review): the after_problem_* defaults presumably live in the
+   after-*.c files -- confirm */
+
+extern void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii);
+extern void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io);
+extern void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii);
+extern void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io);
+extern void after_problem_rcopy_from(bench_problem *p, bench_real *ri);
+extern void after_problem_rcopy_to(bench_problem *p, bench_real *ro);
+extern void bench_exit(int status);
+extern double bench_cost_postprocess(double cost);
+
+/**************************************************************
+ * malloc
+ *
+ * Blocks returned by bench_malloc() are released with bench_free().
+ **************************************************************/
+extern void *bench_malloc(size_t size);
+extern void bench_free(void *ptr);
+extern void bench_free0(void *ptr);	/* NOTE(review): presumably a null-tolerant bench_free() -- confirm in util.c */
+
+/**************************************************************
+ * alloca
+ **************************************************************/
+#ifdef HAVE_ALLOCA_H
+#include <alloca.h>
+#endif
+
+/**************************************************************
+ * assert
+ **************************************************************/
+extern void bench_assertion_failed(const char *s, int line, const char *file);
+
+/*
+ * Evaluate ex once; when it is false (zero), pass the stringified
+ * expression together with the current line and file to
+ * bench_assertion_failed().  The check is always compiled in: it does
+ * not depend on NDEBUG.
+ */
+#define BENCH_ASSERT(ex)						 \
+      (void)((ex) || (bench_assertion_failed(#ex, __LINE__, __FILE__), 0))
+
+/*
+ * Mark a variable or expression as deliberately unused.  The argument
+ * is parenthesized so that the cast covers the whole expression:
+ * UNUSED(a + b) discards a + b instead of parsing as (void)a + b.
+ */
+#define UNUSED(x) (void)(x)
+
+/***************************************
+ * Documentation strings
+ *
+ * bench_doc[] is a table of key/value entries ended by an entry whose
+ * key is null.  An entry either carries its value directly
+ * (BENCH_DOC) or a function f that computes it (BENCH_DOCF); the
+ * reporters in info.c call f when val is null and store the result
+ * back into val.
+ *
+ * The table is written as
+ *     BEGIN_BENCH_DOC  BENCH_DOC(...)  BENCH_DOCF(...)  END_BENCH_DOC
+ * and BEGIN_BENCH_DOC contributes "cc", "cxx", "f77" and "f90"
+ * entries when the corresponding CC/BENCH_CC, CXX/BENCH_CXX,
+ * F77/BENCH_F77 or F90/BENCH_F90 macro is defined.
+ ***************************************/
+struct bench_doc {
+     const char *key;
+     const char *val;
+     const char *(*f)(void);
+};
+
+extern struct bench_doc bench_doc[];
+
+#ifdef CC
+#define CC_DOC BENCH_DOC("cc", CC)
+#elif defined(BENCH_CC)
+#define CC_DOC BENCH_DOC("cc", BENCH_CC)
+#else
+#define CC_DOC /* none */
+#endif
+
+#ifdef CXX
+#define CXX_DOC BENCH_DOC("cxx", CXX)
+#elif defined(BENCH_CXX)
+#define CXX_DOC BENCH_DOC("cxx", BENCH_CXX)
+#else
+#define CXX_DOC /* none */
+#endif
+
+#ifdef F77
+#define F77_DOC BENCH_DOC("f77", F77)
+#elif defined(BENCH_F77)
+#define F77_DOC BENCH_DOC("f77", BENCH_F77)
+#else
+#define F77_DOC /* none */
+#endif
+
+#ifdef F90
+#define F90_DOC BENCH_DOC("f90", F90)
+#elif defined(BENCH_F90)
+#define F90_DOC BENCH_DOC("f90", BENCH_F90)
+#else
+#define F90_DOC /* none */
+#endif
+
+#define BEGIN_BENCH_DOC						\
+struct bench_doc bench_doc[] = {				\
+    CC_DOC							\
+    CXX_DOC							\
+    F77_DOC							\
+    F90_DOC
+
+#define BENCH_DOC(key, val) { key, val, 0 },
+#define BENCH_DOCF(key, f) { key, 0, f },
+
+#define END_BENCH_DOC				\
+     {0, 0, 0}};
+
+#ifdef __cplusplus
+}                               /* extern "C" */
+#endif                          /* __cplusplus */
+    
+#endif /* __BENCH_USER_H__ */
--- a/fftw-3.3.10/libbench2/bench.h
+++ b/fftw-3.3.10/libbench2/bench.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+/* benchmark program definitions (declarations internal to libbench2,
+   layered on top of the user-visible bench-user.h) */
+#include "libbench2/bench-user.h"
+
+extern double time_min;		/* printed by bench_main() for --print-time-min, after timer_init() */
+extern int time_repeat;
+
+extern void timer_init(double tmin, int repeat);	/* bench_main() passes the --time-min and --time-repeat values */
+
+/* report functions: `report' is the currently selected reporter.
+   bench_main() initializes it to report_verbose and switches it with
+   --report-mflops, --report-time, --report-benchmark and
+   --report-verbose. */
+extern void (*report)(const bench_problem *p, double *t, int st);
+
+void report_mflops(const bench_problem *p, double *t, int st);
+void report_time(const bench_problem *p, double *t, int st);
+void report_benchmark(const bench_problem *p, double *t, int st);
+void report_verbose(const bench_problem *p, double *t, int st);
+
+void report_can_do(const char *param);	/* --can-do: prints "#t" or "#f" (can-do.c) */
+void report_info(const char *param);	/* --info: prints the bench_doc value(s) for one key (info.c) */
+void report_info_all(void);		/* --info-all: prints every bench_doc entry (info.c) */
+
+extern int aligned_main(int argc, char *argv[]);
+extern int bench_main(int argc, char *argv[]);	/* option-processing driver; called from main() in main.c */
+
+extern void speed(const char *param, int setup_only);	/* bench_main() passes setup_only = 1 for --setup-speed, 0 otherwise */
+extern void accuracy(const char *param, int rounds, int impulse_rounds);
+
+extern double mflops(const bench_problem *p, double t);
+
+extern double bench_drand(void);
+extern void bench_srand(int seed);	/* bench_main() seeds with 1 unless --random-seed is given */
+
+extern bench_problem *problem_parse(const char *desc);
+
+extern void ovtpvt(const char *format, ...);	/* printf-style output used by the reporters */
+extern void ovtpvt_err(const char *format, ...);
+
+extern void fftaccuracy(int n, bench_complex *a, bench_complex *ffta,
+			int sign, double err[6]);
+extern void fftaccuracy_done(void);
+
+extern void caset(bench_complex *A, int n, bench_complex x);	/* set n complex elements of A to x */
+extern void aset(bench_real *A, int n, bench_real x);	/* set n real elements of A to x */
--- a/fftw-3.3.10/libbench2/can-do.c
+++ b/fftw-3.3.10/libbench2/can-do.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "libbench2/bench.h"
+#include <stdio.h>
+
+/*
+ * Parse the problem description param and print "#t" if can_do()
+ * accepts the resulting problem, "#f" otherwise.  The parsed problem
+ * is destroyed before returning.
+ */
+void report_can_do(const char *param)
+{
+     bench_problem *prob = problem_parse(param);
+     int answer;
+
+     if (can_do(prob))
+	  answer = 't';
+     else
+	  answer = 'f';
+
+     ovtpvt("#%c\n", answer);
+     problem_destroy(prob);
+}
--- a/fftw-3.3.10/libbench2/caset.c
+++ b/fftw-3.3.10/libbench2/caset.c
@@ -0,0 +1,12 @@
+/* not worth copyrighting */
+
+#include "libbench2/bench.h"
+
+/* Set each of the first n complex elements of A to x; nothing is written when n <= 0. */
+void caset(bench_complex *A, int n, bench_complex x)
+{
+     bench_complex *cur = A;
+     int left;
+
+     for (left = n; left > 0; --left, ++cur) {
+	  c_re(*cur) = c_re(x);
+	  c_im(*cur) = c_im(x);
+     }
+}
--- a/fftw-3.3.10/libbench2/dotens2.c
+++ b/fftw-3.3.10/libbench2/dotens2.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "verify.h"
+
+/*
+ * Walk two tensors of equal shape in lock step, one dimension per
+ * recursion level.  indx0/ondx0 and indx1/ondx1 are the input/output
+ * offsets accumulated so far for the first and second tensor; once
+ * every dimension has been consumed (rnk == 0) they are handed to the
+ * closure's apply callback.
+ */
+static void recur(int rnk, const bench_iodim *dims0, const bench_iodim *dims1,
+		  dotens2_closure *k,
+		  int indx0, int ondx0, int indx1, int ondx1)
+{
+     int count, left;
+     int istep0, ostep0, istep1, ostep1;
+
+     if (rnk == 0) {
+	  k->apply(k, indx0, ondx0, indx1, ondx1);
+	  return;
+     }
+
+     /* strides of the outermost remaining dimension */
+     count = dims0->n;
+     istep0 = dims0->is;
+     ostep0 = dims0->os;
+     istep1 = dims1->is;
+     ostep1 = dims1->os;
+
+     BENCH_ASSERT(count == dims1->n);
+
+     for (left = count; left > 0; --left) {
+	  recur(rnk - 1, dims0 + 1, dims1 + 1, k,
+		indx0, ondx0, indx1, ondx1);
+	  indx0 += istep0;
+	  ondx0 += ostep0;
+	  indx1 += istep1;
+	  ondx1 += ostep1;
+     }
+}
+
+/*
+ * Invoke the closure k once for every index of the tensors sz0 and
+ * sz1, which must have the same rank (and, dimension by dimension, the
+ * same lengths).  Nothing is done for a tensor of rank -infinity.
+ */
+void bench_dotens2(const bench_tensor *sz0, const bench_tensor *sz1, dotens2_closure *k)
+{
+     const int rank = sz0->rnk;
+
+     BENCH_ASSERT(rank == sz1->rnk);
+     if (BENCH_FINITE_RNK(rank))
+	  recur(rank, sz0->dims, sz1->dims, k, 0, 0, 0, 0);
+}
--- a/fftw-3.3.10/libbench2/info.c
+++ b/fftw-3.3.10/libbench2/info.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "libbench2/bench.h"
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Print, one per line, the value of every bench_doc entry whose key
+ * equals param.  An entry without a stored value gets it from its f
+ * callback; the result is stored back in the table.
+ */
+void report_info(const char *param)
+{
+     struct bench_doc *entry = bench_doc;
+
+     while (entry->key != 0) {
+	  if (strcmp(param, entry->key) == 0) {
+	       /* computed on first use, then kept in the table */
+	       if (entry->val == 0)
+		    entry->val = entry->f();
+
+	       ovtpvt("%s\n", entry->val);
+	  }
+	  ++entry;
+     }
+}
+
+/*
+ * Print every bench_doc entry as (key "value"), followed by a
+ * (benchmark-precision "...") line naming the working precision.
+ * Entries without a stored value get it from their f callback.
+ */
+void report_info_all(void)
+{
+     struct bench_doc *entry = bench_doc;
+     const char *precision;
+
+     /*
+      * TODO: escape quotes?  The format is not unambiguously
+      * parseable if the info string contains double quotes.
+      */
+     while (entry->key != 0) {
+	  if (entry->val == 0)
+	       entry->val = entry->f();
+	  ovtpvt("(%s \"%s\")\n", entry->key, entry->val);
+	  ++entry;
+     }
+
+     if (SINGLE_PRECISION)
+	  precision = "single";
+     else if (LDOUBLE_PRECISION)
+	  precision = "long-double";
+     else if (QUAD_PRECISION)
+	  precision = "quad";
+     else
+	  precision = "double";
+
+     ovtpvt("(benchmark-precision \"%s\")\n", precision);
+}
+
--- a/fftw-3.3.10/libbench2/main.c
+++ b/fftw-3.3.10/libbench2/main.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "libbench2/bench.h"
+
+/* On some systems, we are required to define a dummy main-like
+   routine (called "MAIN__" or something similar) in order to link a
+   C main() with the Fortran libraries.  This is detected by autoconf;
+   see the autoconf 2.52 or later manual.  When F77_DUMMY_MAIN is
+   defined it is used as the name of the stub, which just returns 1. */
+#ifdef F77_DUMMY_MAIN
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+     int F77_DUMMY_MAIN() { return 1; }
+#endif
+
+/*
+ * Program entry point: hands argc/argv to bench_main() and returns its
+ * result.  It is in a separate file so that the user can override it.
+ */
+int main(int argc, char *argv[])
+{
+     return bench_main(argc, argv);
+}
--- a/fftw-3.3.10/libbench2/mflops.c
+++ b/fftw-3.3.10/libbench2/mflops.c
@@ -0,0 +1,32 @@
+/* not worth copyrighting */
+
+#include "libbench2/bench.h"
+#include <math.h>
+
+/*
+ * Convert the time t taken by problem p into the benchmark's "mflops"
+ * figure.
+ *
+ * With size = tensor_sz(p->sz) and vsize = tensor_sz(p->vecsz):
+ *   - size <= 1 is treated as a copy and scored as reals copied per
+ *     (t * 1e6): 2 * size * vsize for complex data, size * vsize for
+ *     real and r2r data;
+ *   - otherwise complex problems score 5 * size * vsize * log2(size)
+ *     and real/r2r problems 2.5 * size * vsize * log2(size), each
+ *     divided by (t * 1e6).
+ * An unknown problem kind trips BENCH_ASSERT.
+ */
+double mflops(const bench_problem *p, double t)
+{
+     const size_t size = tensor_sz(p->sz);
+     const size_t vsize = tensor_sz(p->vecsz);
+     const int is_complex = (p->kind == PROBLEM_COMPLEX);
+     const int is_real = (p->kind == PROBLEM_REAL || p->kind == PROBLEM_R2R);
+
+     if (size <= 1) {
+	  /* a copy: just return reals copied / time */
+	  if (is_complex)
+	       return (2.0 * size * vsize / (t * 1.0e6));
+	  if (is_real)
+	       return (1.0 * size * vsize / (t * 1.0e6));
+     }
+
+     if (is_complex)
+	  return (5.0 * size * vsize * log((double)size) /
+		  (log(2.0) * t * 1.0e6));
+     if (is_real)
+	  return (2.5 * vsize * size * log((double) size) /
+		  (log(2.0) * t * 1.0e6));
+
+     BENCH_ASSERT(0 /* can't happen */);
+     return 0.0;
+}
+
--- a/fftw-3.3.10/libbench2/mp.c
+++ b/fftw-3.3.10/libbench2/mp.c
@@ -0,0 +1,641 @@
+#include "config.h"
+#include "libbench2/bench.h"
+#include <math.h>
+/*
+ * Small multiple-precision floating-point arithmetic.
+ *
+ * A number of type N has a sign (1 or -1), an exponent counted in
+ * radix-65536 digits and LEN digits stored least-significant first,
+ * so d[LEN - 1] is the leading digit.  Zero is marked by the reserved
+ * exponent ZEROEXP.  N is a one-element array type, so values of type
+ * N are passed to functions by reference.
+ */
+#define DG unsigned short
+#define ACC unsigned long
+#define REAL bench_real
+#define BITS_IN_REAL 53 /* mantissa; NOTE(review): fixed at 53 whatever type bench_real is -- confirm */
+
+#define SHFT 16
+#define RADIX 65536L
+#define IRADIX (1.0 / RADIX)
+#define LO(x) ((x) & (RADIX - 1))
+#define HI(x) ((x) >> SHFT)
+#define HI_SIGNED(x) \
+   ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1))
+#define ZEROEXP (-32768)
+
+#define LEN 10
+
+typedef struct {
+     short sign;	/* 1 or -1 */
+     short expt;	/* exponent in radix digits, or ZEROEXP for zero */
+     DG d[LEN]; 	/* digits, least significant first */
+} N[1];
+/* Shorthands for the fields of parameters conventionally named a, b and c. */
+#define EXA a->expt
+#define EXB b->expt
+#define EXC c->expt
+
+#define AD a->d
+#define BD b->d
+
+#define SGNA a->sign
+#define SGNB b->sign
+
+static const N zero = {{ 1, ZEROEXP, {0} }};
+
+/* Copy the multiple-precision number a into b. */
+static void cpy(const N a, N b)
+{
+     b[0] = a[0];
+}
+
+/*
+ * Convert the machine floating-point value x into the
+ * multiple-precision number a.  Zero maps to the canonical zero;
+ * otherwise the magnitude is scaled into [IRADIX, 1) while counting
+ * radix digits for the exponent, and then up to LEN digits are
+ * extracted, most significant first.
+ */
+static void fromreal(REAL x, N a)
+{
+     int pos;
+     int expo = 0;
+
+     cpy(zero, a);
+     if (x == 0.0)
+	  return;
+
+     /* record the sign and carry on with the magnitude */
+     SGNA = 1;
+     if (!(x >= 0)) {
+	  SGNA = -1;
+	  x = -x;
+     }
+
+     /* normalize, one radix digit at a time */
+     for (; x >= 1.0; ++expo)
+	  x *= IRADIX;
+     for (; x < IRADIX; --expo)
+	  x *= RADIX;
+     EXA = expo;
+
+     /* peel off the digits until the fraction is exhausted */
+     pos = LEN - 1;
+     while (pos >= 0 && x != 0.0) {
+	  REAL digit;
+
+	  x *= RADIX;
+	  digit = (REAL) ((int) x);
+	  AD[pos] = (DG)digit;
+	  x -= digit;
+	  --pos;
+     }
+}
+
+/*
+ * Set a to the small integer x: its magnitude becomes the single
+ * leading digit, with exponent 1.  The exponent is 1 even when x is
+ * 0, i.e. the result is not the canonical zero.
+ */
+static void fromshort(int x, N a)
+{
+     const int negative = (x < 0);
+
+     cpy(zero, a);
+     SGNA = negative ? -1 : 1;
+     EXA = 1;
+     AD[LEN - 1] = negative ? -x : x;
+}
+
+/*
+ * Normalize the l raw digits d[0..l-1] (least significant first) into
+ * the number a.
+ *
+ * e is the exponent that applies if d[l - 1] is the leading digit and
+ * s is the sign.  Leading zero digits are dropped, lowering the
+ * exponent by one for each; the most significant LEN of the remaining
+ * digits are kept and any missing low-order positions are filled with
+ * zeros.  If every digit is zero, a becomes the canonical zero.
+ */
+static void pack(DG *d, int e, int s, int l, N a)
+{
+     int top = l - 1;
+     int j;
+
+     /* locate the leading nonzero digit */
+     while (top >= 0 && d[top] == 0) {
+	  --top;
+	  --e;
+     }
+
+     if (top < 0) {
+	  /* number is zero */
+	  cpy(zero, a);
+	  return;
+     }
+
+     EXA = e;
+     SGNA = s;
+
+     /* copy from the top down; pad with zeros once d runs out */
+     for (j = LEN - 1; j >= 0; --j, --top)
+	  AD[j] = (top >= 0) ? d[top] : 0;
+}
+
+
+/*
+ * Compare absolute values: returns 1 if |a| > |b|, -1 if |a| < |b|
+ * and 0 if they are equal.  Exponents decide first, then the digits
+ * from the most significant one down.
+ */
+static int abscmp(const N a, const N b)
+{
+     int i = LEN;
+
+     if (EXA != EXB)
+	  return (EXA > EXB) ? 1 : -1;
+
+     while (i-- > 0) {
+	  if (AD[i] != BD[i])
+	       return (AD[i] > BD[i]) ? 1 : -1;
+     }
+     return 0;
+}
+
+/* Return 1 if a and b have the same sign and the same magnitude, 0 otherwise. */
+static int eq(const N a, const N b)
+{
+     if (SGNA != SGNB)
+	  return 0;
+     return abscmp(a, b) == 0;
+}
+
+/* add magnitudes, for |a| >= |b|: c = (s * sign of a) * (|a| + |b|).
+   b's digits are aligned EXA - EXB positions below a's, so the low
+   digits of b that fall below a's least significant digit are not
+   used.  The sum is built in LEN + 1 raw digits (one extra for the
+   final carry) and normalized by pack(). */
+static void addmag0(int s, const N a, const N b, N c)
+{
+     int ia, ib;
+     ACC r = 0;		/* running carry */
+     DG d[LEN + 1];
+
+     for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
+	  r += (ACC)AD[ia] + (ACC)BD[ib];
+	  d[ia] = LO(r);
+	  r = HI(r);
+     }     
+     for (; ia < LEN; ++ia) {	/* digits of a above the top of b */
+	  r += (ACC)AD[ia];
+	  d[ia] = LO(r);
+	  r = HI(r);
+     }
+     d[ia] = LO(r);
+     pack(d, EXA + 1, s * SGNA, LEN + 1, c);
+}
+
+/*
+ * c = sum of the magnitudes of a and b, with the sign taken from a
+ * when |a| > |b| and from s times the sign of b otherwise.  The
+ * operands are ordered so that addmag0() sees the larger one first.
+ */
+static void addmag(int s, const N a, const N b, N c)
+{
+     const int a_is_larger = (abscmp(a, b) > 0);
+
+     if (a_is_larger)
+	  addmag0(1, a, b, c);
+     else
+	  addmag0(s, b, a, c);
+}
+
+/* subtract magnitudes, for |a| >= |b|: c = (s * sign of a) * (|a| - |b|).
+   b's digits are aligned EXA - EXB positions below a's, as in
+   addmag0().  r is an unsigned accumulator; HI_SIGNED() is used in
+   place of HI() so that a wrapped (negative) intermediate value is
+   carried to the next digit as a borrow. */
+static void submag0(int s, const N a, const N b, N c)
+{
+     int ia, ib;
+     ACC r = 0;		/* running carry/borrow */
+     DG d[LEN];
+
+     for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
+	  r += (ACC)AD[ia] - (ACC)BD[ib];
+	  d[ia] = LO(r);
+	  r = HI_SIGNED(r);
+     }     
+     for (; ia < LEN; ++ia) {	/* digits of a above the top of b */
+	  r += (ACC)AD[ia];
+	  d[ia] = LO(r);
+	  r = HI_SIGNED(r);
+     }
+
+     pack(d, EXA, s * SGNA, LEN, c);
+}
+
+/*
+ * c = difference of the magnitudes of a and b, with the sign taken
+ * from a when |a| > |b| and from s times the sign of b otherwise.
+ * The operands are ordered so that submag0() sees the larger one
+ * first.
+ */
+static void submag(int s, const N a, const N b, N c)
+{
+     const int a_is_larger = (abscmp(a, b) > 0);
+
+     if (a_is_larger)
+	  submag0(1, a, b, c);
+     else
+	  submag0(s, b, a, c);
+}
+
+/* c = a + b: magnitudes add when the signs agree and subtract when they differ */
+static void add(const N a, const N b, N c)
+{
+     if (SGNA != SGNB)
+	  submag(1, a, b, c);
+     else
+	  addmag(1, a, b, c);
+}
+
+/* c = a - b: magnitudes subtract when the signs agree and add when they differ */
+static void sub(const N a, const N b, N c)
+{
+     if (SGNA != SGNB)
+	  addmag(-1, a, b, c);
+     else
+	  submag(-1, a, b, c);
+}
+
+/*
+ * c = a * b, by schoolbook multiplication of the digit arrays.  The
+ * full 2 * LEN digit product is accumulated in a local buffer and
+ * then normalized by pack(), so c may be the same object as a or b.
+ */
+static void mul(const N a, const N b, N c)
+{
+     DG prod[2 * LEN];
+     int ia, ib, pos;
+
+     for (pos = 0; pos < 2 * LEN; ++pos)
+	  prod[pos] = 0;
+
+     for (ia = 0; ia < LEN; ++ia) {
+	  const ACC digit = AD[ia];
+	  ACC carry = 0;
+
+	  if (digit == 0)
+	       continue;	/* a zero digit adds nothing to the product */
+
+	  for (ib = 0; ib < LEN; ++ib) {
+	       carry += digit * (ACC)BD[ib] + (ACC)prod[ia + ib];
+	       prod[ia + ib] = LO(carry);
+	       carry = HI(carry);
+	  }
+	  prod[ia + LEN] = LO(carry);
+     }
+
+     pack(prod, EXA + EXB, SGNA * SGNB, 2 * LEN, c);
+}
+/*
+ * Convert the multiple-precision number a to a REAL.
+ *
+ * The leading digits are summed into h until BITS_IN_REAL bits are
+ * covered, the following digits into l, and any nonzero digit below
+ * those only adds half a unit of the last guard position to l (the
+ * "sticky" contribution).  h + l is then scaled by RADIX to the power
+ * EXA and given the sign of a.  The canonical zero converts to 0.0.
+ */
+static REAL toreal(const N a)
+{
+     REAL h, l, f;
+     int i, bits;
+     ACC r;
+     DG sticky;
+
+     if (EXA != ZEROEXP) {
+	  f = IRADIX;
+	  i = LEN;
+
+	  bits = 0;
+	  h = (r = AD[--i]) * f; f *= IRADIX;	/* leading digit; r keeps a copy for the bit count */
+	  for (bits = 0; r > 0; ++bits)	/* number of significant bits in the leading digit */
+	       r >>= 1;
+
+	  /* first digit */
+	  while (bits + SHFT <= BITS_IN_REAL) {
+	       h += AD[--i] * f;  f *= IRADIX; bits += SHFT;
+	  }
+
+	  /* guard digit (leave one bit for sticky bit, hence `<' instead
+	     of `<=') */
+	  bits = 0; l = 0.0;
+	  while (bits + SHFT < BITS_IN_REAL) {
+	       l += AD[--i] * f;  f *= IRADIX; bits += SHFT;
+	  }
+	  
+	  /* sticky bit */
+	  sticky = 0;
+	  while (i > 0) 
+	       sticky |= AD[--i];
+
+	  if (sticky)
+	       l += (RADIX / 2) * f;
+
+	  h += l;
+
+	  for (i = 0; i < EXA; ++i) h *= (REAL)RADIX;	/* positive exponent */
+	  for (i = 0; i > EXA; --i) h *= IRADIX;	/* negative exponent */
+	  if (SGNA == -1) h = -h;
+	  return h;
+     } else {
+	  return 0.0;
+     }
+}
+/* Negate a in place by flipping its sign field. */
+static void neg(N a)
+{
+     SGNA = -SGNA;
+}
+
+/*
+ * x = 1 / a, by Newton's iteration x <- x * (2 - a * x).
+ *
+ * The starting value is the reciprocal computed in REAL arithmetic.
+ * The iteration stops as soon as the correction factor 2 - a * x is
+ * exactly one in the multiple-precision representation.
+ */
+static void inv(const N a, N x)
+{
+     N one, two, ax, corr;
+
+     fromshort(1, one);
+     fromshort(2, two);
+     fromreal(1.0 / toreal(a), x);	/* initial guess */
+
+     mul(a, x, ax);
+     sub(two, ax, corr);
+     while (!eq(one, corr)) {
+	  /* Newton */
+	  mul(x, corr, x);
+	  mul(a, x, ax);
+	  sub(two, ax, corr);
+     }
+}
+
+
+/* 2 pi: sign 1, exponent 1, digits least significant first (the
+   leading digit is the trailing 6) */
+static const N n2pi = {{
+     1, 1,
+     {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6}
+}};
+
+/* 1 / 31!, the starting coefficient of the series in msin() */
+static const N i31fac = {{ 
+     1, -7, 
+     {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382}
+}};
+
+
+/* 1 / 32!, the starting coefficient of the series in mcos() */
+static const N i32fac = {{
+     1, -7,
+     {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293}
+}};
+
+/*
+ * b = sin(a), from the Taylor series up to the a^31 / 31! term,
+ * evaluated from the highest term down.  Each pass turns the running
+ * coefficient 1 / n! into 1 / (n - 2)! and replaces b by
+ * 1 / (n - 2)! - a^2 * b; the final multiplication by a supplies the
+ * odd power.
+ */
+static void msin(const N a, N b)
+{
+     N asq, coef, factor, term;
+     int n;
+
+     cpy(i31fac, coef);
+     cpy(coef, b);
+     mul(a, a, asq);
+
+     /* Taylor */
+     n = 31;
+     while (n > 1) {
+	  fromshort(n * (n - 1), factor);
+	  mul(factor, coef, coef);
+	  mul(asq, b, term);
+	  sub(coef, term, b);
+	  n -= 2;
+     }
+     mul(a, b, b);
+}
+
+/*
+ * b = cos(a), from the Taylor series up to the a^32 / 32! term,
+ * evaluated from the highest term down.  Each pass turns the running
+ * coefficient 1 / n! into 1 / (n - 2)! and replaces b by
+ * 1 / (n - 2)! - a^2 * b.
+ */
+static void mcos(const N a, N b)
+{
+     N asq, coef, factor, term;
+     int n;
+
+     cpy(i32fac, coef);
+     cpy(coef, b);
+     mul(a, a, asq);
+
+     /* Taylor */
+     n = 32;
+     while (n > 0) {
+	  fromshort(n * (n - 1), factor);
+	  mul(factor, coef, coef);
+	  mul(asq, b, term);
+	  sub(coef, term, b);
+	  n -= 2;
+     }
+}
+
+/* a = 2 pi m / n, computed in multiple precision. */
+static void by2pi(REAL m, REAL n, N a)
+{
+     N denom, numer;
+
+     fromreal(n, denom);
+     inv(denom, a);		/* a = 1 / n */
+     fromreal(m, numer);
+     mul(a, numer, a);		/* a = m / n */
+     mul(n2pi, a, a);		/* a = 2 pi m / n */
+}
+
+static void sin2pi(REAL m, REAL n, N a);
+
+/*
+ * a = cos(2 pi m / n).  The argument is folded by symmetry until
+ * m <= n / 8, switching to sin2pi() where appropriate, so that the
+ * series in mcos() is only evaluated for small angles.
+ */
+static void cos2pi(REAL m, REAL n, N a)
+{
+     if (m < 0) {
+	  cos2pi(-m, n, a);
+	  return;
+     }
+     if (m > n * 0.5) {
+	  cos2pi(n - m, n, a);
+	  return;
+     }
+     if (m > n * 0.25) {
+	  sin2pi(m - n * 0.25, n, a);
+	  neg(a);
+	  return;
+     }
+     if (m > n * 0.125) {
+	  sin2pi(n * 0.25 - m, n, a);
+	  return;
+     }
+
+     {
+	  N angle;
+
+	  by2pi(m, n, angle);
+	  mcos(angle, a);
+     }
+}
+
+/*
+ * a = sin(2 pi m / n).  The argument is folded by symmetry until
+ * m <= n / 8, switching to cos2pi() where appropriate, so that the
+ * series in msin() is only evaluated for small angles.
+ */
+static void sin2pi(REAL m, REAL n, N a)
+{
+     if (m < 0) {
+	  sin2pi(-m, n, a);
+	  neg(a);
+	  return;
+     }
+     if (m > n * 0.5) {
+	  sin2pi(n - m, n, a);
+	  neg(a);
+	  return;
+     }
+     if (m > n * 0.25) {
+	  cos2pi(m - n * 0.25, n, a);
+	  return;
+     }
+     if (m > n * 0.125) {
+	  cos2pi(n * 0.25 - m, n, a);
+	  return;
+     }
+
+     {
+	  N angle;
+
+	  by2pi(m, n, angle);
+	  msin(angle, a);
+     }
+}
+
+/*----------------------------------------------------------------------*/
+/* FFT stuff */
+
+/*
+ * (r2 + i i2) = (r0 + i i0)(r1 + i i1)
+ *
+ * All four products are formed before either output is written, so
+ * the outputs may be the same objects as the inputs.
+ */
+static void cmul(N r0, N i0, N r1, N i1, N r2, N i2)
+{
+     N rr, ii, ri, ir, re;
+
+     mul(r0, r1, rr);
+     mul(i0, i1, ii);
+     mul(r0, i1, ri);
+     mul(i0, r1, ir);
+
+     sub(rr, ii, re);		/* real part: r0 r1 - i0 i1 */
+     add(ri, ir, i2);		/* imaginary part: r0 i1 + i0 r1 */
+     cpy(re, r2);
+}
+
+/*
+ * (r2 + i i2) = (r0 - i i0)(r1 + i i1), i.e. the first operand is
+ * conjugated before multiplying.
+ *
+ * Both result parts are built in temporaries and stored only after
+ * every input has been read, so the outputs may alias the inputs.
+ */
+static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2)
+{
+     N p, q, re, im;
+
+     /* real part: r0*r1 + i0*i1 */
+     mul(r0, r1, p);
+     mul(i0, i1, q);
+     add(p, q, re);
+
+     /* imaginary part: r0*i1 - i0*r1 */
+     mul(r0, i1, p);
+     mul(i0, r1, q);
+     sub(p, q, im);
+
+     cpy(im, i2);
+     cpy(re, r2);
+}
+
+/*
+ * (r, i) = (cos, sin) of the angle by2pi(m, n), built by binary
+ * exponentiation from a table w[k] = (cos2pi, sin2pi)(2^k, n).
+ * The table is rebuilt only when n differs from the previous call.
+ * For m <= 0 the conjugated table entries are multiplied in, using
+ * the bits of -m.
+ */
+static void mcexp(int m, int n, N r, N i)
+{
+     static int cached_n = -1;
+     static N w[64][2];
+     int bit, conj;
+
+     if (cached_n != n) {
+	  int p = 1;
+	  for (bit = 0; p < n; ++bit) {
+	       cos2pi(p, n, w[bit][0]);
+	       sin2pi(p, n, w[bit][1]);
+	       p += p;
+	  }
+	  cached_n = n;
+     }
+
+     /* start from 1 + 0i */
+     fromshort(1, r);
+     fromshort(0, i);
+
+     conj = !(m > 0);
+     if (conj)
+	  m = -m;
+
+     for (bit = 0; m; ++bit, m >>= 1) {
+	  if (!(m & 1))
+	       continue;
+	  if (conj)
+	       cmulj(w[bit][0], w[bit][1], r, i, r, i);
+	  else
+	       cmul(w[bit][0], w[bit][1], r, i, r, i);
+     }
+}
+
+/*
+ * In-place permutation of the n complex entries of a (stored as
+ * interleaved pairs a[2*k], a[2*k+1]): entry i is exchanged with
+ * entry j, where j is advanced as a bit-reversed counter.
+ * NOTE(review): presumably requires n to be a power of two, as in the
+ * calls made from fft0() -- confirm.
+ */
+static void bitrev(int n, N *a)
+{
+     int i, j, m;
+     for (i = j = 0; i < n - 1; ++i) {
+	  /* swap each pair only once */
+	  if (i < j) {
+	       N t;
+	       cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]);
+	       cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]);
+	  }
+
+	  /* bit reversed counter */
+	  /* flip bits of j from the top (n/2) downwards, stopping after
+	     the first bit that goes from 0 to 1 */
+	  m = n; do { m >>= 1; j ^= m; } while (!(j & m));
+     }
+}
+
+/*
+ * In-place radix-2 transform of the n interleaved complex entries of
+ * a.  The data is first permuted with bitrev(); then stages with
+ * half-size i = 1, 2, 4, ... combine pairs that are i entries apart.
+ * `sign' is passed to mcexp() and selects the sign of the twiddle
+ * exponent.
+ * NOTE(review): callers in this file only pass power-of-two n --
+ * other sizes look unsupported; confirm.
+ */
+static void fft0(int n, N *a, int sign)
+{
+     int i, j, k;
+
+     bitrev(n, a);
+     for (i = 1; i < n; i = 2 * i) {
+	  for (j = 0; j < i; ++j) {
+	       N wr, wi;
+	       /* twiddle (wr, wi) = mcexp(sign * j, 2 * i), shared by
+		  every butterfly with offset j in this stage */
+	       mcexp(sign * (int)j, 2 * i, wr, wi);
+	       for (k = j; k < n; k += 2 * i) {
+		    N *a0 = a + 2 * k;
+		    N *a1 = a0 + 2 * i;
+		    N r0, i0, r1, i1, t0, t1, xr, xi;
+		    /* copy the inputs out first: outputs overwrite them */
+		    cpy(a0[0], r0); cpy(a0[1], i0);
+		    cpy(a1[0], r1); cpy(a1[1], i1);
+		    /* x = a1 * w */
+		    mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr);
+		    mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi);
+		    /* butterfly: a0 <- a0 + x, a1 <- a0 - x */
+		    add(r0, xr, a0[0]);  add(i0, xi, a0[1]);
+		    sub(r0, xr, a1[0]);  sub(i0, xi, a1[1]);
+	       }
+	  }
+     }
+}
+
+/* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */
+/* Fills the n interleaved complex entries of a.  k^2 is not computed
+   directly: ksq is updated incrementally via k^2 = (k-1)^2 + 2*k - 1
+   and reduced by multiples of 2*n as it goes. */
+static void bluestein_sequence(int n, N *a)
+{
+     int k, ksq, n2 = 2 * n;
+
+     ksq = 1; /* (-1)^2 */
+     for (k = 0; k < n; ++k) {
+	  /* careful with overflow */
+	  /* the test is `>', so ksq may be left equal to n2 itself */
+	  ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2;
+	  mcexp(ksq, n2, a[2*k], a[2*k+1]);
+     }
+}
+
+/* smallest power of two that is >= x; yields 1 for every x <= 1 */
+static int pow2_atleast(int x)
+{
+     int h = 1;
+
+     while (h < x)
+	  h += h;
+     return h;
+}
+
+/* Cache shared by successive bluestein() calls of the same size:
+   w is the sequence from bluestein_sequence() (2*n entries), y is its
+   zero-padded, wrapped-around copy after fft0() (2*nb entries), and
+   cached_bluestein_n is the n they were built for (-1 = nothing
+   cached).  Released by fftaccuracy_done(). */
+static N *cached_bluestein_w = 0;
+static N *cached_bluestein_y = 0;
+static int cached_bluestein_n = -1;
+
+/*
+ * In-place transform of the n interleaved complex entries of a for
+ * arbitrary n, expressed as a cyclic convolution of power-of-two
+ * length nb >= 2*n that is carried out with fft0().
+ * NOTE(review): the function takes no sign argument; fft1() obtains
+ * the sign == 1 case by swapping real and imaginary parts around the
+ * call, so this presumably computes the sign == -1 transform --
+ * confirm.
+ */
+static void bluestein(int n, N *a)
+{
+     int nb = pow2_atleast(2 * n);
+     /* scratch buffer of nb complex values, freed before returning */
+     N *b = (N *)bench_malloc(2 * nb * sizeof(N));
+     N *w = cached_bluestein_w;
+     N *y = cached_bluestein_y;
+     N nbinv;
+     int i;
+
+     fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */
+
+     /* (re)build the cached sequences when the size changes */
+     if (cached_bluestein_n != n) {
+	  if (w) bench_free(w);
+	  if (y) bench_free(y);
+	  w = (N *)bench_malloc(2 * n * sizeof(N));
+	  y = (N *)bench_malloc(2 * nb * sizeof(N));
+	  cached_bluestein_n = n;
+	  cached_bluestein_w = w;
+	  cached_bluestein_y = y;
+
+	  bluestein_sequence(n, w);
+	  for (i = 0; i < 2*nb; ++i)  cpy(zero, y[i]);
+
+	  /* y[i] = w[i] for 0 <= i < n ... */
+	  for (i = 0; i < n; ++i) {
+	       cpy(w[2*i], y[2*i]);
+	       cpy(w[2*i+1], y[2*i+1]);
+	  }
+	  /* ... and y[nb - i] = w[i] for 1 <= i < n (wrap-around) */
+	  for (i = 1; i < n; ++i) {
+	       cpy(w[2*i], y[2*(nb-i)]);
+	       cpy(w[2*i+1], y[2*(nb-i)+1]);
+	  }
+
+	  fft0(nb, y, -1);
+     }
+
+     for (i = 0; i < 2*nb; ++i)  cpy(zero, b[i]);
+     
+     /* b[i] = conj(w[i]) * a[i]; the remaining entries stay zero */
+     for (i = 0; i < n; ++i) 
+	  cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]);
+
+     /* scaled convolution b * y */
+     fft0(nb, b, -1);
+
+     /* pointwise product in the transformed domain */
+     for (i = 0; i < nb; ++i) 
+	  cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]);
+     fft0(nb, b, 1);
+
+     /* a[i] = conj(w[i]) * b[i] / nb; the 1/nb factor undoes the
+	scaling left by the fft0() round trip */
+     for (i = 0; i < n; ++i) {
+	  cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]);
+	  mul(nbinv, a[2*i], a[2*i]);
+	  mul(nbinv, a[2*i+1], a[2*i+1]);
+     }
+
+     bench_free(b);
+}
+
+/* exchange the real and imaginary part of each of the n interleaved
+   complex entries of a */
+static void swapri(int n, N *a)
+{
+     N *re = a;
+     int left;
+
+     for (left = n; left > 0; --left, re += 2) {
+	  N *im = re + 1;
+	  N tmp;
+
+	  cpy(*re, tmp);
+	  cpy(*im, *re);
+	  cpy(tmp, *im);
+     }
+}
+
+/*
+ * In-place transform of the n interleaved complex entries of a.
+ * Power-of-two sizes go straight to fft0(); every other size goes
+ * through bluestein(), which has no sign argument, so for sign == 1
+ * the real and imaginary parts are swapped before and after the call.
+ */
+static void fft1(int n, N *a, int sign)
+{
+     int swap;
+
+     if (power_of_two(n)) {
+	  fft0(n, a, sign);
+	  return;
+     }
+
+     swap = (sign == 1);
+     if (swap)
+	  swapri(n, a);
+     bluestein(n, a);
+     if (swap)
+	  swapri(n, a);
+}
+
+/* convert n bench_complex values into 2*n multiprecision numbers,
+   stored interleaved as (real, imaginary) pairs in b */
+static void fromrealv(int n, bench_complex *a, N *b)
+{
+     int idx;
+
+     for (idx = 0; idx < n; ++idx) {
+	  N *pair = b + 2 * idx;
+
+	  fromreal(c_re(a[idx]), pair[0]);
+	  fromreal(c_im(a[idx]), pair[1]);
+     }
+}
+
+/*
+ * Relative difference between the 2*n numbers in a and in b, written
+ * to err[0..2]:
+ *   err[0] = sum |a - b| / sum |a|               (1-norm)
+ *   err[1] = sqrt(sum (a - b)^2 / sum a^2)       (2-norm)
+ *   err[2] = max |a - b| / max |a|               (infinity norm)
+ * The sums run over the values after conversion with toreal().  The
+ * denominators are not checked for zero.
+ */
+static void compare(int n, N *a, N *b, double *err)
+{
+     int i;
+     double e1, e2, einf;
+     double n1, n2, ninf;
+
+     e1 = e2 = einf = 0.0;
+     n1 = n2 = ninf = 0.0;
+
+     /* fold |var| into the three running norms x1, x2, xinf */
+#    define DO(x1, x2, xinf, var) { 			\
+     double d = var;					\
+     if (d < 0) d = -d;					\
+     x1 += d; x2 += d * d; if (d > xinf) xinf = d;	\
+}
+	  
+     for (i = 0; i < 2 * n; ++i) {
+	  N dd;
+	  /* the difference is formed in multiprecision and only then
+	     converted to double */
+	  sub(a[i], b[i], dd);
+	  DO(n1, n2, ninf, toreal(a[i]));
+	  DO(e1, e2, einf, toreal(dd));
+     }
+
+#    undef DO
+     err[0] = e1 / n1;
+     err[1] = sqrt(e2 / n2);
+     err[2] = einf / ninf;
+}
+
+/*
+ * Measure the accuracy of a transform result against multiprecision
+ * arithmetic.  `a' is the input of size n, `ffta' the transform under
+ * test, `sign' the transform direction.
+ *
+ * err[0..2] receive the forward error: fft1(a, sign) is compared with
+ * ffta.  err[3..5] receive the backward error: a is compared with
+ * fft1(ffta / n, -sign).  See compare() for the three norms.
+ */
+void fftaccuracy(int n, bench_complex *a, bench_complex *ffta,
+		 int sign, double err[6])
+{
+     N *ref = (N *)bench_malloc(2 * n * sizeof(N));
+     N *out = (N *)bench_malloc(2 * n * sizeof(N));
+     N nn, scale;
+     int j;
+
+     /* scale = 1/n */
+     fromreal(n, nn);
+     inv(nn, scale);
+
+     /* forward error */
+     fromrealv(n, a, ref);
+     fromrealv(n, ffta, out);
+     fft1(n, ref, sign);
+     compare(n, ref, out, err);
+
+     /* backward error */
+     fromrealv(n, a, ref);
+     fromrealv(n, ffta, out);
+     for (j = 0; j < 2 * n; ++j) {
+	  mul(out[j], scale, out[j]);
+     }
+     fft1(n, out, -sign);
+     compare(n, ref, out, err + 3);
+
+     bench_free(out);
+     bench_free(ref);
+}
+
+/* release the sequences cached by bluestein() and mark the cache as
+   empty, so that the next call rebuilds them */
+void fftaccuracy_done(void)
+{
+     N *w = cached_bluestein_w;
+     N *y = cached_bluestein_y;
+
+     cached_bluestein_n = -1;
+     cached_bluestein_w = 0;
+     cached_bluestein_y = 0;
+
+     if (w)
+	  bench_free(w);
+     if (y)
+	  bench_free(y);
+}
--- a/fftw-3.3.10/libbench2/my-getopt.c
+++ b/fftw-3.3.10/libbench2/my-getopt.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <string.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "my-getopt.h"
+
+int my_optind = 1;
+const char *my_optarg = 0;
+static const char *scan_pointer = 0;
+
+/*
+ * Print a one-paragraph usage summary for `progname' on stdout,
+ * listing every option of `opt' as " [--long | -c arg]".  The array
+ * is scanned up to the first entry whose long_name is NULL.  A new
+ * line (starting with a tab) is begun when the next option would run
+ * close to column 80; `col' tracks the current output column.
+ */
+void my_usage(const char *progname, const struct my_option *opt)
+{
+    int i;
+    size_t col = 0;
+
+    fprintf(stdout, "Usage: %s", progname);
+    col += (strlen(progname) + 7);
+    for (i = 0; opt[i].long_name; i++) {
+	size_t option_len;
+
+	option_len = strlen(opt[i].long_name);
+	/* written as a sum: `80 - (option_len + 16)' wraps around in
+	   size_t for names longer than 64 characters, which disabled
+	   line breaking altogether */
+	if (col + option_len + 16 >= 80) {
+	    fputs("\n\t", stdout);
+	    col = 8;
+	}
+	fprintf(stdout, " [--%s", opt[i].long_name);
+	col += (option_len + 4);
+	/* only short names below 128 are shown as a character */
+	if (opt[i].short_name < 128) {
+	    fprintf(stdout, " | -%c", opt[i].short_name);
+	    col += 5;
+	}
+	switch (opt[i].argtype) {
+	    case REQARG:
+		 fputs(" arg]", stdout);
+		 col += 5;
+		 break;
+	    case OPTARG:
+		 fputs(" [arg]]", stdout);
+		 col += 7; /* length of " [arg]]" */
+		 break;
+	    default:
+		 fputs("]", stdout);
+		 col++;
+	}
+    }
+
+    fputs ("\n", stdout);
+}
+
+/*
+ * Minimal getopt_long-style scanner.  Each call consumes one option
+ * from argv[] starting at my_optind and returns its short_name, -1
+ * when argv[my_optind] is not an option (or argv is exhausted), or
+ * '?' after printing a message for an unknown option or a missing
+ * required argument.  The option's argument, if any, is left in
+ * my_optarg.
+ *
+ * Supported forms: "--name", "--name=arg", "--name arg", "-c",
+ * "-carg", "-c arg", and clusters of argument-less short options
+ * ("-abc"); the static scan_pointer remembers the position inside
+ * such a cluster between calls.
+ *
+ * Both lookup loops stop at the first entry of optarray whose
+ * short_name is 0.
+ */
+int my_getopt(int argc, char *argv[], const struct my_option *optarray)
+{
+     const char *p;
+     const struct my_option *l;
+
+     if (scan_pointer && *scan_pointer) {
+	  /* continue a previously scanned argv[] element */
+	  p = scan_pointer;
+	  goto short_option;
+     } else {
+	  /* new argv[] element */
+	  if (my_optind >= argc)
+	       return -1; /* no more options */
+
+	  p = argv[my_optind];
+     
+	  if (*p++ != '-')  
+	       return (-1); /* not an option */
+
+	  if (!*p) 
+	       return (-1); /* string is exactly '-' */
+	       
+	  ++my_optind;
+     }
+
+     if (*p == '-') {
+	  /* long option */
+	  scan_pointer = 0;
+	  my_optarg = 0;
+
+	  ++p;
+	  
+	  for (l = optarray; l->short_name; ++l) {
+	       size_t len = strlen(l->long_name);
+	       /* the whole name must match, followed by end of string
+		  or by '=' */
+	       if (!strncmp(l->long_name, p, len) && 
+		   (!p[len] || p[len] == '=')) {
+		    switch (l->argtype) {
+			case NOARG: 
+			     goto ok;
+			case OPTARG: 
+			     /* optional argument: only via "=arg" */
+			     if (p[len] == '=')
+				  my_optarg = p + len + 1;
+			     goto ok;
+			case REQARG: 
+			     if (p[len] == '=') {
+				  my_optarg = p + len + 1;
+				  goto ok;
+			     }
+			     /* otherwise take the next argv[] element */
+			     if (my_optind >= argc) {
+				  fprintf(stderr, 
+					  "option --%s requires an argument\n",
+					  l->long_name);
+				  return '?';
+			     }
+			     my_optarg = argv[my_optind];
+			     ++my_optind;
+			     goto ok;
+		    }
+	       }
+	  }
+     } else {
+     short_option:
+	  scan_pointer = 0;
+	  my_optarg = 0;
+
+	  for (l = optarray; l->short_name; ++l) {
+	       /* skip entries whose short_name does not fit in a char */
+	       if (l->short_name == (char)l->short_name &&
+		   *p == l->short_name) {
+		    ++p;
+		    switch (l->argtype) {
+			case NOARG: 
+			     /* more options may follow in the same
+				argv[] element */
+			     scan_pointer = p;
+			     goto ok;
+			case OPTARG: 
+			     /* optional argument: only if attached */
+			     if (*p)
+				  my_optarg = p;
+			     goto ok;
+			case REQARG: 
+			     if (*p) {
+				  my_optarg = p;
+			     } else {
+				  if (my_optind >= argc) {
+				       fprintf(stderr, 
+					  "option -%c requires an argument\n",
+					  l->short_name);
+				       return '?';
+				  }
+				  my_optarg = argv[my_optind];
+				  ++my_optind;
+			     }
+			     goto ok;
+		    }
+	       }
+	  }
+     }
+
+     /* no table entry matched; my_optind already points past the
+	offending element */
+     fprintf(stderr, "unrecognized option %s\n", argv[my_optind - 1]);
+     return '?';
+
+ ok:
+     return l->short_name;
+}
+
--- a/fftw-3.3.10/libbench2/my-getopt.h
+++ b/fftw-3.3.10/libbench2/my-getopt.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __MY_GETOPT_H__
+#define __MY_GETOPT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif                          /* __cplusplus */
+
+/* argument policy of an option: required, optional, or none */
+enum { REQARG, OPTARG, NOARG };
+
+/* One entry of the option table passed to my_usage() and my_getopt().
+   my_usage() stops at the first entry with a NULL long_name, while
+   my_getopt() stops at the first entry with short_name == 0. */
+struct my_option {
+     const char *long_name;	/* name used after "--" */
+     int argtype;		/* REQARG, OPTARG or NOARG */
+     int short_name;		/* value returned by my_getopt(); also the
+				   "-c" letter when it fits in a char */
+};
+
+extern int my_optind;
+extern const char *my_optarg;
+
+extern void my_usage(const char *progname, const struct my_option *opt);
+extern int my_getopt(int argc, char *argv[], const struct my_option *optarray);
+
+#ifdef __cplusplus
+}                               /* extern "C" */
+#endif                          /* __cplusplus */
+
+#endif /* __MY_GETOPT_H__ */
--- a/fftw-3.3.10/libbench2/ovtpvt.c
+++ b/fftw-3.3.10/libbench2/ovtpvt.c
@@ -0,0 +1,28 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include "libbench2/bench.h"
+
+/* printf-style output on stdout, suppressed when verbose < 0;
+   stdout is flushed in either case */
+void ovtpvt(const char *format, ...)
+{
+     if (verbose >= 0) {
+	  va_list args;
+
+	  va_start(args, format);
+	  vfprintf(stdout, format, args);
+	  va_end(args);
+     }
+     fflush(stdout);
+}
+
+/*
+ * printf-style diagnostic on stderr, suppressed when verbose < 0.
+ * stdout is flushed before the message so that earlier regular output
+ * is not reordered after it, and stderr is flushed right after the
+ * message so it is not left in a buffer (the original only ever
+ * flushed stdout).
+ */
+void ovtpvt_err(const char *format, ...)
+{
+     va_list ap;
+     
+     va_start(ap, format);
+     if (verbose >= 0) {
+	  fflush(stdout);
+	  vfprintf(stderr, format, ap);
+	  fflush(stderr);
+     }
+     va_end(ap);
+     fflush(stdout);
+}
--- a/fftw-3.3.10/libbench2/pow2.c
+++ b/fftw-3.3.10/libbench2/pow2.c
@@ -0,0 +1,6 @@
+#include "libbench2/bench.h"
+
+/* 1 if n is a positive power of two, 0 otherwise */
+int power_of_two(int n)
+{
+     if (n <= 0)
+	  return 0;
+
+     /* a power of two has a single bit set, which n - 1 clears */
+     return (n & (n - 1)) == 0;
+}
--- a/fftw-3.3.10/libbench2/problem.c
+++ b/fftw-3.3.10/libbench2/problem.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "config.h"
+#include "libbench2/bench.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+int always_pad_real = 0; /* by default, only pad in-place case */
+
+typedef enum {
+     SAME, PADDED, HALFISH
+} n_transform;
+
+/* funny transformations for last dimension of PROBLEM_REAL:
+   SAME keeps n, HALFISH gives n/2+1, PADDED gives twice that */
+static int transform_n(int n, n_transform nt)
+{
+     if (nt == SAME)
+	  return n;
+     if (nt == PADDED)
+	  return 2*(n/2+1);
+     if (nt == HALFISH)
+	  return (n/2+1);
+
+     /* not a valid n_transform */
+     BENCH_ASSERT(0);
+     return 0;
+}
+
+/* do what I mean */
+/*
+ * Fill in every stride of tensor t that is still 0 (left unspecified
+ * by parsetensor()).  Dimensions are visited from the last to the
+ * first; a missing stride becomes the stride of the previously
+ * visited dimension d1 times its length.  That length is passed
+ * through transform_n() with nti (input) or nto (output) only when d1
+ * is the dimension dt; otherwise it is used as is.
+ *
+ * *last_iodim supplies the starting d1 and is updated to the first
+ * dimension of t on return, so a later call continues from there.
+ * Tensors of non-finite rank or rank < 1 are returned untouched.
+ * Returns t.
+ */
+static bench_tensor *dwim(bench_tensor *t, bench_iodim **last_iodim,
+			  n_transform nti, n_transform nto,
+			  bench_iodim *dt)
+{
+     int i;
+     bench_iodim *d, *d1;
+
+     if (!BENCH_FINITE_RNK(t->rnk) || t->rnk < 1)
+	  return t;
+
+     i = t->rnk;
+     d1 = *last_iodim;
+
+     while (--i >= 0) {
+	  d = t->dims + i;
+	  if (!d->is) 
+	       d->is = d1->is * transform_n(d1->n, d1==dt ? nti : SAME); 
+	  if (!d->os) 
+	       d->os = d1->os * transform_n(d1->n, d1==dt ? nto : SAME); 
+	  d1 = d;
+     }
+
+     *last_iodim = d1;
+     return t;
+}
+
+/*
+ * Rewrite the output strides of the first two dimensions of t; the
+ * input strides are left alone.  dims[0] takes over the old output
+ * stride of dims[1], and dims[1] gets that same stride times
+ * dims[0].n -- note that the second assignment reads the value the
+ * first one just stored.  Tensors of non-finite rank or rank < 2 are
+ * left untouched.
+ */
+static void transpose_tensor(bench_tensor *t)
+{
+     if (!BENCH_FINITE_RNK(t->rnk) || t->rnk < 2)
+          return;
+
+     t->dims[0].os = t->dims[1].os;
+     t->dims[1].os = t->dims[0].os * t->dims[0].n;
+}
+
+/*
+ * Parse a decimal integer at s into *n and return a pointer to the
+ * first character that was not consumed.
+ *
+ * Syntax: an optional '+' or '-', one or more digits (asserted), then
+ * an optional 'k'/'K' suffix (multiply by 1024) and an optional
+ * 'm'/'M' suffix (multiply by 1024*1024).  The two suffix tests run
+ * one after the other, so "k" followed by "m" applies both.
+ * Overflow of *n is not detected.
+ */
+static const char *parseint(const char *s, int *n)
+{
+     int sign = 1;
+
+     *n = 0;
+
+     if (*s == '-') { 
+	  sign = -1;
+	  ++s;
+     } else if (*s == '+') { 
+	  sign = +1; 
+	  ++s; 
+     }
+
+     /* <ctype.h> functions require a value representable as unsigned
+	char (or EOF); plain char may be negative, which is undefined
+	behavior, hence the casts */
+     BENCH_ASSERT(isdigit((unsigned char)*s));
+     while (isdigit((unsigned char)*s)) {
+	  *n = *n * 10 + (*s - '0');
+	  ++s;
+     }
+     
+     *n *= sign;
+
+     if (*s == 'k' || *s == 'K') {
+	  *n *= 1024;
+	  ++s;
+     }
+
+     if (*s == 'm' || *s == 'M') {
+	  *n *= 1024 * 1024;
+	  ++s;
+     }
+
+     return s;
+}
+
+/* singly linked list of parsed dimensions: car is the dimension, k
+   its r2r kind, cdr the rest of the list */
+struct dimlist { bench_iodim car; r2r_kind_t k; struct dimlist *cdr; };
+
+/*
+ * Parse a tensor description at s, store the newly built tensor in
+ * *tp, and return a pointer to the first unconsumed character.
+ *
+ * Grammar: dimensions separated by 'x' or 'X'; each dimension is
+ *     n[:is[:os]][kind]
+ * where n, is, os are parseint() integers.  With no strides given
+ * both are set to 0 (filled in later by dwim()); with only `is'
+ * given, `os' defaults to `is'.  `kind' is f/b/h (R2HC, HC2R, DHT) or
+ * e/o followed by one of 00, 01, 10, 11 (REDFT / RODFT variants,
+ * read as the integers 0, 1, 10, 11); the default is R2HC.
+ *
+ * If k is non-NULL, *k receives a newly allocated array with one kind
+ * per dimension.
+ */
+static const char *parsetensor(const char *s, bench_tensor **tp,
+			       r2r_kind_t **k)
+{
+     struct dimlist *l = 0, *m;
+     bench_tensor *t;
+     int rnk = 0;
+
+ L1:
+     m = (struct dimlist *)bench_malloc(sizeof(struct dimlist));
+     /* nconc onto l */
+     /* the new node goes to the FRONT, so l ends up in reverse order */
+     m->cdr = l; l = m;
+     ++rnk; 
+
+     s = parseint(s, &m->car.n);
+
+     if (*s == ':') {
+	  /* read input stride */
+	  ++s;
+	  s = parseint(s, &m->car.is);
+	  if (*s == ':') {
+	       /* read output stride */
+	       ++s;
+	       s = parseint(s, &m->car.os);
+	  } else {
+	       /* default */
+	       m->car.os = m->car.is;
+	  }
+     } else {
+	  m->car.is = 0;
+	  m->car.os = 0;
+     }
+
+     if (*s == 'f' || *s == 'F') {
+	  m->k = R2R_R2HC;
+	  ++s;
+     }
+     else if (*s == 'b' || *s == 'B') {
+	  m->k = R2R_HC2R;
+	  ++s;
+     }
+     else if (*s == 'h' || *s == 'H') {
+	  m->k = R2R_DHT;
+	  ++s;
+     }
+     else if (*s == 'e' || *s == 'E' || *s == 'o' || *s == 'O') {
+	  char c = *(s++);
+	  int ab;
+
+	  /* the two-digit code is read as a plain decimal number */
+	  s = parseint(s, &ab);
+
+	  if (c == 'e' || c == 'E') {
+	       if (ab == 0)
+		    m->k = R2R_REDFT00;
+	       else if (ab == 1)
+		    m->k = R2R_REDFT01;
+	       else if (ab == 10)
+		    m->k = R2R_REDFT10;
+	       else if (ab == 11)
+		    m->k = R2R_REDFT11;
+	       else
+		    BENCH_ASSERT(0);
+	  }
+	  else {
+	       if (ab == 0)
+		    m->k = R2R_RODFT00;
+	       else if (ab == 1)
+		    m->k = R2R_RODFT01;
+	       else if (ab == 10)
+		    m->k = R2R_RODFT10;
+	       else if (ab == 11)
+		    m->k = R2R_RODFT11;
+	       else
+		    BENCH_ASSERT(0);
+	  }
+     }
+     else
+	  m->k = R2R_R2HC;
+
+     if (*s == 'x' || *s == 'X') {
+	  ++s;
+	  goto L1;
+     }
+     
+     /* now we have a dimlist.  Build bench_tensor, etc. */
+
+     if (k && rnk > 0) {
+	  int i;
+	  *k = (r2r_kind_t *) bench_malloc(sizeof(r2r_kind_t) * rnk);
+	  /* the list is reversed, so fill the array back to front */
+	  for (m = l, i = rnk - 1; i >= 0; --i, m = m->cdr) {
+	       BENCH_ASSERT(m);
+	       (*k)[i] = m->k;
+	  }
+     }
+
+     t = mktensor(rnk);
+     /* same back-to-front walk; each list node is freed once copied */
+     while (--rnk >= 0) {
+	  bench_iodim *d = t->dims + rnk;
+	  BENCH_ASSERT(l);
+	  m = l; l = m->cdr;
+	  d->n = m->car.n;
+	  d->is = m->car.is;
+	  d->os = m->car.os;
+	  bench_free(m);
+     }
+
+     *tp = t;
+     return s;
+}
+
+/* parse a problem description, return a problem */
+/*
+ * The string s is: zero or more single-character flags, a size
+ * tensor, and an optional vector tensor introduced by '*' or 'v'/'V'.
+ * The whole string must be consumed (asserted).  A copy of s is kept
+ * in p->pstring.
+ *
+ * Flags (case-insensitive):
+ *   i / o   in-place / out-of-place (default out-of-place)
+ *   d       destroy_input
+ *   /       split
+ *   f, -    sign = -1 (default)      b, +   sign = +1
+ *   r, c, k kind = REAL, COMPLEX (default), R2R
+ *   t       transpose the output strides (see transpose_tensor)
+ *   [ , ]   scrambled_in, scrambled_out
+ *
+ * The returned problem is newly allocated; release it with
+ * problem_destroy().
+ */
+bench_problem *problem_parse(const char *s)
+{
+     bench_problem *p;
+     bench_iodim last_iodim0 = {1,1,1}, *last_iodim = &last_iodim0;
+     bench_iodim *sz_last_iodim;
+     bench_tensor *sz;
+     n_transform nti = SAME, nto = SAME;
+     int transpose = 0;
+
+     p = (bench_problem *) bench_malloc(sizeof(bench_problem));
+     p->kind = PROBLEM_COMPLEX;
+     p->k = 0;
+     p->sign = -1;
+     p->in = p->out = 0;
+     p->inphys = p->outphys = 0;
+     p->iphyssz = p->ophyssz = 0;
+     p->in_place = 0;
+     p->destroy_input = 0;
+     p->split = 0;
+     p->userinfo = 0;
+     p->scrambled_in = p->scrambled_out = 0;
+     p->sz = p->vecsz = 0;
+     p->ini = p->outi = 0;
+     p->pstring = (char *) bench_malloc(sizeof(char) * (strlen(s) + 1));
+     strcpy(p->pstring, s);
+
+ L1:
+     /* tolower() requires a value representable as unsigned char (or
+	EOF); plain char may be negative, which is undefined behavior,
+	hence the cast */
+     switch (tolower((unsigned char)*s)) {
+	 case 'i': p->in_place = 1; ++s; goto L1;
+	 case 'o': p->in_place = 0; ++s; goto L1;
+	 case 'd': p->destroy_input = 1; ++s; goto L1;
+	 case '/': p->split = 1; ++s; goto L1;
+	 case 'f': 
+	 case '-': p->sign = -1; ++s; goto L1;
+	 case 'b': 
+	 case '+': p->sign = 1; ++s; goto L1;
+	 case 'r': p->kind = PROBLEM_REAL; ++s; goto L1;
+	 case 'c': p->kind = PROBLEM_COMPLEX; ++s; goto L1;
+	 case 'k': p->kind = PROBLEM_R2R; ++s; goto L1;
+	 case 't': transpose = 1; ++s; goto L1;
+	      
+	 /* hack for MPI: */
+	 case '[': p->scrambled_in = 1; ++s; goto L1;
+	 case ']': p->scrambled_out = 1; ++s; goto L1;
+
+	 default : ;
+     }
+
+     /* per-dimension r2r kinds are only collected for R2R problems */
+     s = parsetensor(s, &sz, p->kind == PROBLEM_R2R ? &p->k : 0);
+
+     /* real problems: the complex side of the last dimension has
+	n/2+1 elements, and the real side is padded to 2*(n/2+1) when
+	in-place (or when always_pad_real is set) */
+     if (p->kind == PROBLEM_REAL) {
+	  if (p->sign < 0) {
+	       nti = p->in_place || always_pad_real ? PADDED : SAME;
+	       nto = HALFISH;
+	  }
+	  else {
+	       nti = HALFISH;
+	       nto = p->in_place || always_pad_real ? PADDED : SAME;
+	  }
+     }
+
+     /* default strides are assigned innermost first: whichever tensor
+	is passed to dwim() first gets the smaller strides */
+     sz_last_iodim = sz->dims + sz->rnk - 1;
+     if (*s == '*') { /* "external" vector */
+	  ++s;
+	  p->sz = dwim(sz, &last_iodim, nti, nto, sz_last_iodim);
+	  s = parsetensor(s, &sz, 0);
+	  p->vecsz = dwim(sz, &last_iodim, nti, nto, sz_last_iodim);
+     } else if (*s == 'v' || *s == 'V') { /* "internal" vector */
+	  bench_tensor *vecsz;
+	  ++s;
+	  s = parsetensor(s, &vecsz, 0);
+	  p->vecsz = dwim(vecsz, &last_iodim, nti, nto, sz_last_iodim);
+	  p->sz = dwim(sz, &last_iodim, nti, nto, sz_last_iodim);
+     } else {
+	  p->sz = dwim(sz, &last_iodim, nti, nto, sz_last_iodim);
+	  p->vecsz = mktensor(0);
+     }
+
+     if (transpose) {
+	  transpose_tensor(p->sz);
+	  transpose_tensor(p->vecsz);
+     }
+
+     /* NOTE(review): p->in is still 0 here, so this only makes
+	p->out differ from p->in for the out-of-place case; presumably
+	both are replaced once the arrays are allocated -- confirm */
+     if (!p->in_place)
+	  p->out = ((bench_real *) p->in) + (1 << 20);  /* whatever */
+
+     BENCH_ASSERT(p->sz && p->vecsz);
+     BENCH_ASSERT(!*s);
+     return p;
+}
+
+/*
+ * Destroy a problem created by problem_parse(): problem_free()
+ * (defined elsewhere) is called first, then the description string
+ * and the r2r kind array are released, and finally the problem
+ * structure itself.
+ */
+void problem_destroy(bench_problem *p)
+{
+     BENCH_ASSERT(p);
+
+     problem_free(p);
+
+     bench_free0(p->pstring);
+     bench_free0(p->k);
+
+     bench_free(p);
+}
+
--- a/fftw-3.3.10/libbench2/report.c
+++ b/fftw-3.3.10/libbench2/report.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "libbench2/bench.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+void (*report)(const bench_problem *p, double *t, int st);
+
+#undef min
+#undef max /* you never know */
+
+/* summary statistics of a set of timing samples */
+struct stats {
+     double min;
+     double max;
+     double avg;
+     double median;
+};
+
+/*
+ * Fill *a with the minimum, maximum, arithmetic mean and median of
+ * the st samples in t.  As a side effect t is sorted in ascending
+ * order.  For an even number of samples the median is the upper of
+ * the two middle elements, t[st / 2].  With st <= 0 all fields are
+ * set to 0.
+ */
+static void mkstat(double *t, int st, struct stats *a)
+{
+     int i, j;
+
+     if (st <= 0) {
+	  /* no samples: do not read t[0] or divide by zero */
+	  a->min = a->max = a->avg = a->median = 0.0;
+	  return;
+     }
+     
+     a->min = t[0];
+     a->max = t[0];
+     a->avg = 0.0;
+
+     for (i = 0; i < st; ++i) {
+	  if (t[i] < a->min)
+	       a->min = t[i];
+	  if (t[i] > a->max)
+	       a->max = t[i];
+	  a->avg += t[i];
+     }
+     a->avg /= (double)st;
+
+     /* compute median --- silly bubblesort algorithm */
+     /* Pass i bubbles the largest of t[0..i] into t[i], so the inner
+	loop must reach the pair (t[i-1], t[i]).  The previous bounds
+	(i > 1, j < i - 1) never compared t[st-1], leaving the last
+	sample unsorted and the median possibly wrong. */
+     for (i = st - 1; i > 0; --i) {
+	  for (j = 0; j < i; ++j) {
+	       double t0, t1;
+	       if ((t0 = t[j]) > (t1 = t[j + 1])) {
+		    t[j] = t1;
+		    t[j + 1] = t0;
+	       }
+	  } 
+     }
+     a->median = t[st / 2];
+}
+
+/* print "(a b c d)": mflops() of the max, avg, min and median time of
+   the st samples in t, in that order */
+void report_mflops(const bench_problem *p, double *t, int st)
+{
+     struct stats s;
+     double at_max, at_avg, at_min, at_median;
+
+     mkstat(t, st, &s);
+
+     at_max = mflops(p, s.max);
+     at_avg = mflops(p, s.avg);
+     at_min = mflops(p, s.min);
+     at_median = mflops(p, s.median);
+
+     ovtpvt("(%g %g %g %g)\n", at_max, at_avg, at_min, at_median);
+}
+
+/* print "(min avg max median)" of the st time samples in t; the
+   problem itself is not used */
+void report_time(const bench_problem *p, double *t, int st)
+{
+     struct stats summary;
+
+     UNUSED(p);
+
+     mkstat(t, st, &summary);
+     ovtpvt("(%g %g %g %g)\n",
+	    summary.min, summary.avg, summary.max, summary.median);
+}
+
+/* print one line: mflops() at the minimum time, the minimum time
+   itself, and the problem's setup time */
+void report_benchmark(const bench_problem *p, double *t, int st)
+{
+     struct stats s;
+     double best;
+
+     mkstat(t, st, &s);
+     best = s.min;
+
+     ovtpvt("%.8g %.8g %g\n", mflops(p, best), best, p->setup_time);
+}
+
+/*
+ * Format the duration x (in seconds) into buf with two decimals and
+ * the largest unit among ns, us, ms, s for which the printed number
+ * is at least 1 (values that fail every `<' test, including NaN, are
+ * printed in seconds).  buflen is only honored when snprintf() is
+ * available (HAVE_SNPRINTF); otherwise plain sprintf() is used and
+ * the caller must supply a large enough buffer.
+ */
+static void sprintf_time(double x, char *buf, int buflen)
+{
+#ifdef HAVE_SNPRINTF
+#  define MY_SPRINTF(a, b) snprintf(buf, buflen, a, b)
+#else
+#  define MY_SPRINTF(a, b) sprintf(buf, a, b)
+#endif
+     if (x < 1.0E-6)
+	  MY_SPRINTF("%.2f ns", x * 1.0E9);
+     else if (x < 1.0E-3)
+	  MY_SPRINTF("%.2f us", x * 1.0E6);
+     else if (x < 1.0)
+	  MY_SPRINTF("%.2f ms", x * 1.0E3);
+     else
+	  MY_SPRINTF("%.2f s", x);
+#undef MY_SPRINTF
+}
+
+/*
+ * Human-readable report: one line with the problem string, setup
+ * time, best time and mflops() at the best time; when `verbose' is
+ * nonzero, two more lines with the number of measurements and the
+ * min / max / avg / median times.  The rate is labelled "fp-move/us"
+ * when the size tensor has exactly one element.
+ */
+void report_verbose(const bench_problem *p, double *t, int st)
+{
+     struct stats s;
+     char bmin[64], bmax[64], bavg[64], bmedian[64], btmin[64];
+     char bsetup[64];
+     const char *rate_label;
+
+     if (tensor_sz(p->sz) == 1)
+	  rate_label = "fp-move/us";
+     else
+	  rate_label = "``mflops''";
+
+     mkstat(t, st, &s);
+
+     sprintf_time(s.min, bmin, (int)sizeof(bmin));
+     sprintf_time(s.max, bmax, (int)sizeof(bmax));
+     sprintf_time(s.avg, bavg, (int)sizeof(bavg));
+     sprintf_time(s.median, bmedian, (int)sizeof(bmedian));
+     sprintf_time(time_min, btmin, (int)sizeof(btmin));
+     sprintf_time(p->setup_time, bsetup, (int)sizeof(bsetup));
+
+     ovtpvt("Problem: %s, setup: %s, time: %s, %s: %.8g\n",
+	    p->pstring, bsetup, bmin, rate_label, mflops(p, s.min));
+
+     if (!verbose)
+	  return;
+
+     ovtpvt("Took %d measurements for at least %s each.\n", st, btmin);
+     ovtpvt("Time: min %s, max %s, avg %s, median %s\n",
+	    bmin, bmax, bavg, bmedian);
+}
--- a/fftw-3.3.10/libbench2/speed.c
+++ b/fftw-3.3.10/libbench2/speed.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "libbench2/bench.h"
+
+int no_speed_allocation = 0; /* 1 to not allocate array data in speed() */
+
+/*
+ * Time the problem described by `param' and hand the per-iteration
+ * times to the current `report' callback.
+ *
+ * After setup() (whose duration is stored in p->setup_time), doit()
+ * is timed time_repeat times for iter = 1, 2, 4, ... iterations until
+ * the fastest of the time_repeat runs takes at least time_min.  If a
+ * negative time is measured, or iter reaches 2^30 without meeting
+ * time_min, the whole search restarts from iter = 1.
+ *
+ * With setup_only nonzero the timing loop is skipped and all reported
+ * times are 0.
+ */
+void speed(const char *param, int setup_only)
+{
+     double *t;
+     int iter = 0, k;
+     bench_problem *p;
+     double tmin, y;
+
+     t = (double *) bench_malloc(time_repeat * sizeof(double));
+
+     for (k = 0; k < time_repeat; ++k) 
+	  t[k] = 0;
+
+     p = problem_parse(param);
+     BENCH_ASSERT(can_do(p));
+     if (!no_speed_allocation) {
+	  problem_alloc(p);
+	  problem_zero(p);
+     }
+
+     timer_start(LIBBENCH_TIMER);
+     setup(p);
+     p->setup_time = bench_cost_postprocess(timer_stop(LIBBENCH_TIMER));
+
+     /* reset the input to zero again, because the planner in paranoid
+	mode sets it to random values, thus making the benchmark
+	diverge. */
+     if (!no_speed_allocation) 
+	  problem_zero(p);
+     
+     /* iter is still 0 here, which makes the code after `done' report
+	zeros */
+     if (setup_only)
+	  goto done;
+
+ start_over:
+     for (iter = 1; iter < (1<<30); iter *= 2) {
+	  tmin = 1.0e20;
+	  for (k = 0; k < time_repeat; ++k) {
+	       timer_start(LIBBENCH_TIMER);
+	       doit(iter, p);
+	       y = bench_cost_postprocess(timer_stop(LIBBENCH_TIMER));
+	       if (y < 0) /* yes, it happens */
+		    goto start_over;
+	       t[k] = y;
+	       if (y < tmin)
+		    tmin = y;
+	  }
+	  
+	  /* long enough to be trusted: keep this set of samples */
+	  if (tmin >= time_min)
+	       goto done;
+     }
+
+     goto start_over; /* this also happens */
+
+ done:
+     done(p);
+
+     /* convert total times into time per iteration */
+     if (iter) 
+	  for (k = 0; k < time_repeat; ++k) 
+	       t[k] /= iter;
+     else
+	  for (k = 0; k < time_repeat; ++k) 
+	       t[k] = 0;
+
+     report(p, t, time_repeat);
+
+     /* NOTE(review): p is not destroyed when no_speed_allocation is
+	set -- presumably intentional, confirm */
+     if (!no_speed_allocation)
+	  problem_destroy(p);
+     bench_free(t);
+     return;
+}
--- a/fftw-3.3.10/libbench2/tensor.c
+++ b/fftw-3.3.10/libbench2/tensor.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "libbench2/bench.h"
+#include <stdlib.h>
+
+/* allocate a tensor of the given rank (>= 0, asserted); the dims
+   array is only allocated for a finite, positive rank and is NULL
+   otherwise.  The dimensions themselves are left uninitialized. */
+bench_tensor *mktensor(int rnk) 
+{
+     bench_tensor *x;
+
+     BENCH_ASSERT(rnk >= 0);
+
+     x = (bench_tensor *)bench_malloc(sizeof(bench_tensor));
+     x->rnk = rnk;
+     x->dims = 0;
+     if (rnk > 0 && BENCH_FINITE_RNK(rnk))
+	  x->dims = (bench_iodim *)bench_malloc(sizeof(bench_iodim) * rnk);
+
+     return x;
+}
+
+/* free a tensor made by mktensor(): first its dims array (which may
+   be NULL, hence bench_free0), then the tensor itself */
+void tensor_destroy(bench_tensor *sz)
+{
+     bench_iodim *dims = sz->dims;
+
+     bench_free0(dims);
+     bench_free(sz);
+}
+
+/* number of elements of a tensor: the product of all dimension
+   lengths (1 for rank 0), or 0 for a non-finite rank */
+size_t tensor_sz(const bench_tensor *sz)
+{
+     size_t total;
+     int i;
+
+     if (!BENCH_FINITE_RNK(sz->rnk))
+          return 0;
+
+     total = 1;
+     for (i = sz->rnk; i-- > 0; )
+          total *= sz->dims[i].n;
+
+     return total;
+}
+
+
+/* total order among bench_iodim's: by decreasing input stride, then
+   by decreasing output stride, then by increasing length */
+static int dimcmp(const bench_iodim *a, const bench_iodim *b)
+{
+     int diff;
+
+     diff = b->is - a->is;	/* shorter strides go later */
+     if (diff != 0)
+          return diff;
+
+     diff = b->os - a->os;	/* shorter strides go later */
+     if (diff != 0)
+          return diff;
+
+     return (int)(a->n - b->n);	/* larger n's go later */
+}
+
+/* return a new tensor holding the dimensions of sz whose length is
+   not 1, sorted with dimcmp().  Every length must be positive and the
+   rank finite (both asserted). */
+bench_tensor *tensor_compress(const bench_tensor *sz)
+{
+     int i, kept;
+     bench_tensor *x;
+
+     BENCH_ASSERT(BENCH_FINITE_RNK(sz->rnk));
+
+     /* first pass: validate and count the surviving dimensions */
+     kept = 0;
+     for (i = 0; i < sz->rnk; ++i) {
+          BENCH_ASSERT(sz->dims[i].n > 0);
+          if (sz->dims[i].n != 1)
+               ++kept;
+     }
+
+     x = mktensor(kept);
+
+     /* second pass: copy them in their original order */
+     kept = 0;
+     for (i = 0; i < sz->rnk; ++i) {
+          const bench_iodim *d = sz->dims + i;
+          if (d->n == 1)
+               continue;
+          x->dims[kept] = *d;
+          ++kept;
+     }
+
+     if (kept != 0) {
+	  /* God knows how qsort() behaves if n==0 */
+	  qsort(x->dims, (size_t)x->rnk, sizeof(bench_iodim),
+		(int (*)(const void *, const void *))dimcmp);
+     }
+
+     return x;
+}
+
+/* true if t has rank 0 or if both strides of its last dimension
+   are 1; the rank must be finite (asserted) */
+int tensor_unitstridep(bench_tensor *t)
+{
+     const bench_iodim *last;
+
+     BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
+
+     if (t->rnk == 0)
+	  return 1;
+
+     last = t->dims + (t->rnk - 1);
+     return (last->is == 1 && last->os == 1);
+}
+
+/* detect screwy real padded rowmajor... ugh */
+/*
+ * Returns 1 if the strides of t follow a row-major layout in which
+ * the LAST dimension (length n) counts as n/2+1 elements on the
+ * complex side and as n elements -- or 2*(n/2+1) when in_place -- on
+ * the real side, and 0 otherwise.  The complex side is the output for
+ * sign < 0 and the input otherwise.  All outer dimensions are checked
+ * with the plain row-major rule.  The strides of the last dimension
+ * itself are not checked; rank 0 and rank 1 always yield 1.
+ */
+int tensor_real_rowmajorp(bench_tensor *t, int sign, int in_place)
+{
+     int i;
+
+     BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
+
+     i = t->rnk - 1;
+
+     /* second-to-last dimension against the (special) last one */
+     if (--i >= 0) {
+          bench_iodim *d = t->dims + i;
+	  if (sign < 0) {
+	       if (d[0].is != d[1].is * (in_place ? 2*(d[1].n/2 + 1) : d[1].n))
+		    return 0;
+	       if (d[0].os != d[1].os * (d[1].n/2 + 1))
+		    return 0;
+	  }
+	  else {
+	       if (d[0].is != d[1].is * (d[1].n/2 + 1))
+		    return 0;
+	       if (d[0].os != d[1].os * (in_place ? 2*(d[1].n/2 + 1) : d[1].n))
+		    return 0;
+	  }
+     }
+
+     /* remaining outer dimensions: ordinary row-major rule */
+     while (--i >= 0) {
+          bench_iodim *d = t->dims + i;
+          if (d[0].is != d[1].is * d[1].n)
+               return 0;
+          if (d[0].os != d[1].os * d[1].n)
+               return 0;
+     }
+     return 1;
+}
+
+/* true if, for every pair of adjacent dimensions, both strides of
+   the outer one equal stride * length of the inner one.  The strides
+   of the last dimension are not constrained; rank 0 and rank 1 always
+   yield 1.  The rank must be finite (asserted). */
+int tensor_rowmajorp(bench_tensor *t)
+{
+     int i;
+
+     BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
+
+     for (i = t->rnk - 2; i >= 0; --i) {
+	  const bench_iodim *outer = t->dims + i;
+	  const bench_iodim *inner = outer + 1;
+
+	  if (outer->is != inner->is * inner->n ||
+	      outer->os != inner->os * inner->n)
+	       return 0;
+     }
+     return 1;
+}
+
+/* copy rnk dimensions from src to dst; nothing is copied when rnk is
+   not finite or not positive */
+static void dimcpy(bench_iodim *dst, const bench_iodim *src, int rnk)
+{
+     if (!BENCH_FINITE_RNK(rnk))
+          return;
+
+     while (rnk-- > 0)
+          *dst++ = *src++;
+}
+
+/* return a new tensor whose dimensions are those of a followed by
+   those of b; if either rank is not finite, the result has rank
+   BENCH_RNK_MINFTY */
+bench_tensor *tensor_append(const bench_tensor *a, const bench_tensor *b)
+{
+     bench_tensor *x;
+
+     if (!(BENCH_FINITE_RNK(a->rnk) && BENCH_FINITE_RNK(b->rnk)))
+          return mktensor(BENCH_RNK_MINFTY);
+
+     x = mktensor(a->rnk + b->rnk);
+     dimcpy(x->dims, a->dims, a->rnk);
+     dimcpy(x->dims + a->rnk, b->dims, b->rnk);
+     return x;
+}
+
+/* larger of two ints */
+static int imax(int a, int b)
+{
+     if (a > b)
+	  return a;
+     return b;
+}
+
+/* smaller of two ints */
+static int imin(int a, int b)
+{
+     if (a < b)
+	  return a;
+     return b;
+}
+
+/*
+ * DEFBOUNDS(name, xs) defines
+ *     void name(bench_tensor *t, int *lbp, int *ubp)
+ * which computes the range of offsets reachable through tensor t when
+ * stepping by the stride field `xs' (is or os).  Starting from lb = 0
+ * and ub = 1, each dimension adds its extent s * (n - 1) to lb when
+ * that extent is negative and to ub when it is positive, so the
+ * results look like a half-open range [*lbp, *ubp).  The rank must be
+ * finite (asserted).
+ */
+#define DEFBOUNDS(name, xs)			\
+void name(bench_tensor *t, int *lbp, int *ubp)	\
+{						\
+     int lb = 0;				\
+     int ub = 1;				\
+     int i;					\
+						\
+     BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));		\
+						\
+     for (i = 0; i < t->rnk; ++i) {		\
+	  bench_iodim *d = t->dims + i;		\
+	  int n = d->n;				\
+	  int s = d->xs;			\
+	  lb = imin(lb, lb + s * (n - 1));	\
+	  ub = imax(ub, ub + s * (n - 1));	\
+     }						\
+						\
+     *lbp = lb;					\
+     *ubp = ub;					\
+}
+
+/* bounds over the input strides and over the output strides */
+DEFBOUNDS(tensor_ibounds, is)
+DEFBOUNDS(tensor_obounds, os)
+
+/* return a newly allocated copy of sz (same rank, same dimensions) */
+bench_tensor *tensor_copy(const bench_tensor *sz)
+{
+     bench_tensor *dup;
+
+     dup = mktensor(sz->rnk);
+     dimcpy(dup->dims, sz->dims, sz->rnk);
+
+     return dup;
+}
+
+/* Like tensor_copy, but copy only rnk dimensions starting with
+   start_dim.  The requested range must not run past the end of sz,
+   whose rank must be finite (both asserted). */
+bench_tensor *tensor_copy_sub(const bench_tensor *sz, int start_dim, int rnk)
+{
+     bench_tensor *sub;
+     const bench_iodim *first;
+
+     BENCH_ASSERT(BENCH_FINITE_RNK(sz->rnk) && start_dim + rnk <= sz->rnk);
+
+     sub = mktensor(rnk);
+     first = sz->dims + start_dim;
+     dimcpy(sub->dims, first, rnk);
+
+     return sub;
+}
+
+/* return a copy of sz in which the input and output stride of every
+   dimension are exchanged */
+bench_tensor *tensor_copy_swapio(const bench_tensor *sz)
+{
+     bench_tensor *x = tensor_copy(sz);
+     bench_iodim *d;
+     int left;
+
+     if (!BENCH_FINITE_RNK(x->rnk))
+	  return x;
+
+     for (d = x->dims, left = x->rnk; left > 0; --left, ++d) {
+	  int tmp = d->is;
+
+	  d->is = d->os;
+	  d->os = tmp;
+     }
+     return x;
+}
--- a/fftw-3.3.10/libbench2/timer.c
+++ b/fftw-3.3.10/libbench2/timer.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "libbench2/bench.h"
+#include <stdio.h>
+
+/* 
+ * System-dependent timing functions:
+ */
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_BSDGETTIMEOFDAY
+#ifndef HAVE_GETTIMEOFDAY
+#define gettimeofday BSDgettimeofday
+#define HAVE_GETTIMEOFDAY 1
+#endif
+#endif
+
+double time_min;	/* set by timer_init(): tmin when positive, otherwise calibrate() */
+int time_repeat;	/* set by timer_init(): repeat, or 8 when repeat is 0 */
+
+#if !defined(HAVE_TIMER) && (defined(__WIN32__) || defined(_WIN32) || defined(_WINDOWS) || defined(__CYGWIN__))
+#include <windows.h>
+typedef LARGE_INTEGER mytime;
+
+/* Sample the Windows performance counter. */
+static mytime get_time(void)
+{
+     mytime now;
+     QueryPerformanceCounter(&now);
+     return now;
+}
+
+/* Counter difference t1 - t0 divided by the performance-counter
+   frequency. */
+static double elapsed(mytime t1, mytime t0)
+{
+     LARGE_INTEGER ticks_per_sec;
+     double ticks;
+
+     QueryPerformanceFrequency(&ticks_per_sec);
+     ticks = (double) t1.QuadPart - (double) t0.QuadPart;
+     return ticks / ((double) ticks_per_sec.QuadPart);
+}
+
+#define HAVE_TIMER
+#endif
+
+
+#if defined(HAVE_GETTIMEOFDAY) && !defined(HAVE_TIMER)
+typedef struct timeval mytime;
+
+/* Sample the wall clock with gettimeofday(). */
+static mytime get_time(void)
+{
+     mytime now;
+     gettimeofday(&now, 0);
+     return now;
+}
+
+/* t1 - t0 in seconds: whole seconds plus the microsecond difference
+   scaled by 1e-6. */
+static double elapsed(mytime t1, mytime t0)
+{
+     double dsec = (double) t1.tv_sec - (double) t0.tv_sec;
+     double dusec = (double) t1.tv_usec - (double) t0.tv_usec;
+
+     return dsec + dusec * 1.0E-6;
+}
+
+#define HAVE_TIMER
+#endif
+
+#ifndef HAVE_TIMER
+#error "timer not defined"
+#endif
+
+/* Fallback value for time_min when the caller gives no positive
+   minimum: a fixed constant. */
+static double calibrate(void)
+{
+     /* there seems to be no reasonable way to calibrate the
+        clock automatically any longer.  Grrr... */
+     const double fixed_time_min = 0.01;
+
+     return fixed_time_min;
+}
+
+
+/* One-time setup of time_repeat and time_min; calls after the first
+   are ignored.  A zero REPEAT selects 8, and a non-positive TMIN
+   selects the value returned by calibrate(). */
+void timer_init(double tmin, int repeat)
+{
+     static int already_done = 0;
+
+     if (!already_done) {
+          already_done = 1;
+
+          time_repeat = repeat ? repeat : 8;
+
+          if (tmin > 0)
+               time_min = tmin;
+          else
+               time_min = calibrate();
+     }
+}
+
+/* start timestamp of each timer slot */
+static mytime t0[BENCH_NTIMERS];
+
+/* Record the current time as the start of timer N, which must satisfy
+   0 <= N < BENCH_NTIMERS. */
+void timer_start(int n)
+{
+     mytime *slot;
+
+     BENCH_ASSERT(n >= 0 && n < BENCH_NTIMERS);
+     slot = &t0[n];
+     *slot = get_time();
+}
+
+/* Return elapsed() between now and the start recorded for timer N by
+   timer_start; N must satisfy 0 <= N < BENCH_NTIMERS. */
+double timer_stop(int n)
+{
+     BENCH_ASSERT(n >= 0 && n < BENCH_NTIMERS);
+     return elapsed(get_time(), t0[n]);
+}
+
--- a/fftw-3.3.10/libbench2/useropt.c
+++ b/fftw-3.3.10/libbench2/useropt.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2000 Matteo Frigo
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "libbench2/bench.h"
+
+void useropt(const char *arg)
+{    /* Report ARG as an unknown user option through ovtpvt_err;
+        nothing else is done with it. */
+     ovtpvt_err("unknown user option: %s.  Ignoring.\n", arg);
+}
--- a/fftw-3.3.10/libbench2/util.c
+++ b/fftw-3.3.10/libbench2/util.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2000 Matteo Frigo
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "libbench2/bench.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <math.h>
+
+#if defined(HAVE_MALLOC_H)
+#  include <malloc.h>
+#endif
+
+#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN
+extern void *memalign(size_t, size_t);
+#endif
+
+#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN
+extern int posix_memalign(void **, size_t, size_t);
+#endif
+
+void bench_assertion_failed(const char *s, int line, const char *file)
+{    /* Print FILE, LINE and the failed expression text S through
+        ovtpvt_err, then call bench_exit(EXIT_FAILURE). */
+     ovtpvt_err("bench: %s:%d: assertion failed: %s\n", file, line, s);
+     bench_exit(EXIT_FAILURE);
+}
+
+#ifdef HAVE_DRAND48
+#  if defined(HAVE_DECL_DRAND48) && !HAVE_DECL_DRAND48
+extern double drand48(void);
+#  endif
+/* drand48() sample shifted down by 0.5 */
+double bench_drand(void)
+{
+     double u = drand48();
+     return u - 0.5;
+}
+#  if defined(HAVE_DECL_SRAND48) && !HAVE_DECL_SRAND48
+extern void srand48(long);
+#  endif
+/* seed the drand48() generator */
+void bench_srand(int seed)
+{
+     srand48(seed);
+}
+#else
+/* rand() divided by RAND_MAX, shifted down by 0.5 */
+double bench_drand(void)
+{
+     return ((double) rand() / (double) RAND_MAX) - 0.5;
+}
+/* seed the rand() generator */
+void bench_srand(int seed)
+{
+     srand(seed);
+}
+#endif
+
+/**********************************************************
+ *   DEBUGGING CODE
+ **********************************************************/
+#ifdef BENCH_DEBUG
+static int bench_malloc_cnt = 0;
+
+/*
+ * debugging malloc/free.  Initialize every malloced and freed area to
+ * random values, just to make sure we are not using uninitialized
+ * pointers.  Also check for writes past the ends of allocated blocks,
+ * and a couple of other things.
+ *
+ * This code is a quick and dirty hack -- use at your own risk.
+ */
+
+static int bench_malloc_total = 0, bench_malloc_max = 0, bench_malloc_cnt_max = 0; /* requested bytes currently outstanding, the peak of that total, and the peak of bench_malloc_cnt */
+
+#define MAGIC ((size_t)0xABadCafe) /* tag stored in header slot 1 by bench_malloc and checked by bench_free */
+#define PAD_FACTOR 2 /* a block of n bytes gets PAD_FACTOR * n payload bytes; bench_free checks the part beyond n */
+#define TWO_SIZE_T (2 * sizeof(size_t)) /* size of the header (requested size + MAGIC) placed before the payload */
+
+#define VERBOSE_ALLOCATION 0
+
+#if VERBOSE_ALLOCATION
+#define WHEN_VERBOSE(a) a
+#else
+#define WHEN_VERBOSE(a)
+#endif
+
+/* Debugging allocator.  Obtains PAD_FACTOR * n payload bytes plus a
+   two-word header from malloc, stores N and MAGIC in the header, fills
+   the whole payload with a position-dependent pattern, updates the
+   allocation statistics, and returns the address just past the header. */
+void *bench_malloc(size_t n)
+{
+     const size_t padded = PAD_FACTOR * n;
+     char *p;
+     char *payload;
+     size_t *header;
+     size_t k;
+
+     bench_malloc_total += n;
+     if (bench_malloc_max < bench_malloc_total)
+          bench_malloc_max = bench_malloc_total;
+
+     p = (char *) malloc(padded + TWO_SIZE_T);
+     BENCH_ASSERT(p);
+
+     /* the header lives at the start of the raw block */
+     header = (size_t *) p;
+     header[0] = n;
+     header[1] = MAGIC;
+
+     /* the caller's area begins right after the header */
+     payload = p + TWO_SIZE_T;
+     for (k = 0; k < padded; k++)
+          payload[k] = (char) (k ^ 0xDEADBEEF);
+
+     ++bench_malloc_cnt;
+     if (bench_malloc_cnt_max < bench_malloc_cnt)
+          bench_malloc_cnt_max = bench_malloc_cnt;
+
+     return (void *) payload;
+}
+
+/* Debugging counterpart of bench_malloc.  Steps back to the header,
+   verifies the MAGIC tag, checks that the padding beyond the requested
+   size still holds the fill pattern, scribbles a different pattern
+   over the payload, updates the statistics and releases the raw block.
+   P must not be null. */
+void bench_free(void *p)
+{
+     char *q;
+     size_t *header;
+     size_t len, magic, k;
+
+     BENCH_ASSERT(p);
+
+     q = ((char *) p) - TWO_SIZE_T;
+     BENCH_ASSERT(q);
+
+     header = (size_t *) q;
+     len = header[0];
+     magic = header[1];
+
+     header[0] = 0; /* set to zero to detect duplicate free's */
+
+     BENCH_ASSERT(magic == MAGIC);
+     header[1] = ~MAGIC;
+
+     bench_malloc_total -= len;
+     BENCH_ASSERT(bench_malloc_total >= 0);
+
+     /* check for writing past end of array: */
+     for (k = len; k < PAD_FACTOR * len; ++k) {
+          if (q[k + TWO_SIZE_T] != (char) (k ^ 0xDEADBEEF)) {
+               BENCH_ASSERT(0 /* array bounds overwritten */);
+          }
+     }
+
+     /* overwrite the payload with a different pattern */
+     for (k = 0; k < PAD_FACTOR * len; ++k)
+          q[k + TWO_SIZE_T] = (char) (k ^ 0xBEEFDEAD);
+
+     --bench_malloc_cnt;
+
+     BENCH_ASSERT(bench_malloc_cnt >= 0);
+
+     BENCH_ASSERT(
+	       (bench_malloc_cnt == 0 && bench_malloc_total == 0) ||
+	       (bench_malloc_cnt > 0 && bench_malloc_total > 0));
+
+     free(q);
+}
+
+#else
+/**********************************************************
+ *   NON DEBUGGING CODE
+ **********************************************************/
+/* production version, no hacks */
+
+#define MIN_ALIGNMENT 128    /* must be power of two */
+
+#define real_free free /* memalign and malloc use ordinary free */
+
+/* Production allocator.  Requests at least one byte through whichever
+   allocation routine the configuration selects (aligned to
+   MIN_ALIGNMENT where one is available) and asserts that the
+   allocation succeeded. */
+void *bench_malloc(size_t n)
+{
+     void *p;
+
+     /* a zero-byte request is turned into a one-byte request */
+     if (!n)
+          n = 1;
+
+#if defined(WITH_OUR_MALLOC)
+     /* Our own aligned malloc/free.  Assumes sizeof(void*) is
+        a power of two <= 8 and that malloc is at least
+        sizeof(void*)-aligned.  Assumes size_t = uintptr_t.  */
+     {
+          void *raw = malloc(n + MIN_ALIGNMENT);
+
+          if (!raw)
+               p = (void *) 0;
+          else {
+               /* round up to the next MIN_ALIGNMENT boundary and stash
+                  the raw pointer in the word just before it */
+               p = (void *) (((size_t) raw + MIN_ALIGNMENT) & (~((size_t) (MIN_ALIGNMENT - 1))));
+               *((void **) p - 1) = raw;
+          }
+     }
+#elif defined(HAVE_MEMALIGN)
+     p = memalign(MIN_ALIGNMENT, n);
+#elif defined(HAVE_POSIX_MEMALIGN)
+     /* note: posix_memalign is broken in glibc 2.2.5: it constrains
+        the size, not the alignment, to be (power of two) * sizeof(void*).
+        The bug seems to have been fixed as of glibc 2.3.1. */
+     if (posix_memalign(&p, MIN_ALIGNMENT, n) != 0)
+          p = (void *) 0;
+#elif defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+     /* Intel's C compiler defines _mm_malloc and _mm_free intrinsics */
+     p = (void *) _mm_malloc(n, MIN_ALIGNMENT);
+#    undef real_free
+#    define real_free _mm_free
+#else
+     p = malloc(n);
+#endif
+
+     BENCH_ASSERT(p);
+     return p;
+}
+
+/* Release a block obtained from the production bench_malloc.  With
+   WITH_OUR_MALLOC the raw pointer stored just before P is freed (a
+   null P is ignored); otherwise P is handed to real_free. */
+void bench_free(void *p)
+{
+#ifdef WITH_OUR_MALLOC
+     if (p) {
+          void **slot = (void **) p - 1;
+          free(*slot);
+     }
+#else
+     real_free(p);
+#endif
+}
+
+#endif
+
+/* bench_free that tolerates a null pointer. */
+void bench_free0(void *p)
+{
+     if (!p)
+          return;
+     bench_free(p);
+}
--- a/fftw-3.3.10/libbench2/verify-dft.c
+++ b/fftw-3.3.10/libbench2/verify-dft.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "verify.h"
+
+/* copy A into B, using output stride of A and input stride of B.
+   The dotens2_closure is the first member because cpy0 casts the
+   dotens2_closure pointer it receives back to a cpy_closure pointer. */
+typedef struct {
+     dotens2_closure k;
+     R *ra; R *ia;	/* source A: real and imaginary parts */
+     R *rb; R *ib;	/* destination B: real and imaginary parts */
+     int scalea, scaleb;	/* multipliers applied to the A and B indices in cpy0 */
+} cpy_closure;
+
+/* Per-element callback for cpy: copy the real and imaginary parts from
+   A at its output index to B at its input index, each index multiplied
+   by the corresponding scale factor. */
+static void cpy0(dotens2_closure *k_, 
+                 int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpy_closure *c = (cpy_closure *)k_;
+     const int src = ondxa * c->scalea;
+     const int dst = indxb * c->scaleb;
+
+     UNUSED(indxa);
+     UNUSED(ondxb);
+
+     c->rb[dst] = c->ra[src];
+     c->ib[dst] = c->ia[src];
+}
+
+/* Fill in a cpy_closure with the two arrays and their scale factors
+   and let bench_dotens2 drive cpy0 over the tensors SZA and SZB. */
+static void cpy(R *ra, R *ia, const bench_tensor *sza, int scalea,
+                R *rb, R *ib, const bench_tensor *szb, int scaleb)
+{
+     cpy_closure job;
+
+     job.k.apply = cpy0;
+
+     job.ra = ra;
+     job.ia = ia;
+     job.scalea = scalea;
+
+     job.rb = rb;
+     job.ib = ib;
+     job.scaleb = scaleb;
+
+     bench_dotens2(sza, szb, &job.k);
+}
+
+typedef struct {
+     dofft_closure k;	/* generic closure; dft_apply casts a pointer to it back to this struct */
+     bench_problem *p;	/* the problem that dft_apply copies data into and runs with doit() */
+} dofft_dft_closure;
+
+/* dofft_closure callback for complex DFT problems.  Copies the packed
+   array IN into the problem's input buffers, runs the problem once
+   with doit(), and copies the problem's output into the packed array
+   OUT.  When recopy_input is set, the problem's input buffers are also
+   copied back into IN after the transform. */
+static void dft_apply(dofft_closure *k_, bench_complex *in, bench_complex *out)
+{
+     dofft_dft_closure *self = (dofft_dft_closure *)k_;
+     bench_problem *p = self->p;
+     bench_tensor *full, *packed;
+     bench_tensor *full_swap, *packed_swap;
+     bench_real *ri, *ii, *ro, *io;
+     int scale;
+
+     full = tensor_append(p->vecsz, p->sz);
+     packed = verify_pack(full, 2);
+     ri = (bench_real *) p->in;
+     ro = (bench_real *) p->out;
+
+     full_swap = tensor_copy_swapio(full);
+     packed_swap = tensor_copy_swapio(packed);
+
+     /* confusion: the stride is the distance between complex elements
+        when using interleaved format, but it is the distance between
+        real elements when using split format */
+     scale = p->split ? 1 : 2;
+
+     /* explicit imaginary pointers win; otherwise derive them from the
+        real pointers according to the storage format */
+     if (p->ini)
+          ii = (bench_real *) p->ini;
+     else
+          ii = p->split ? ri + p->iphyssz : ri + 1;
+
+     if (p->outi)
+          io = (bench_real *) p->outi;
+     else
+          io = p->split ? ro + p->ophyssz : ro + 1;
+
+     cpy(&c_re(in[0]), &c_im(in[0]), packed, 1,
+         ri, ii, full, scale);
+     after_problem_ccopy_from(p, ri, ii);
+     doit(1, p);
+     after_problem_ccopy_to(p, ro, io);
+     if (self->k.recopy_input)
+          cpy(ri, ii, full_swap, scale,
+              &c_re(in[0]), &c_im(in[0]), packed_swap, 1);
+     cpy(ro, io, full, scale,
+         &c_re(out[0]), &c_im(out[0]), packed, 1);
+
+     tensor_destroy(full);
+     tensor_destroy(packed);
+     tensor_destroy(full_swap);
+     tensor_destroy(packed_swap);
+}
+
+/* Run the impulse, linearity and time/frequency-shift checks on the
+   complex problem P and store the resulting errors in E->i, E->l and
+   E->s.  ROUNDS == 0 selects 20 rounds.  When the problem is neither
+   in-place nor allowed to destroy its input, also check that the input
+   is preserved. */
+void verify_dft(bench_problem *p, int rounds, double tol, errors *e)
+{
+     enum { NBUF = 7 };
+     C *buf[NBUF];
+     C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
+     int n, vecn, N, b;
+     dofft_dft_closure k;
+
+     BENCH_ASSERT(p->kind == PROBLEM_COMPLEX);
+
+     k.p = p;
+     k.k.recopy_input = 0;
+     k.k.apply = dft_apply;
+
+     if (rounds == 0)
+          rounds = 20;  /* default value */
+
+     n = tensor_sz(p->sz);
+     vecn = tensor_sz(p->vecsz);
+     N = n * vecn;
+
+     /* seven work arrays of N complex numbers each */
+     for (b = 0; b < NBUF; ++b)
+          buf[b] = (C *) bench_malloc(N * sizeof(C));
+     inA = buf[0];
+     inB = buf[1];
+     inC = buf[2];
+     outA = buf[3];
+     outB = buf[4];
+     outC = buf[5];
+     tmp = buf[6];
+
+     e->i = impulse(&k.k, n, vecn, inA, inB, inC, outA, outB, outC, 
+                    tmp, rounds, tol);
+     e->l = linear(&k.k, 0, N, inA, inB, inC, outA, outB, outC,
+                   tmp, rounds, tol);
+
+     e->s = 0.0;
+     e->s = dmax(e->s, tf_shift(&k.k, 0, p->sz, n, vecn, p->sign,
+                                inA, inB, outA, outB, 
+                                tmp, rounds, tol, TIME_SHIFT));
+     e->s = dmax(e->s, tf_shift(&k.k, 0, p->sz, n, vecn, p->sign,
+                                inA, inB, outA, outB, 
+                                tmp, rounds, tol, FREQ_SHIFT));
+
+     if (!p->in_place && !p->destroy_input)
+          preserves_input(&k.k, 0, N, inA, inB, outB, rounds);
+
+     /* release in reverse order of allocation */
+     for (b = NBUF; b-- > 0; )
+          bench_free(buf[b]);
+}
+
+
+/* Run accuracy_test on the complex problem P, which must have a rank-1
+   size and a rank-0 vector size; the statistics are returned in T. */
+void accuracy_dft(bench_problem *p, int rounds, int impulse_rounds,
+                  double t[6])
+{
+     dofft_dft_closure k;
+     C *in, *out;
+     int n;
+
+     BENCH_ASSERT(p->kind == PROBLEM_COMPLEX);
+     BENCH_ASSERT(p->sz->rnk == 1);
+     BENCH_ASSERT(p->vecsz->rnk == 0);
+
+     k.p = p;
+     k.k.recopy_input = 0;
+     k.k.apply = dft_apply;
+
+     n = tensor_sz(p->sz);
+     in = (C *) bench_malloc(n * sizeof(C));
+     out = (C *) bench_malloc(n * sizeof(C));
+
+     accuracy_test(&k.k, 0, p->sign, n, in, out, rounds, impulse_rounds, t);
+
+     bench_free(out);
+     bench_free(in);
+}
--- a/fftw-3.3.10/libbench2/verify-lib.c
+++ b/fftw-3.3.10/libbench2/verify-lib.c
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "verify.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * Utility functions:
+ */
+/* absolute value of X */
+static double dabs(double x)
+{
+     if (x < 0.0)
+          return -x;
+     return x;
+}
+/* smaller of X and Y */
+static double dmin(double x, double y)
+{
+     if (x < y)
+          return x;
+     return y;
+}
+/* larger of |X| and |Y| */
+static double norm2(double x, double y)
+{
+     double ax = dabs(x);
+     double ay = dabs(y);
+
+     return dmax(ax, ay);
+}
+
+/* larger of X and Y */
+double dmax(double x, double y)
+{
+     if (x > y)
+          return x;
+     return y;
+}
+
+static double aerror(C *a, C *b, int n)
+{
+     if (n > 0) {
+	  /* compute the relative Linf error */
+	  double e = 0.0, mag = 0.0;
+	  int i;
+
+	  for (i = 0; i < n; ++i) {
+	       e = dmax(e, norm2(c_re(a[i]) - c_re(b[i]),
+				 c_im(a[i]) - c_im(b[i])));
+	       mag = dmax(mag, 
+			  dmin(norm2(c_re(a[i]), c_im(a[i])),
+			       norm2(c_re(b[i]), c_im(b[i]))));
+	  }
+	  e /= mag;
+
+#ifdef HAVE_ISNAN
+	  BENCH_ASSERT(!isnan(e));
+#endif
+	  return e;
+     } else
+	  return 0.0;
+}
+
+#ifdef HAVE_DRAND48
+#  if defined(HAVE_DECL_DRAND48) && !HAVE_DECL_DRAND48
+extern double drand48(void);
+#  endif
+/* drand48() sample shifted down by 0.5 */
+double mydrand(void)
+{
+     double u = drand48();
+     return u - 0.5;
+}
+#else
+/* rand() divided by RAND_MAX, shifted down by 0.5 */
+double mydrand(void)
+{
+     return ((double) rand() / (double) RAND_MAX) - 0.5;
+}
+#endif
+
+/* Fill the first N elements of A with mydrand() values, real part
+   first, then imaginary part. */
+void arand(C *a, int n)
+{
+     int k = 0;
+
+     /* generate random inputs */
+     while (k < n) {
+          C *z = a + k;
+          c_re(*z) = mydrand();
+          c_im(*z) = mydrand();
+          ++k;
+     }
+}
+
+/* make array real: zero the imaginary part of the first N elements */
+void mkreal(C *A, int n)
+{
+     int k = 0;
+
+     while (k < n) {
+          c_im(A[k]) = 0.0;
+          ++k;
+     }
+}
+
+/* Recursive helper for mkhermitian.  At rank 0, store the complex
+   conjugate of *A into *Ac.  Otherwise peel off dimension rank-1 (of
+   length len, elements STRIDE apart): index 0 maps to index 0 and
+   index i maps to index len - i, recursing with the remaining
+   dimensions and a stride multiplied by len. */
+static void assign_conj(C *Ac, C *A, int rank, const bench_iodim *dim, int stride)
+{
+     int i, len, sub, inner;
+
+     if (rank == 0) {
+          c_re(*Ac) = c_re(*A);
+          c_im(*Ac) = -c_im(*A);
+          return;
+     }
+
+     sub = rank - 1;
+     len = dim[sub].n;
+     inner = stride * len;
+
+     assign_conj(Ac, A, sub, dim, inner);
+     for (i = 1; i < len; ++i)
+          assign_conj(Ac + (len - i) * stride, A + i * stride, sub, dim, inner);
+}
+
+/* make array hermitian.  At rank 0 the imaginary part is zeroed.
+   Otherwise, along dimension rank-1 (length len), slice 0 is made
+   hermitian recursively, each slice len - i with 0 < 2*i < len is
+   overwritten with the conjugate image of slice i, and for even len
+   the middle slice is made hermitian recursively as well. */
+void mkhermitian(C *A, int rank, const bench_iodim *dim, int stride)
+{
+     int i, len, sub, inner;
+
+     if (rank == 0) {
+          c_im(*A) = 0.0;
+          return;
+     }
+
+     sub = rank - 1;
+     len = dim[sub].n;
+     inner = stride * len;
+
+     mkhermitian(A, sub, dim, inner);
+     for (i = 1; 2*i < len; ++i)
+          assign_conj(A + (len - i) * stride, A + i * stride, sub, dim, inner);
+     /* i now holds the first index with 2*i >= len */
+     if (2*i == len)
+          mkhermitian(A + i * stride, sub, dim, inner);
+}
+
+/* One-dimensional convenience wrapper: make the N-element array A
+   hermitian using unit strides. */
+void mkhermitian1(C *a, int n)
+{
+     bench_iodim only;
+
+     only.n = n;
+     only.os = 1;
+     only.is = 1;
+     mkhermitian(a, 1, &only, 1);
+}
+
+/* C = A, element by element over N complex numbers */
+void acopy(C *c, C *a, int n)
+{
+     int k = 0;
+
+     while (k < n) {
+          c_re(c[k]) = c_re(a[k]);
+          c_im(c[k]) = c_im(a[k]);
+          ++k;
+     }
+}
+
+/* C = A + B, element by element over N complex numbers */
+void aadd(C *c, C *a, C *b, int n)
+{
+     int k = 0;
+
+     while (k < n) {
+          c_re(c[k]) = c_re(a[k]) + c_re(b[k]);
+          c_im(c[k]) = c_im(a[k]) + c_im(b[k]);
+          ++k;
+     }
+}
+
+/* C = A - B, element by element over N complex numbers */
+void asub(C *c, C *a, C *b, int n)
+{
+     int k = 0;
+
+     while (k < n) {
+          c_re(c[k]) = c_re(a[k]) - c_re(b[k]);
+          c_im(c[k]) = c_im(a[k]) - c_im(b[k]);
+          ++k;
+     }
+}
+
+/* B = rotate left A (complex).  The arrays are treated as NB blocks of
+   N rows of NA elements; within each block, row i of B receives row
+   i + 1 of A and the last row of B receives row 0 of A. */
+void arol(C *b, C *a, int n, int nb, int na)
+{
+     int blk, row, col;
+
+     for (blk = 0; blk < nb; ++blk) {
+          /* rows 0 .. n-2 take the contents of the following row */
+          for (row = 0; row < n - 1; ++row)
+               for (col = 0; col < na; ++col) {
+                    C *dst = b + (blk * n + row) * na + col;
+                    C *src = a + (blk * n + row + 1) * na + col;
+                    c_re(*dst) = c_re(*src);
+                    c_im(*dst) = c_im(*src);
+               }
+
+          /* the last row wraps around to row 0 of the same block */
+          for (col = 0; col < na; ++col) {
+               C *dst = b + (blk * n + n - 1) * na + col;
+               C *src = a + blk * n * na + col;
+               c_re(*dst) = c_re(*src);
+               c_im(*dst) = c_im(*src);
+          }
+     }
+}
+
+/* B = A with row j of every block multiplied by
+   cos(j * K2PI / n) + i * sign * sin(j * K2PI / n).  The arrays are
+   treated as NB blocks of N rows of NA elements. */
+void aphase_shift(C *b, C *a, int n, int nb, int na, double sign)
+{
+     int blk, row, col;
+     trigreal twopin;
+
+     twopin = K2PI / n;
+
+     for (blk = 0; blk < nb; ++blk) {
+          for (row = 0; row < n; ++row) {
+               trigreal s = sign * SIN(row * twopin);
+               trigreal c = COS(row * twopin);
+
+               for (col = 0; col < na; ++col) {
+                    int k = (blk * n + row) * na + col;
+                    C *src = a + k;
+                    C *dst = b + k;
+
+                    c_re(*dst) = c_re(*src) * c - c_im(*src) * s;
+                    c_im(*dst) = c_re(*src) * s + c_im(*src) * c;
+               }
+          }
+     }
+}
+
+/* A = alpha * A  (complex, in place): each of the N elements is
+   replaced by its complex product with ALPHA. */
+void ascale(C *a, C alpha, int n)
+{
+     int k = 0;
+
+     while (k < n) {
+          /* read both parts first: the element is overwritten below */
+          R re = c_re(a[k]);
+          R im = c_im(a[k]);
+
+          c_re(a[k]) = re * c_re(alpha) - im * c_im(alpha);
+          c_im(a[k]) = re * c_im(alpha) + im * c_re(alpha);
+          ++k;
+     }
+}
+
+
+double acmp(C *a, C *b, int n, const char *test, double tol)
+{
+     double d = aerror(a, b, n);
+     if (d > tol) {
+	  ovtpvt_err("Found relative error %e (%s)\n", d, test);
+
+	  {
+	       int i, N;
+	       N = n > 300 && verbose <= 2 ? 300 : n;
+	       for (i = 0; i < N; ++i) 
+		    ovtpvt_err("%8d %16.12f %16.12f   %16.12f %16.12f\n", i, 
+			       (double) c_re(a[i]), (double) c_im(a[i]),
+			       (double) c_re(b[i]), (double) c_im(b[i]));
+	  }
+
+	  bench_exit(EXIT_FAILURE);
+     }
+     return d;
+}
+
+
+/*
+ * Implementation of the FFT tester described in
+ *
+ * Funda Ergün. Testing multivariate linear functions: Overcoming the
+ * generator bottleneck. In Proceedings of the Twenty-Seventh Annual
+ * ACM Symposium on the Theory of Computing, pages 407-416, Las Vegas,
+ * Nevada, 29 May--1 June 1995.
+ *
+ * Also: F. Ergun, S. R. Kumar, and D. Sivakumar, "Self-testing without
+ * the generator bottleneck," SIAM J. on Computing 29 (5), 1630-51 (2000).
+ */
+
+/* Check that the transform of inA equals outA, first directly and then
+   ROUNDS times by splitting inA into a random inB plus inC = inA - inB
+   and summing the two transforms.  Returns the largest error reported
+   by acmp. */
+static double impulse0(dofft_closure *k,
+                       int n, int vecn, 
+                       C *inA, C *inB, C *inC,
+                       C *outA, C *outB, C *outC,
+                       C *tmp, int rounds, double tol)
+{
+     const int N = n * vecn;
+     double worst = 0.0;
+     int round;
+
+     k->apply(k, inA, tmp);
+     worst = dmax(worst, acmp(tmp, outA, N, "impulse 1", tol));
+
+     for (round = 0; round < rounds; ++round) {
+          double err;
+
+          arand(inB, N);
+          asub(inC, inA, inB, N);
+          k->apply(k, inB, outB);
+          k->apply(k, inC, outC);
+          aadd(tmp, outB, outC, N);
+
+          err = acmp(tmp, outA, N, "impulse", tol);
+          worst = dmax(worst, err);
+     }
+     return worst;
+}
+
+/* Run impulse0 on two input/output pairs built for each of the VECN
+   vectors of length N: an impulse at index 0 whose expected output is
+   constant, and a constant input whose expected output is an impulse
+   at index 0.  Returns the larger of the two errors. */
+double impulse(dofft_closure *k,
+               int n, int vecn, 
+               C *inA, C *inB, C *inC,
+               C *outA, C *outB, C *outC,
+               C *tmp, int rounds, double tol)
+{
+     int v, j;
+     double worst = 0.0;
+
+     /* check impulsive input */
+     for (v = 0; v < vecn; ++v) {
+          C *in = inA + v * n;
+          C *out = outA + v * n;
+          R x = (sqrt(n)*(v+1)) / (double)(vecn+1);
+
+          for (j = 0; j < n; ++j) {
+               c_re(in[j]) = 0;
+               c_im(in[j]) = 0;
+               c_re(out[j]) = x;
+               c_im(out[j]) = 0;
+          }
+          c_re(in[0]) = x;
+          c_im(in[0]) = 0;
+     }
+
+     worst = dmax(worst, impulse0(k, n, vecn, inA, inB, inC, outA, outB, outC,
+                                  tmp, rounds, tol));
+
+     /* check constant input */
+     for (v = 0; v < vecn; ++v) {
+          C *in = inA + v * n;
+          C *out = outA + v * n;
+          R x = (v+1) / ((double)(vecn+1) * sqrt(n));
+
+          for (j = 0; j < n; ++j) {
+               c_re(in[j]) = x;
+               c_im(in[j]) = 0;
+               c_re(out[j]) = 0;
+               c_im(out[j]) = 0;
+          }
+          c_re(out[0]) = n * x;
+          c_im(out[0]) = 0;
+     }
+
+     worst = dmax(worst, impulse0(k, n, vecn, inA, inB, inC, outA, outB, outC,
+                                  tmp, rounds, tol));
+     return worst;
+}
+
+/* Linearity check, repeated ROUNDS times: for random inputs inA, inB
+   and random scalars alpha, beta (real when REALP is set), compare
+   alpha * T(inA) + beta * T(inB) with T(alpha * inA + beta * inB),
+   where T is k->apply.  Returns the largest error reported by acmp. */
+double linear(dofft_closure *k, int realp,
+              int n, C *inA, C *inB, C *inC, C *outA,
+              C *outB, C *outC, C *tmp, int rounds, double tol)
+{
+     int round;
+     double worst = 0.0;
+
+     for (round = 0; round < rounds; ++round) {
+          C alpha, beta;
+
+          c_re(alpha) = mydrand();
+          if (realp)
+               c_im(alpha) = 0.0;
+          else
+               c_im(alpha) = mydrand();
+
+          c_re(beta) = mydrand();
+          if (realp)
+               c_im(beta) = 0.0;
+          else
+               c_im(beta) = mydrand();
+
+          arand(inA, n);
+          arand(inB, n);
+          k->apply(k, inA, outA);
+          k->apply(k, inB, outB);
+
+          /* tmp = alpha * T(inA) + beta * T(inB) */
+          ascale(outA, alpha, n);
+          ascale(outB, beta, n);
+          aadd(tmp, outA, outB, n);
+
+          /* outC = T(alpha * inA + beta * inB) */
+          ascale(inA, alpha, n);
+          ascale(inB, beta, n);
+          aadd(inC, inA, inB, n);
+          k->apply(k, inC, outC);
+
+          worst = dmax(worst, acmp(outC, tmp, n, "linear", tol));
+     }
+     return worst;
+}
+
+
+
+/* Shift-property check across every dimension of SZ, ROUNDS times per
+   dimension.  For TIME_SHIFT, the input is rotated along the dimension
+   and the transform of the rotated input, after a phase shift, is
+   compared with the transform of the original.  Otherwise the input is
+   phase-shifted and the rotated transform is compared instead.
+   Returns the largest error reported by acmp. */
+double tf_shift(dofft_closure *k,
+                int realp, const bench_tensor *sz,
+                int n, int vecn, double sign,
+                C *inA, C *inB, C *outA, C *outB, C *tmp,
+                int rounds, double tol, int which_shift)
+{
+     const int N = n * vecn;
+     int outer = 1;     /* product of the dimensions before the current one */
+     int inner = n;     /* becomes the product of the dimensions after it */
+     int dim, round, v;
+     double worst = 0.0;
+
+     /* test 3: check the time-shift property */
+     /* the paper performs more tests, but this code should be fine too */
+
+     /* check shifts across all SZ dimensions */
+     for (dim = 0; dim < sz->rnk; ++dim) {
+          const int len = sz->dims[dim].n;
+
+          inner /= len;
+
+          for (round = 0; round < rounds; ++round) {
+               arand(inA, N);
+
+               if (which_shift != TIME_SHIFT) {
+                    for (v = 0; v < vecn; ++v) {
+                         if (realp) 
+                              mkhermitian(inA + v * n, sz->rnk, sz->dims, 1);
+                         aphase_shift(inB + v * n, inA + v * n, len,
+                                      outer, inner, -sign);
+                    }
+                    k->apply(k, inA, outA);
+                    k->apply(k, inB, outB);
+                    for (v = 0; v < vecn; ++v) 
+                         arol(tmp + v * n, outB + v * n, len, outer, inner);
+                    worst = dmax(worst, acmp(tmp, outA, N, "freq shift", tol));
+               } else {
+                    for (v = 0; v < vecn; ++v) {
+                         if (realp) mkreal(inA + v * n, n);
+                         arol(inB + v * n, inA + v * n, len, outer, inner);
+                    }
+                    k->apply(k, inA, outA);
+                    k->apply(k, inB, outB);
+                    for (v = 0; v < vecn; ++v) 
+                         aphase_shift(tmp + v * n, outB + v * n, len, 
+                                      outer, inner, sign);
+                    worst = dmax(worst, acmp(tmp, outA, N, "time shift", tol));
+               }
+          }
+
+          outer *= len;
+     }
+     return worst;
+}
+
+
+/* Check ROUNDS times that applying K to a copy of a random (optionally
+   constrained) input leaves that copy exactly equal to the original.
+   recopy_input is forced to 1 for the duration of the check and
+   restored afterwards. */
+void preserves_input(dofft_closure *k, aconstrain constrain,
+                     int n, C *inA, C *inB, C *outB, int rounds)
+{
+     const int saved = k->recopy_input;
+     int round;
+
+     k->recopy_input = 1;
+
+     for (round = 0; round < rounds; ++round) {
+          arand(inA, n);
+          if (constrain)
+               constrain(inA, n);
+
+          acopy(inB, inA, n);
+          k->apply(k, inB, outB);
+          acmp(inB, inA, n, "preserves_input", 0.0);
+     }
+
+     k->recopy_input = saved;
+}
+
+
+/* Make a copy of the size tensor, with the same dimensions, but with
+   the strides corresponding to a "packed" row-major array with the
+   given stride.  The last dimension gets stride S, and every earlier
+   dimension gets the stride of the next one times that one's length.
+   A tensor of non-finite or zero rank is copied unchanged. */
+bench_tensor *verify_pack(const bench_tensor *sz, int s)
+{
+     bench_tensor *packed = tensor_copy(sz);
+     int last, i;
+
+     if (!BENCH_FINITE_RNK(packed->rnk) || packed->rnk <= 0)
+          return packed;
+
+     last = packed->rnk - 1;
+     packed->dims[last].is = s;
+     packed->dims[last].os = s;
+
+     for (i = last - 1; i >= 0; --i) {
+          const bench_iodim *next = &packed->dims[i + 1];
+
+          packed->dims[i].is = next->is * next->n;
+          packed->dims[i].os = next->os * next->n;
+     }
+     return packed;
+}
+
+/* Return 1 when every one of the first N elements of A has zero real
+   and imaginary parts, 0 otherwise. */
+static int all_zero(C *a, int n)
+{
+     int k = 0;
+
+     while (k < n) {
+          if (!(c_re(a[k]) == 0.0 && c_im(a[k]) == 0.0))
+               return 0;
+          ++k;
+     }
+     return 1;
+}
+
+/* Run one accuracy measurement on input A (after applying CONSTRAIN,
+   if given) and accumulate the six fftaccuracy() results into T:
+   slots 0 and 3 are summed, slots 1 and 4 accumulate squares, and
+   slots 2 and 5 keep the maximum.  Returns 0 without measuring when
+   the input is all zero, 1 otherwise. */
+static int one_accuracy_test(dofft_closure *k, aconstrain constrain,
+                             int sign, int n, C *a, C *b, 
+                             double t[6])
+{
+     double err[6];
+     int base;
+
+     if (constrain)
+          constrain(a, n);
+
+     if (all_zero(a, n))
+          return 0;
+
+     k->apply(k, a, b);
+     fftaccuracy(n, a, b, sign, err);
+
+     /* the six slots form two groups of three with the same layout */
+     for (base = 0; base < 6; base += 3) {
+          t[base] += err[base];
+          t[base + 1] += err[base + 1] * err[base + 1];
+          t[base + 2] = dmax(t[base + 2], err[base + 2]);
+     }
+
+     return 1;
+}
+
+/* Accumulate accuracy statistics for K into T[0..5] over ROUNDS random
+   inputs plus three families of IMPULSE_ROUNDS impulse inputs (near
+   the start of the array, near the end, and at random positions).
+   Each measurement goes through one_accuracy_test(); afterwards T[0]
+   and T[3] are divided by the number of measurements and T[1] and T[4]
+   become the square root of their mean.  If no measurement was made,
+   T is left at zero.  fftaccuracy_done() is always called at the end. */
+void accuracy_test(dofft_closure *k, aconstrain constrain,
+                   int sign, int n, C *a, C *b, int rounds, int impulse_rounds,
+                   double t[6])
+{
+     int r, i;
+     int ntests = 0;
+     bench_complex czero = {0, 0};
+
+     for (i = 0; i < 6; ++i) t[i] = 0.0;
+
+     for (r = 0; r < rounds; ++r) {
+          arand(a, n);
+          if (one_accuracy_test(k, constrain, sign, n, a, b, t))
+               ++ntests;
+     }
+
+     /* impulses at beginning of array: position r, up to and
+        including the middle element */
+     for (r = 0; r < impulse_rounds; ++r) {
+          if (r > n - r - 1)
+               continue;
+
+          caset(a, n, czero);
+          c_re(a[r]) = c_im(a[r]) = 1.0;
+
+          if (one_accuracy_test(k, constrain, sign, n, a, b, t))
+               ++ntests;
+     }
+
+     /* impulses at end of array: position n - r - 1, only while it
+        lies strictly past position r.  This keeps the index inside
+        the array (it would go negative once r >= n) and avoids
+        repeating a position already covered by the loop above. */
+     for (r = 0; r < impulse_rounds; ++r) {
+          if (r >= n - r - 1)
+               continue;
+
+          caset(a, n, czero);
+          c_re(a[n - r - 1]) = c_im(a[n - r - 1]) = 1.0;
+
+          if (one_accuracy_test(k, constrain, sign, n, a, b, t))
+               ++ntests;
+     }
+
+     /* randomly-located impulses; skipped for an empty array, where
+        rand() % n would divide by zero */
+     for (r = 0; n > 0 && r < impulse_rounds; ++r) {
+          caset(a, n, czero);
+          i = rand() % n;
+          c_re(a[i]) = c_im(a[i]) = 1.0;
+
+          if (one_accuracy_test(k, constrain, sign, n, a, b, t))
+               ++ntests;
+     }
+
+     /* turn the sums into a mean and an RMS; with no measurements the
+        sums are all zero and are left untouched instead of dividing
+        by zero */
+     if (ntests > 0) {
+          t[0] /= ntests;
+          t[1] = sqrt(t[1] / ntests);
+          t[3] /= ntests;
+          t[4] = sqrt(t[4] / ntests);
+     }
+
+     fftaccuracy_done();
+}
--- a/fftw-3.3.10/libbench2/verify-r2r.c
+++ b/fftw-3.3.10/libbench2/verify-r2r.c
@@ -0,0 +1,964 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Lots of ugly duplication from verify-lib.c, plus lots of ugliness in
+   general for all of the r2r variants...oh well, for now */
+
+#include "verify.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Per-problem bookkeeping shared by the r2r verification tests;
+   filled in by verify_r2r(). */
+typedef struct {
+     bench_problem *p;         /* the problem being verified */
+     bench_tensor *probsz;     /* p->sz */
+     bench_tensor *totalsz;    /* tensor_append(p->vecsz, probsz) */
+     bench_tensor *pckdsz;     /* verify_pack(totalsz, 1) */
+     bench_tensor *pckdvecsz;  /* verify_pack(p->vecsz, tensor_sz(probsz)) */
+} info;
+
+/*
+ * Utility functions:
+ */
+
+/* magnitude of x */
+static double dabs(double x)
+{
+     if (x < 0.0)
+	  return -x;
+     return x;
+}
+/* smaller of x and y (y when they compare equal or unordered) */
+static double dmin(double x, double y)
+{
+     if (x < y)
+	  return x;
+     return y;
+}
+
+/*
+ * Relative L-infinity error between the real arrays A and B of length
+ * N: the largest |a[i] - b[i]| divided by the largest
+ * min(|a[i]|, |b[i]|).  Returns 0 for n <= 0, and also when both the
+ * difference and the magnitude are below 1e-14.
+ */
+static double raerror(R *a, R *b, int n)
+{
+     double maxdiff = 0.0, scale = 0.0;
+     int idx;
+
+     if (n <= 0)
+          return 0.0;
+
+     for (idx = 0; idx < n; ++idx) {
+          double smaller = dmin(dabs(a[idx]), dabs(b[idx]));
+
+          maxdiff = dmax(maxdiff, dabs(a[idx] - b[idx]));
+          scale = dmax(scale, smaller);
+     }
+
+     /* both tiny: report exact agreement instead of dividing 0 by 0 */
+     if (dabs(scale) < 1e-14 && dabs(maxdiff) < 1e-14)
+          maxdiff = 0.0;
+     else
+          maxdiff /= scale;
+
+#ifdef HAVE_ISNAN
+     BENCH_ASSERT(!isnan(maxdiff));
+#endif
+     return maxdiff;
+}
+
+/* angle 2*pi*m/n, in terms of the project constant K2PI */
+#define by2pi(m, n) ((K2PI * (m)) / (n))
+
+/*
+ * Improve accuracy by reducing x to range [0..1/8]
+ * before multiplication by 2 * PI.
+ */
+
+/*
+ * Return cos(2*pi*m/n) when SINP is zero, sin(2*pi*m/n) otherwise.
+ * The argument is folded with the symmetries of sine and cosine,
+ * switching between the two and tracking the sign in SGN, until
+ * 0 <= m <= n/8; only then is COS or SIN evaluated.
+ */
+static trigreal bench_sincos(trigreal m, trigreal n, int sinp)
+{
+     /* waiting for C to get tail recursion... */
+     trigreal half_n = n * 0.5;
+     trigreal quarter_n = half_n * 0.5;
+     trigreal eighth_n = quarter_n * 0.5;
+     trigreal sgn = 1.0;
+
+     if (sinp) goto sin;
+ cos:
+     /* cosine is even: cos(-x) = cos(x) */
+     if (m < 0) { m = -m; /* goto cos; */ }
+     /* cos(2*pi*(n-m)/n) = cos(2*pi*m/n) */
+     if (m > half_n) { m = n - m; goto cos; }
+     /* cos(x) = sin(pi/2 - x) */
+     if (m > eighth_n) { m = quarter_n - m; goto sin; }
+     return sgn * COS(by2pi(m, n));
+
+     /* entry point for "minus sine": flip the sign, then fall into sin */
+ msin:
+     sgn = -sgn;
+ sin:
+     /* sine is odd: sin(-x) = -sin(x) */
+     if (m < 0) { m = -m; goto msin; }
+     /* sin(2*pi*(n-m)/n) = -sin(2*pi*m/n) */
+     if (m > half_n) { m = n - m; goto msin; }
+     /* sin(x) = cos(pi/2 - x) */
+     if (m > eighth_n) { m = quarter_n - m; goto cos; }
+     return sgn * SIN(by2pi(m, n));
+}
+
+/* cos(2*pi*m/n) for integer m and n */
+static trigreal cos2pi(int m, int n)
+{
+     const trigreal num = (trigreal)m;
+     const trigreal den = (trigreal)n;
+
+     return bench_sincos(num, den, 0);
+}
+
+/* sin(2*pi*m/n) for integer m and n */
+static trigreal sin2pi(int m, int n)
+{
+     const trigreal num = (trigreal)m;
+     const trigreal den = (trigreal)n;
+
+     return bench_sincos(num, den, 1);
+}
+
+/* cos(2*pi*i*j/n) */
+static trigreal cos00(int i, int j, int n)
+{
+     const int prod = i * j;
+
+     return cos2pi(prod, n);
+}
+
+/* cos(2*pi*i*(2j+1)/(2n)) */
+static trigreal cos01(int i, int j, int n)
+{
+     const int jodd = 2*j + 1;
+
+     return cos2pi(i * jodd, 2*n);
+}
+
+/* cos(2*pi*(2i+1)*j/(2n)) */
+static trigreal cos10(int i, int j, int n)
+{
+     const int iodd = 2*i + 1;
+
+     return cos2pi(iodd * j, 2*n);
+}
+
+/* cos(2*pi*(2i+1)*(2j+1)/(4n)) */
+static trigreal cos11(int i, int j, int n)
+{
+     const int iodd = 2*i + 1;
+     const int jodd = 2*j + 1;
+
+     return cos2pi(iodd * jodd, 4*n);
+}
+
+/* sin(2*pi*i*j/n) */
+static trigreal sin00(int i, int j, int n)
+{
+     const int prod = i * j;
+
+     return sin2pi(prod, n);
+}
+
+/* sin(2*pi*i*(2j+1)/(2n)) */
+static trigreal sin01(int i, int j, int n)
+{
+     const int jodd = 2*j + 1;
+
+     return sin2pi(i * jodd, 2*n);
+}
+
+/* sin(2*pi*(2i+1)*j/(2n)) */
+static trigreal sin10(int i, int j, int n)
+{
+     const int iodd = 2*i + 1;
+
+     return sin2pi(iodd * j, 2*n);
+}
+
+/* sin(2*pi*(2i+1)*(2j+1)/(4n)) */
+static trigreal sin11(int i, int j, int n)
+{
+     const int iodd = 2*i + 1;
+     const int jodd = 2*j + 1;
+
+     return sin2pi(iodd * jodd, 4*n);
+}
+
+/* 1 for indices j in the first half (j <= n - j), 0 otherwise;
+   i is ignored */
+static trigreal realhalf(int i, int j, int n)
+{
+     UNUSED(i);
+     return (j <= n - j) ? 1.0 : 0.0;
+}
+
+/* cos00 evaluated at j for the first half (j <= n - j) and at the
+   mirrored index n - j for the second half */
+static trigreal coshalf(int i, int j, int n)
+{
+     const int folded = (j <= n - j) ? j : n - j;
+
+     return cos00(i, folded, n);
+}
+
+/* constant 1, independent of all three arguments */
+static trigreal unity(int i, int j, int n)
+{
+     const trigreal one = 1.0;
+
+     UNUSED(i);
+     UNUSED(j);
+     UNUSED(n);
+     return one;
+}
+
+/* signature shared by the (i, j, n) trig helpers above, so that they can
+   be stored in dim_stuff and passed around as impulse/shift functions */
+typedef trigreal (*trigfun)(int, int, int);
+
+/* fill a[0..n-1] with successive values of mydrand() */
+static void rarand(R *a, int n)
+{
+     int pos = 0;
+
+     while (pos < n) {
+	  a[pos] = mydrand();
+	  ++pos;
+     }
+}
+
+/* elementwise sum of two real arrays: c[i] = a[i] + b[i], 0 <= i < n */
+static void raadd(R *c, R *a, R *b, int n)
+{
+     int pos = 0;
+
+     while (pos < n) {
+	  c[pos] = a[pos] + b[pos];
+	  ++pos;
+     }
+}
+
+/* elementwise difference of two real arrays: c[i] = a[i] - b[i],
+   0 <= i < n */
+static void rasub(R *c, R *a, R *b, int n)
+{
+     int pos = 0;
+
+     while (pos < n) {
+	  c[pos] = a[pos] - b[pos];
+	  ++pos;
+     }
+}
+
+/* B = rotate left A + rotate right A */
+/*
+ * A and B are indexed as [ib][i][ia] with extents nb x n x na, i.e.
+ * element (ib, i, ia) lives at (ib * n + i) * na + ia.  Along the
+ * middle dimension this computes b[i] = a[i+1] + a[i-1]; the terms
+ * that fall off either end are supplied by the boundary condition
+ * selected by the r2r kind K.
+ */
+static void rarolr(R *b, R *a, int n, int nb, int na, 
+		   r2r_kind_t k)
+{
+     /* coefficients used by the "mirrors" code below:
+	b[0]   += isL0 * a[0]   + isL1 * a[1]
+	b[n-1]  = isR0 * a[n-1] + isR1 * a[n-2] */
+     int isL0 = 0, isL1 = 0, isR0 = 0, isR1 = 0;
+     int i, ib, ia;
+
+     for (ib = 0; ib < nb; ++ib) {
+	  /* interior part of the left rotation: b[i] = a[i+1] */
+	  for (i = 0; i < n - 1; ++i)
+	       for (ia = 0; ia < na; ++ia)
+		    b[(ib * n + i) * na + ia] =
+			 a[(ib * n + i + 1) * na + ia];
+
+	  /* ugly switch to do boundary conditions for various r2r types */
+	  switch (k) {
+	       /* periodic boundaries */
+	      case R2R_DHT:
+	      case R2R_R2HC:
+		   /* wrap around: b[n-1] = a[0], b[0] += a[n-1] */
+		   for (ia = 0; ia < na; ++ia) {
+			b[(ib * n + n - 1) * na + ia] = 
+			     a[(ib * n + 0) * na + ia];
+			b[(ib * n + 0) * na + ia] += 
+			     a[(ib * n + n - 1) * na + ia];
+		   }
+		   break;
+		   
+	      case R2R_HC2R: /* ugh (hermitian halfcomplex boundaries) */
+		   if (n > 2) {
+			/* even and odd n get different corrections
+			   around index n/2 */
+			if (n % 2 == 0)
+			     for (ia = 0; ia < na; ++ia) {
+				  b[(ib * n + n - 1) * na + ia] = 0.0;
+				  b[(ib * n + 0) * na + ia] += 
+				       a[(ib * n + 1) * na + ia];
+				  b[(ib * n + n/2) * na + ia] += 
+				       + a[(ib * n + n/2 - 1) * na + ia]
+				       - a[(ib * n + n/2 + 1) * na + ia];
+				  b[(ib * n + n/2 + 1) * na + ia] += 
+				       - a[(ib * n + n/2) * na + ia];
+			     }
+			else 
+			     for (ia = 0; ia < na; ++ia) {
+				  b[(ib * n + n - 1) * na + ia] = 0.0;
+				  b[(ib * n + 0) * na + ia] += 
+				       a[(ib * n + 1) * na + ia];
+				  b[(ib * n + n/2) * na + ia] += 
+				       + a[(ib * n + n/2) * na + ia]
+				       - a[(ib * n + n/2 + 1) * na + ia];
+				  b[(ib * n + n/2 + 1) * na + ia] += 
+				       - a[(ib * n + n/2 + 1) * na + ia]
+				       - a[(ib * n + n/2) * na + ia];
+			     }
+		   } else /* n <= 2 */ {
+			/* same wrap-around as the periodic case */
+			for (ia = 0; ia < na; ++ia) {
+			     b[(ib * n + n - 1) * na + ia] =
+				  a[(ib * n + 0) * na + ia];
+			     b[(ib * n + 0) * na + ia] += 
+				  a[(ib * n + n - 1) * na + ia];
+			}
+		   }
+		   break;
+		   
+	      /* various even/odd boundary conditions */
+	      /* each case only selects the mirror coefficients; the
+		 shared code at "mirrors" applies them */
+	      case R2R_REDFT00:
+		   isL1 = isR1 = 1;
+		   goto mirrors;
+	      case R2R_REDFT01:
+		   isL1 = 1;
+		   goto mirrors;
+	      case R2R_REDFT10:
+		   isL0 = isR0 = 1;
+		   goto mirrors;
+	      case R2R_REDFT11:
+		   isL0 = 1;
+		   isR0 = -1;
+		   goto mirrors;
+	      case R2R_RODFT00:
+		   goto mirrors;
+	      case R2R_RODFT01:
+		   isR1 = 1;
+		   goto mirrors;
+	      case R2R_RODFT10:
+		   isL0 = isR0 = -1;
+		   goto mirrors;
+	      case R2R_RODFT11:
+		   isL0 = -1;
+		   isR0 = 1;
+		   goto mirrors;
+
+	  mirrors:
+		   
+		   /* right end; the a[n-2] term is dropped when n == 1 */
+		   for (ia = 0; ia < na; ++ia)
+			b[(ib * n + n - 1) * na + ia] = 
+			     isR0 * a[(ib * n + n - 1) * na + ia]
+			     + (n > 1 ? isR1 * a[(ib * n + n - 2) * na + ia]
+				: 0);
+		   
+		   /* left end; the a[1] term is dropped when n == 1 */
+		   for (ia = 0; ia < na; ++ia)
+			b[(ib * n) * na + ia] += 
+			     isL0 * a[(ib * n) * na + ia]
+			     + (n > 1 ? isL1 * a[(ib * n + 1) * na + ia] : 0);
+		   
+	  }
+
+	  /* interior part of the right rotation: b[i] += a[i-1] */
+	  for (i = 1; i < n; ++i)
+	       for (ia = 0; ia < na; ++ia)
+		    b[(ib * n + i) * na + ia] +=
+			 a[(ib * n + i - 1) * na + ia];
+     }
+}
+
+/*
+ * Scale A into B along the middle dimension of an nb x n x na array:
+ * every element with middle index j is multiplied by
+ * 2 * t(1, j + k0, n0).
+ */
+static void raphase_shift(R *b, R *a, int n, int nb, int na,
+			 int n0, int k0, trigfun t)
+{
+     int row, col, inner;
+
+     for (row = 0; row < nb; ++row) {
+	  for (col = 0; col < n; ++col) {
+	       const trigreal factor = 2.0 * t(1, col + k0, n0);
+	       const int base = (row * n + col) * na;
+
+	       for (inner = 0; inner < na; ++inner)
+		    b[base + inner] = a[base + inner] * factor;
+	  }
+     }
+}
+
+/* multiply every element of the real array a[0..n-1] by alpha,
+   in place */
+static void rascale(R *a, R alpha, int n)
+{
+     int pos = 0;
+
+     while (pos < n) {
+	  a[pos] *= alpha;
+	  ++pos;
+     }
+}
+
+/*
+ * compute rdft:
+ */
+
+/* copy real A into real B, using output stride of A and input stride of B */
+typedef struct {
+     dotens2_closure k;  /* base closure; must stay first, since cpyr0
+			    casts the base pointer back to this struct */
+     R *ra;              /* source array A */
+     R *rb;              /* destination array B */
+} cpyr_closure;
+
+/* per-element callback for cpyr(): B[input index of B] = A[output
+   index of A]; the other two indices are not used */
+static void cpyr0(dotens2_closure *k_, 
+		  int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpyr_closure *self = (cpyr_closure *)k_;
+     R *dst = self->rb + indxb;
+
+     *dst = self->ra[ondxa];
+     UNUSED(indxa); UNUSED(ondxb);
+}
+
+/* copy real array RA (described by SZA) into RB (described by SZB) by
+   running cpyr0 over both tensors with bench_dotens2() */
+static void cpyr(R *ra, bench_tensor *sza, R *rb, bench_tensor *szb)
+{
+     cpyr_closure copier;
+
+     copier.ra = ra;
+     copier.rb = rb;
+     copier.k.apply = cpyr0;
+     bench_dotens2(sza, szb, &copier.k);
+}
+
+/*
+ * Run the problem's transform once on the packed real input IN and
+ * deliver the packed real result in OUT: IN is scattered into the
+ * problem's input buffer, the transform is executed, and the
+ * problem's output buffer is gathered into OUT.
+ */
+static void dofft(info *nfo, R *in, R *out)
+{
+     bench_problem *prob = nfo->p;
+
+     cpyr(in, nfo->pckdsz, (R *) prob->in, nfo->totalsz);
+     after_problem_rcopy_from(prob, (bench_real *)prob->in);
+     doit(1, prob);
+     after_problem_rcopy_to(prob, (bench_real *)prob->out);
+     cpyr((R *) prob->out, nfo->totalsz, out, nfo->pckdsz);
+}
+
+/*
+ * Compare the real arrays A and B of length N and return their
+ * relative error.  If the error exceeds TOL, report it under the
+ * name TEST, dump the arrays (at most 300 entries unless verbose > 2)
+ * and call bench_exit(EXIT_FAILURE).
+ */
+static double racmp(R *a, R *b, int n, const char *test, double tol)
+{
+     const double relerr = raerror(a, b, n);
+     int row, limit;
+
+     /* within tolerance: nothing to report */
+     if (!(relerr > tol))
+	  return relerr;
+
+     ovtpvt_err("Found relative error %e (%s)\n", relerr, test);
+
+     limit = n > 300 && verbose <= 2 ? 300 : n;
+     for (row = 0; row < limit; ++row)
+	  ovtpvt_err("%8d %16.12f   %16.12f\n", row, 
+		     (double) a[row],
+		     (double) b[row]);
+
+     bench_exit(EXIT_FAILURE);
+     return relerr;
+}
+
+/***********************************************************************/
+
+/* Per-dimension description of an r2r transform, filled in by
+   verify_r2r() from the problem size and the r2r kind. */
+typedef struct {
+     int n; /* physical size */
+     int n0; /* "logical" transform size */
+     int i0, k0; /* shifts of input/output */
+     trigfun ti, ts;  /* impulse/shift trig functions */
+} dim_stuff;
+
+/*
+ * Fill A (N elements, RNK dimensions described by D[0..rnk-1]) with
+ * the expected impulse response: IMPULSE_AMP multiplied, for each
+ * dimension, by that dimension's ti(i0, k0 + index, n0).  Recurses
+ * one dimension at a time; rank 0 stores the accumulated amplitude.
+ */
+static void impulse_response(int rnk, dim_stuff *d, R impulse_amp,
+			     R *A, int N)
+{
+     int idx, sub;
+
+     if (rnk == 0) {
+	  A[0] = impulse_amp;
+	  return;
+     }
+
+     /* number of elements in one slice of the leading dimension */
+     sub = N / d->n;
+     for (idx = 0; idx < d->n; ++idx) {
+	  R amp = impulse_amp * d->ti(d->i0, d->k0 + idx, d->n0);
+
+	  impulse_response(rnk - 1, d + 1, amp, A + idx * sub, sub);
+     }
+}
+
+/***************************************************************************/
+
+/*
+ * Implementation of the FFT tester described in
+ *
+ * Funda Ergün. Testing multivariate linear functions: Overcoming the
+ * generator bottleneck. In Proceedings of the Twenty-Seventh Annual
+ * ACM Symposium on the Theory of Computing, pages 407-416, Las Vegas,
+ * Nevada, 29 May--1 June 1995.
+ *
+ * Also: F. Ergun, S. R. Kumar, and D. Sivakumar, "Self-testing without
+ * the generator bottleneck," SIAM J. on Computing 29 (5), 1630-51 (2000).
+ */
+
+/*
+ * Linearity test: for ROUNDS random pairs of inputs and random
+ * scalars alpha and beta, check that
+ * transform(alpha*A + beta*B) matches
+ * alpha*transform(A) + beta*transform(B) to within TOL.
+ * Returns the largest relative error seen.
+ */
+static double rlinear(int n, info *nfo, R *inA, R *inB, R *inC, R *outA,
+		      R *outB, R *outC, R *tmp, int rounds, double tol)
+{
+     double worst = 0.0;
+     int round = 0;
+
+     while (round < rounds) {
+	  R alpha, beta;
+	  double err;
+
+	  alpha = mydrand();
+	  beta = mydrand();
+	  rarand(inA, n);
+	  rarand(inB, n);
+	  dofft(nfo, inA, outA);
+	  dofft(nfo, inB, outB);
+
+	  /* tmp = alpha * F(A) + beta * F(B) */
+	  rascale(outA, alpha, n);
+	  rascale(outB, beta, n);
+	  raadd(tmp, outA, outB, n);
+
+	  /* outC = F(alpha * A + beta * B) */
+	  rascale(inA, alpha, n);
+	  rascale(inB, beta, n);
+	  raadd(inC, inA, inB, n);
+	  dofft(nfo, inC, outC);
+
+	  err = racmp(outC, tmp, n, "linear", tol);
+	  worst = dmax(worst, err);
+	  ++round;
+     }
+     return worst;
+}
+
+/*
+ * Impulse test.  Builds an input holding one scaled impulse per
+ * vector element, computes the expected response with
+ * impulse_response(), and checks the transform against it.  Then,
+ * ROUNDS times, splits the impulse into a random array plus the
+ * remainder and checks that the two transforms add up to the same
+ * response.  Returns the largest relative error seen.
+ */
+static double rimpulse(dim_stuff *d, R impulse_amp,
+		       int n, int vecn, info *nfo, 
+		       R *inA, R *inB, R *inC,
+		       R *outA, R *outB, R *outC,
+		       R *tmp, int rounds, double tol)
+{
+     const int total = n * vecn;
+     double worst = 0.0;
+     int pos, vec, round;
+
+     /* test 2: check that the unit impulse is transformed properly */
+
+     /* start from an all-zero input */
+     for (pos = 0; pos < total; ++pos)
+	  inA[pos] = 0.0;
+
+     for (vec = 0; vec < vecn; ++vec) {
+	  R *impulse = inA + vec * n;
+
+	  /* a different amplitude for each vector element */
+	  impulse[0] = (vec+1) / (double)(vecn+1);
+
+	  /* expected transform of that impulse */
+	  impulse_response(nfo->probsz->rnk, d, impulse_amp * impulse[0],
+			   outA + vec * n, n);
+     }
+
+     dofft(nfo, inA, tmp);
+     worst = dmax(worst, racmp(tmp, outA, total, "impulse 1", tol));
+
+     for (round = 0; round < rounds; ++round) {
+	  /* inB random, inC = inA - inB, so F(inB) + F(inC) = F(inA) */
+	  rarand(inB, total);
+	  rasub(inC, inA, inB, total);
+	  dofft(nfo, inB, outB);
+	  dofft(nfo, inC, outC);
+	  raadd(tmp, outB, outC, total);
+	  worst = dmax(worst, racmp(tmp, outA, total, "impulse", tol));
+     }
+     return worst;
+}
+
+/*
+ * Time-shift test.  For every dimension of the problem and for
+ * ROUNDS random inputs A, builds B = rarolr(A) along that dimension
+ * and checks that transform(B) matches transform(A) scaled by
+ * raphase_shift() along the same dimension.  Returns the largest
+ * relative error seen.
+ */
+static double t_shift(int n, int vecn, info *nfo, 
+		      R *inA, R *inB, R *outA, R *outB, R *tmp,
+		      int rounds, double tol,
+		      dim_stuff *d)
+{
+     bench_tensor *sz = nfo->probsz;
+     const int total = n * vecn;
+     double worst = 0.0;
+     int before = 1;   /* product of the sizes of earlier dimensions */
+     int after = n;    /* product of the sizes of later dimensions */
+     int dim, round, vec;
+
+     /* test 3: check the time-shift property */
+     /* the paper performs more tests, but this code should be fine too */
+
+     /* check shifts across all SZ dimensions */
+     for (dim = 0; dim < sz->rnk; ++dim) {
+	  const int len = sz->dims[dim].n;
+
+	  after /= len;
+
+	  for (round = 0; round < rounds; ++round) {
+	       rarand(inA, total);
+
+	       for (vec = 0; vec < vecn; ++vec)
+		    rarolr(inB + vec * n, inA + vec * n, len, before, after,
+			   nfo->p->k[dim]);
+
+	       dofft(nfo, inA, outA);
+	       dofft(nfo, inB, outB);
+
+	       for (vec = 0; vec < vecn; ++vec)
+		    raphase_shift(tmp + vec * n, outA + vec * n, len,
+				  before, after,
+				  d[dim].n0, d[dim].k0, d[dim].ts);
+
+	       worst = dmax(worst,
+			    racmp(tmp, outB, total, "time shift", tol));
+	  }
+
+	  before *= len;
+     }
+     return worst;
+}
+
+/***********************************************************************/
+
+/*
+ * Verify an r2r problem P with the impulse, linearity and time-shift
+ * tests, storing the largest relative errors in e->i, e->l and e->s.
+ * ROUNDS == 0 selects the default of 20 rounds.  When the problem is
+ * neither in-place nor input-destroying, also checks that the
+ * transform leaves its input buffer unchanged.
+ */
+void verify_r2r(bench_problem *p, int rounds, double tol, errors *e)
+{
+     R *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
+     info nfo;
+     int n, vecn, N;
+     double impulse_amp = 1.0;
+     dim_stuff *d;
+     int i;
+
+     if (rounds == 0)
+	  rounds = 20;  /* default value */
+
+     n = tensor_sz(p->sz);
+     vecn = tensor_sz(p->vecsz);
+     N = n * vecn;
+
+     /* describe every dimension: sizes, index shifts and the trig
+	functions used for the impulse response and the time shift */
+     d = (dim_stuff *) bench_malloc(sizeof(dim_stuff) * p->sz->rnk);
+     for (i = 0; i < p->sz->rnk; ++i) {
+	  int n0, i0, k0;
+	  trigfun ti, ts;
+
+	  d[i].n = n0 = p->sz->dims[i].n;
+	  /* kinds after R2R_DHT use a logical size of 2n, 2(n-1) for
+	     REDFT00 and 2(n+1) for RODFT00 */
+	  if (p->k[i] > R2R_DHT)
+	       n0 = 2 * (n0 + (p->k[i] == R2R_REDFT00 ? -1 : 
+			       (p->k[i] == R2R_RODFT00 ? 1 : 0)));
+	  
+	  switch (p->k[i]) {
+	      case R2R_R2HC:
+		   i0 = k0 = 0;
+		   ti = realhalf;
+		   ts = coshalf;
+		   break;
+	      case R2R_DHT:
+		   i0 = k0 = 0;
+		   ti = unity;
+		   ts = cos00;
+		   break;
+	      case R2R_HC2R:
+		   i0 = k0 = 0;
+		   ti = unity;
+		   ts = cos00;
+		   break;
+	      case R2R_REDFT00:
+		   i0 = k0 = 0;
+		   ti = ts = cos00;
+		   break;
+	      case R2R_REDFT01:
+		   i0 = k0 = 0;
+		   ti = ts = cos01;
+		   break;
+	      case R2R_REDFT10:
+		   i0 = k0 = 0;
+		   ti = cos10; impulse_amp *= 2.0;
+		   ts = cos00;
+		   break;
+	      case R2R_REDFT11:
+		   i0 = k0 = 0;
+		   ti = cos11; impulse_amp *= 2.0;
+		   ts = cos01;
+		   break;
+	      case R2R_RODFT00:
+		   i0 = k0 = 1;
+		   ti = sin00; impulse_amp *= 2.0;
+		   ts = cos00;
+		   break;
+	      case R2R_RODFT01:
+		   i0 = 1; k0 = 0;
+		   ti = sin01; impulse_amp *= n == 1 ? 1.0 : 2.0;
+		   ts = cos01;
+		   break;
+	      case R2R_RODFT10:
+		   i0 = 0; k0 = 1;
+		   ti = sin10; impulse_amp *= 2.0;
+		   ts = cos00;
+		   break;
+	      case R2R_RODFT11:
+		   i0 = k0 = 0;
+		   ti = sin11; impulse_amp *= 2.0;
+		   ts = cos01;
+		   break;
+	      default:
+		   BENCH_ASSERT(0);
+		   /* only reached if the assertion does not abort:
+		      do not leak the dimension table */
+		   bench_free(d);
+		   return;
+	  }
+
+	  d[i].n0 = n0;
+	  d[i].i0 = i0;
+	  d[i].k0 = k0;
+	  d[i].ti = ti;
+	  d[i].ts = ts;
+     }
+
+
+     /* seven packed work arrays of N reals each */
+     inA = (R *) bench_malloc(N * sizeof(R));
+     inB = (R *) bench_malloc(N * sizeof(R));
+     inC = (R *) bench_malloc(N * sizeof(R));
+     outA = (R *) bench_malloc(N * sizeof(R));
+     outB = (R *) bench_malloc(N * sizeof(R));
+     outC = (R *) bench_malloc(N * sizeof(R));
+     tmp = (R *) bench_malloc(N * sizeof(R));
+
+     nfo.p = p;
+     nfo.probsz = p->sz;
+     nfo.totalsz = tensor_append(p->vecsz, nfo.probsz);
+     nfo.pckdsz = verify_pack(nfo.totalsz, 1);
+     nfo.pckdvecsz = verify_pack(p->vecsz, tensor_sz(nfo.probsz));
+
+     e->i = rimpulse(d, impulse_amp, n, vecn, &nfo,
+		     inA, inB, inC, outA, outB, outC, tmp, rounds, tol);
+     e->l = rlinear(N, &nfo, inA, inB, inC, outA, outB, outC, tmp, rounds,tol);
+     e->s = t_shift(n, vecn, &nfo, inA, inB, outA, outB, tmp, 
+		    rounds, tol, d);
+
+     /* grr, verify-lib.c:preserves_input() only works for complex */
+     if (!p->in_place && !p->destroy_input) {
+	  bench_tensor *totalsz_swap, *pckdsz_swap;
+	  totalsz_swap = tensor_copy_swapio(nfo.totalsz);
+	  pckdsz_swap = tensor_copy_swapio(nfo.pckdsz);
+
+	  for (i = 0; i < rounds; ++i) {
+	       rarand(inA, N);
+	       dofft(&nfo, inA, outB);
+	       /* read the problem's input buffer back and require an
+		  exact match (tolerance 0) with what was fed in */
+	       cpyr((R *) nfo.p->in, totalsz_swap, inB, pckdsz_swap);
+	       racmp(inB, inA, N, "preserves_input", 0.0);
+	  }
+
+	  tensor_destroy(totalsz_swap);
+	  tensor_destroy(pckdsz_swap);
+     }
+
+     tensor_destroy(nfo.totalsz);
+     tensor_destroy(nfo.pckdsz);
+     tensor_destroy(nfo.pckdvecsz);
+     bench_free(tmp);
+     bench_free(outC);
+     bench_free(outB);
+     bench_free(outA);
+     bench_free(inC);
+     bench_free(inB);
+     bench_free(inA);
+     bench_free(d);
+}
+
+
+/* closure handed to accuracy_test() for r2r problems */
+typedef struct {
+     dofft_closure k;   /* base closure; r2r_apply casts the base
+			   pointer back to this struct */
+     bench_problem *p;  /* the r2r problem to run */
+     int n0;            /* length of the complex arrays, as computed in
+			   accuracy_r2r() from the r2r kind */
+} dofft_r2r_closure;
+
+/* strided, scaled copy: out[i * os] = in[i * is] * scale for
+   0 <= i < n (strides may be negative) */
+static void cpyr1(int n, R *in, int is, R *out, int os, R scale)
+{
+     int cnt = 0;
+
+     while (cnt < n) {
+	  const R val = in[cnt * is];
+
+	  out[cnt * os] = val * scale;
+	  ++cnt;
+     }
+}
+
+/* make component c of a[0..n-1] even: a[n - i][c] = a[i][c] for
+   every i >= 1 with 2i < n */
+static void mke00(C *a, int n, int c)
+{
+     int lo, hi;
+
+     for (lo = 1, hi = n - 1; lo < hi; ++lo, --hi)
+	  a[hi][c] = a[lo][c];
+}
+
+/* apply mkreal() to a[0..n-1], then mirror the real parts with
+   mke00() so that re(a[n-i]) = re(a[i]) */
+static void mkre00(C *a, int n)
+{
+     mkreal(a, n);
+     mke00(a, n, 0);
+}
+
+/* zero the real part of every element of a[0..n-1] */
+static void mkimag(C *a, int n)
+{
+     int pos = 0;
+
+     while (pos < n) {
+	  c_re(a[pos]) = 0.0;
+	  ++pos;
+     }
+}
+
+/* make component c of a[0..n-1] odd: a[0][c] = 0,
+   a[n - i][c] = -a[i][c] for every i >= 1 with 2i < n, and the
+   middle element a[n/2][c] = 0 when n is even */
+static void mko00(C *a, int n, int c)
+{
+     int lo = 1, hi = n - 1;
+
+     a[0][c] = 0.0;
+     while (lo < hi) {
+	  a[hi][c] = -a[lo][c];
+	  ++lo;
+	  --hi;
+     }
+     /* lo == hi exactly when 2 * lo == n */
+     if (lo == hi)
+	  a[lo][c] = 0.0;
+}
+
+/* apply mkreal() to a[0..n-1], then make the real parts odd with
+   mko00() */
+static void mkro00(C *a, int n)
+{
+     mkreal(a, n);
+     mko00(a, n, 0);
+}
+
+/* zero the real parts of a[0..n-1] with mkimag(), then make the
+   imaginary parts odd with mko00() */
+static void mkio00(C *a, int n)
+{
+     mkimag(a, n);
+     mko00(a, n, 1);
+}
+
+/* impose the symmetry used for REDFT10 outputs: real parts odd
+   within the first n/2 elements except that re(a[0]) is kept and
+   re(a[n/2]) is set to its negative, then real-even over all n
+   elements.  n should be a multiple of 4 */
+static void mkre01(C *a, int n)
+{
+     const R first = c_re(a[0]);
+
+     mko00(a, n/2, 0);
+     /* mko00 zeroed re(a[0]); restore it and negate it at n/2 */
+     c_re(a[0]) = first;
+     c_re(a[n/2]) = -first;
+     mkre00(a, n);
+}
+
+/* zero a[0], apply mkre00() to the first n/2 elements, then apply
+   mkro00() to all n elements */
+static void mkro01(C *a, int n) /* n should be a multiple of 4 */
+{
+     c_re(a[0]) = c_im(a[0]) = 0.0;
+     mkre00(a, n/2);
+     mkro00(a, n);
+}
+
+/* zero every even-indexed element of a[0..n-1], leaving only the
+   odd-indexed ones */
+static void mkoddonly(C *a, int n)
+{
+     int even = 0;
+
+     while (even < n) {
+	  c_im(a[even]) = 0.0;
+	  c_re(a[even]) = 0.0;
+	  even += 2;
+     }
+}
+
+/* keep only odd-indexed elements, then apply mkre00() */
+static void mkre10(C *a, int n)
+{
+     mkoddonly(a, n);
+     mkre00(a, n);
+}
+
+/* keep only odd-indexed elements, then apply mkio00() */
+static void mkio10(C *a, int n)
+{
+     mkoddonly(a, n);
+     mkio00(a, n);
+}
+
+/* keep only odd-indexed elements, make the real parts odd within the
+   first n/2 elements, then apply mkre00() to all n elements */
+static void mkre11(C *a, int n)
+{
+     mkoddonly(a, n);
+     mko00(a, n/2, 0);
+     mkre00(a, n);
+}
+
+/* keep only odd-indexed elements, apply mkre00() to the first n/2
+   elements, then apply mkro00() to all n elements */
+static void mkro11(C *a, int n)
+{
+     mkoddonly(a, n);
+     mkre00(a, n/2);
+     mkro00(a, n);
+}
+
+/* keep only odd-indexed elements, make the imaginary parts even
+   within the first n/2 elements, then apply mkio00() to all n
+   elements */
+static void mkio11(C *a, int n)
+{
+     mkoddonly(a, n);
+     mke00(a, n/2, 1);
+     mkio00(a, n);
+}
+
+/*
+ * dofft_closure callback for 1D r2r problems.  Packs the complex test
+ * input IN into the problem's real input buffer in the layout required
+ * by the r2r kind p->k[0], runs the transform, then unpacks the real
+ * output into the complex array OUT and extends it with the matching
+ * symmetry helper, using k->n0 as the array length.  When
+ * k->k.recopy_input is set, the problem's input buffer is also copied
+ * back into IN after the transform.
+ */
+static void r2r_apply(dofft_closure *k_, bench_complex *in, bench_complex *out)
+{
+     dofft_r2r_closure *k = (dofft_r2r_closure *)k_;
+     bench_problem *p = k->p;
+     bench_real *ri, *ro;
+     int n, is, os;
+
+     /* only the first dimension is used */
+     n = p->sz->dims[0].n;
+     is = p->sz->dims[0].is;
+     os = p->sz->dims[0].os;
+
+     ri = (bench_real *) p->in;
+     ro = (bench_real *) p->out;
+
+     /* pack: IN is read with a stride of 2 or 4 reals, i.e. every
+	complex element or every other one */
+     switch (p->k[0]) {
+	 case R2R_R2HC:
+	      cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_HC2R:
+	      /* real parts go to the front of ri; imaginary parts are
+		 written backwards from ri[is*(n-1)] */
+	      cpyr1(n/2 + 1, &c_re(in[0]), 2, ri, is, 1.0);
+	      cpyr1((n+1)/2 - 1, &c_im(in[n-1]), -2, ri + is*(n-1), -is, 1.0);
+	      break;
+	 case R2R_REDFT00:
+	      cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_RODFT00:
+	      cpyr1(n, &c_re(in[1]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_REDFT01:
+	      cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_REDFT10:
+	      cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
+	      break;
+	 case R2R_RODFT01:
+	      cpyr1(n, &c_re(in[1]), 2, ri, is, 1.0);
+	      break;
+	 case R2R_RODFT10:
+	      cpyr1(n, &c_im(in[1]), 4, ri, is, 1.0);
+	      break;
+	 case R2R_REDFT11:
+	      cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
+	      break;
+	 case R2R_RODFT11:
+	      cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
+	      break;
+	 default:
+	      BENCH_ASSERT(0); /* not yet implemented */
+     }
+
+     after_problem_rcopy_from(p, ri);
+     doit(1, p);
+     after_problem_rcopy_to(p, ro);
+
+     /* unpack: optionally restore IN from the problem's input buffer,
+	then spread the real output into OUT and impose the symmetry
+	for this kind */
+     /* NOTE(review): for several kinds (e.g. RODFT00, REDFT10, RODFT11)
+	the recopy below uses a different component or scale than the
+	pack step above; recopy_input is set to 0 by accuracy_r2r, so
+	confirm the intent before relying on it */
+     switch (p->k[0]) {
+	 case R2R_R2HC:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
+	      /* halfcomplex output: real parts from the front of ro,
+		 imaginary parts read backwards from ro[os*(n-1)] */
+	      cpyr1(n/2 + 1, ro, os, &c_re(out[0]), 2, 1.0);
+	      cpyr1((n+1)/2 - 1, ro + os*(n-1), -os, &c_im(out[1]), 2, 1.0);
+	      c_im(out[0]) = 0.0;
+	      if (n % 2 == 0)
+		   c_im(out[n/2]) = 0.0;
+	      mkhermitian1(out, n);
+	      break;
+	 case R2R_HC2R:
+	      if (k->k.recopy_input) {
+		   cpyr1(n/2 + 1, ri, is, &c_re(in[0]), 2, 1.0);
+		   cpyr1((n+1)/2 - 1, ri + is*(n-1), -is, &c_im(in[1]), 2,1.0);
+	      }
+	      cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
+	      mkreal(out, n);
+	      break;
+	 case R2R_REDFT00:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
+	      cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
+	      mkre00(out, k->n0);
+	      break;
+	 case R2R_RODFT00:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_im(in[1]), 2, -1.0);
+	      cpyr1(n, ro, os, &c_im(out[1]), 2, -1.0);
+	      mkio00(out, k->n0);
+	      break;
+	 case R2R_REDFT01:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
+	      cpyr1(n, ro, os, &c_re(out[1]), 4, 2.0);
+	      mkre10(out, k->n0);
+	      break;
+	 case R2R_REDFT10:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[1]), 4, 2.0);
+	      cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
+	      mkre01(out, k->n0);
+	      break;
+	 case R2R_RODFT01:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[1]), 2, 1.0);
+	      cpyr1(n, ro, os, &c_im(out[1]), 4, -2.0);
+	      mkio10(out, k->n0);
+	      break;
+	 case R2R_RODFT10:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_im(in[1]), 4, -2.0);
+	      cpyr1(n, ro, os, &c_re(out[1]), 2, 1.0);
+	      mkro01(out, k->n0);
+	      break;
+	 case R2R_REDFT11:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_re(in[1]), 4, 2.0);
+	      cpyr1(n, ro, os, &c_re(out[1]), 4, 2.0);
+	      mkre11(out, k->n0);
+	      break;
+	 case R2R_RODFT11:
+	      if (k->k.recopy_input)
+		   cpyr1(n, ri, is, &c_im(in[1]), 4, -2.0);
+	      cpyr1(n, ro, os, &c_im(out[1]), 4, -2.0);
+	      mkio11(out, k->n0);
+	      break;
+	 default:
+	      BENCH_ASSERT(0); /* not yet implemented */
+     }
+}
+
+/*
+ * Measure the accuracy of a rank-1, non-vector r2r problem P.  The
+ * r2r kind selects the input constraint and the length n0 of the
+ * complex arrays handed to accuracy_test(), which is run with sign -1
+ * and fills T.
+ */
+void accuracy_r2r(bench_problem *p, int rounds, int impulse_rounds,
+		  double t[6])
+{
+     dofft_r2r_closure clo;
+     aconstrain constrain = 0;
+     int n, len = 1;
+     C *in, *out;
+
+     BENCH_ASSERT(p->kind == PROBLEM_R2R);
+     BENCH_ASSERT(p->sz->rnk == 1);
+     BENCH_ASSERT(p->vecsz->rnk == 0);
+
+     n = tensor_sz(p->sz);
+
+     /* constraint and complex array length for each kind */
+     switch (p->k[0]) {
+         case R2R_R2HC:
+	      constrain = mkreal; len = n;
+	      break;
+         case R2R_HC2R:
+	      constrain = mkhermitian1; len = n;
+	      break;
+         case R2R_REDFT00:
+	      constrain = mkre00; len = 2*(n-1);
+	      break;
+         case R2R_RODFT00:
+	      constrain = mkro00; len = 2*(n+1);
+	      break;
+         case R2R_REDFT01:
+	      constrain = mkre01; len = 4*n;
+	      break;
+         case R2R_REDFT10:
+	      constrain = mkre10; len = 4*n;
+	      break;
+         case R2R_RODFT01:
+	      constrain = mkro01; len = 4*n;
+	      break;
+         case R2R_RODFT10:
+	      constrain = mkio10; len = 4*n;
+	      break;
+         case R2R_REDFT11:
+	      constrain = mkre11; len = 8*n;
+	      break;
+         case R2R_RODFT11:
+	      constrain = mkro11; len = 8*n;
+	      break;
+	 default:
+	      BENCH_ASSERT(0); /* not yet implemented */
+     }
+
+     clo.k.apply = r2r_apply;
+     clo.k.recopy_input = 0;
+     clo.p = p;
+     clo.n0 = len;
+
+     in = (C *) bench_malloc(len * sizeof(C));
+     out = (C *) bench_malloc(len * sizeof(C));
+     accuracy_test(&clo.k, constrain, -1, len, in, out,
+		   rounds, impulse_rounds, t);
+     bench_free(out);
+     bench_free(in);
+}
--- a/fftw-3.3.10/libbench2/verify-rdft2.c
+++ b/fftw-3.3.10/libbench2/verify-rdft2.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "verify.h"
+
+/* copy real A into real B, using output stride of A and input stride of B */
+typedef struct {
+     dotens2_closure k;  /* base closure; must stay first, since cpyr0
+			    casts the base pointer back to this struct */
+     R *ra;              /* source array A */
+     R *rb;              /* destination array B */
+} cpyr_closure;
+
+/* per-element callback for cpyr(): B[input index of B] = A[output
+   index of A]; the other two indices are not used */
+static void cpyr0(dotens2_closure *k_,
+                  int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpyr_closure *self = (cpyr_closure *)k_;
+     R *dst = self->rb + indxb;
+
+     *dst = self->ra[ondxa];
+     UNUSED(indxa); UNUSED(ondxb);
+}
+
+/* copy real array RA (described by SZA) into RB (described by SZB) by
+   running cpyr0 over both tensors with bench_dotens2() */
+static void cpyr(R *ra, const bench_tensor *sza, 
+		 R *rb, const bench_tensor *szb)
+{
+     cpyr_closure copier;
+
+     copier.ra = ra;
+     copier.rb = rb;
+     copier.k.apply = cpyr0;
+     bench_dotens2(sza, szb, &copier.k);
+}
+
+/* copy unpacked halfcomplex A[n] into packed-complex B[n], using output stride
+   of A and input stride of B.  Only copies non-redundant half; other
+   half must be copied via mkhermitian. */
+typedef struct {
+     dotens2_closure k;  /* base closure; cast back by cpyhc20/icpyhc20 */
+     int n;              /* transform length, tensor_sz(sza) */
+     int as;             /* stride of A along the transformed dimension
+			    (0 when that tensor has rank 0 or is not
+			    finite) */
+     int scalea;         /* multiplier applied to all offsets into A */
+     R *ra, *ia;         /* real and imaginary parts of A */
+     R *rb, *ib;         /* real and imaginary parts of B */
+} cpyhc2_closure;
+
+/* per-vector callback for cpyhc2(): copy the n/2 + 1 non-redundant
+   complex values from A (offsets scaled by scalea) into B, where
+   consecutive complex values are 2 reals apart */
+static void cpyhc20(dotens2_closure *k_, 
+		    int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpyhc2_closure *clo = (cpyhc2_closure *)k_;
+     const int count = clo->n / 2 + 1;
+     const int step = clo->as * clo->scalea;
+     const int start = ondxa * clo->scalea;
+     R *src_re = clo->ra + start, *src_im = clo->ia + start;
+     R *dst_re = clo->rb + indxb, *dst_im = clo->ib + indxb;
+     int pos;
+     UNUSED(indxa); UNUSED(ondxb);
+
+     for (pos = 0; pos < count; ++pos) {
+	  dst_re[2*pos] = src_re[step*pos];
+	  dst_im[2*pos] = src_im[step*pos];
+     }
+}
+
+/* copy the halfcomplex array (RA, IA) into the packed complex array
+   (RB, IB): SZA describes the transformed dimension (rank at most 1,
+   its output stride is used), VECSZA and SZB describe the vectors to
+   loop over, and SCALEA multiplies every offset into A */
+static void cpyhc2(R *ra, R *ia,
+		   const bench_tensor *sza, const bench_tensor *vecsza,
+		   int scalea,
+		   R *rb, R *ib, const bench_tensor *szb)
+{
+     cpyhc2_closure clo;
+     int has_dim;
+
+     BENCH_ASSERT(sza->rnk <= 1);
+
+     /* a rank-0 or non-finite tensor has no dimension to stride over */
+     has_dim = BENCH_FINITE_RNK(sza->rnk) && sza->rnk != 0;
+
+     clo.k.apply = cpyhc20;
+     clo.n = tensor_sz(sza);
+     clo.scalea = scalea;
+     clo.as = has_dim ? sza->dims[0].os : 0;
+     clo.ra = ra;
+     clo.ia = ia;
+     clo.rb = rb;
+     clo.ib = ib;
+     bench_dotens2(vecsza, szb, &clo.k);
+}
+
+/* icpyhc2 is the inverse of cpyhc2 */
+
+/* per-vector callback for icpyhc2(): copy the n/2 + 1 non-redundant
+   complex values from packed B back into A, addressing A through its
+   input index and B through its output index */
+static void icpyhc20(dotens2_closure *k_, 
+		     int indxa, int ondxa, int indxb, int ondxb)
+{
+     cpyhc2_closure *clo = (cpyhc2_closure *)k_;
+     const int count = clo->n / 2 + 1;
+     const int step = clo->as * clo->scalea;
+     const int start = indxa * clo->scalea;
+     R *dst_re = clo->ra + start, *dst_im = clo->ia + start;
+     R *src_re = clo->rb + ondxb, *src_im = clo->ib + ondxb;
+     int pos;
+     UNUSED(ondxa); UNUSED(indxb);
+
+     for (pos = 0; pos < count; ++pos) {
+	  dst_re[step*pos] = src_re[2*pos];
+	  dst_im[step*pos] = src_im[2*pos];
+     }
+}
+
+/* copy the packed complex array (RB, IB) into the halfcomplex array
+   (RA, IA); same arguments as cpyhc2(), but the input stride of SZA
+   is used and the data flows from B to A */
+static void icpyhc2(R *ra, R *ia, 
+		    const bench_tensor *sza, const bench_tensor *vecsza,
+		    int scalea,
+		    R *rb, R *ib, const bench_tensor *szb)
+{
+     cpyhc2_closure clo;
+     int has_dim;
+
+     BENCH_ASSERT(sza->rnk <= 1);
+
+     /* a rank-0 or non-finite tensor has no dimension to stride over */
+     has_dim = BENCH_FINITE_RNK(sza->rnk) && sza->rnk != 0;
+
+     clo.k.apply = icpyhc20;
+     clo.n = tensor_sz(sza);
+     clo.scalea = scalea;
+     clo.as = has_dim ? sza->dims[0].is : 0;
+     clo.ra = ra;
+     clo.ia = ia;
+     clo.rb = rb;
+     clo.ib = ib;
+     bench_dotens2(vecsza, szb, &clo.k);
+}
+
+/* closure handed to the generic verification/accuracy routines for
+   rdft2 (PROBLEM_REAL) problems */
+typedef struct {
+     dofft_closure k;   /* base closure; rdft2_apply casts the base
+			   pointer back to this struct */
+     bench_problem *p;  /* the problem to run */
+} dofft_rdft2_closure;
+
+/*
+ * dofft_closure callback for rdft2 problems.  For p->sign < 0 (R2HC)
+ * the real parts of IN are copied into the problem's input buffer, the
+ * transform is run, and the halfcomplex result is unpacked into OUT and
+ * completed with mkhermitian().  Otherwise (HC2R) the non-redundant
+ * half of IN is packed into the problem's halfcomplex input, the
+ * transform is run, and the real result is copied into the real parts
+ * of OUT.  When k->k.recopy_input is set, the problem's input buffer is
+ * copied back into IN after the transform.
+ */
+static void rdft2_apply(dofft_closure *k_, 
+			bench_complex *in, bench_complex *out)
+{
+     dofft_rdft2_closure *k = (dofft_rdft2_closure *)k_;
+     bench_problem *p = k->p;
+     bench_tensor *totalsz, *pckdsz, *totalsz_swap, *pckdsz_swap;
+     bench_tensor *probsz2, *totalsz2, *pckdsz2;
+     bench_tensor *probsz2_swap, *totalsz2_swap, *pckdsz2_swap;
+     bench_real *ri, *ii, *ro, *io;
+     int n2, totalscale;
+
+     totalsz = tensor_append(p->vecsz, p->sz);
+     pckdsz = verify_pack(totalsz, 2);
+     /* n2: total element count with the last transform dimension
+	shortened from n to n/2 + 1 */
+     n2 = tensor_sz(totalsz);
+     if (BENCH_FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0)
+	  n2 = (n2 / p->sz->dims[p->sz->rnk - 1].n) * 
+	       (p->sz->dims[p->sz->rnk - 1].n / 2 + 1);
+     ri = (bench_real *) p->in;
+     ro = (bench_real *) p->out;
+
+     /* split off the last transform dimension (probsz2) from the
+	remaining dimensions (totalsz2, pckdsz2) */
+     if (BENCH_FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0 && n2 > 0) {
+	  probsz2 = tensor_copy_sub(p->sz, p->sz->rnk - 1, 1);
+	  totalsz2 = tensor_copy_sub(totalsz, 0, totalsz->rnk - 1);
+	  pckdsz2 = tensor_copy_sub(pckdsz, 0, pckdsz->rnk - 1);
+     }
+     else {
+	  probsz2 = mktensor(0);
+	  totalsz2 = tensor_copy(totalsz);
+	  pckdsz2 = tensor_copy(pckdsz);
+     }
+
+     /* copies with input and output strides exchanged, used when
+	copying the problem's input buffer back out */
+     totalsz_swap = tensor_copy_swapio(totalsz);
+     pckdsz_swap = tensor_copy_swapio(pckdsz);
+     totalsz2_swap = tensor_copy_swapio(totalsz2);
+     pckdsz2_swap = tensor_copy_swapio(pckdsz2);
+     probsz2_swap = tensor_copy_swapio(probsz2);
+
+     /* confusion: the stride is the distance between complex elements
+	when using interleaved format, but it is the distance between
+	real elements when using split format */
+     if (p->split) {
+	  /* imaginary parts default to n2 reals after the real parts */
+	  ii = p->ini ? (bench_real *) p->ini : ri + n2;
+	  io = p->outi ? (bench_real *) p->outi : ro + n2;
+	  totalscale = 1;
+     } else {
+	  /* imaginary parts default to the real right after each
+	     real part */
+	  ii = p->ini ? (bench_real *) p->ini : ri + 1;
+	  io = p->outi ? (bench_real *) p->outi : ro + 1;
+	  totalscale = 2;
+     }
+
+     if (p->sign < 0) { /* R2HC */
+	  int N, vN, i;
+	  cpyr(&c_re(in[0]), pckdsz, ri, totalsz);
+	  after_problem_rcopy_from(p, ri);
+	  doit(1, p);
+	  after_problem_hccopy_to(p, ro, io);
+	  if (k->k.recopy_input)
+	       cpyr(ri, totalsz_swap, &c_re(in[0]), pckdsz_swap);
+	  cpyhc2(ro, io, probsz2, totalsz2, totalscale,
+		 &c_re(out[0]), &c_im(out[0]), pckdsz2);
+	  /* only the non-redundant half was copied: complete each
+	     vector element */
+	  N = tensor_sz(p->sz);
+	  vN = tensor_sz(p->vecsz);
+	  for (i = 0; i < vN; ++i)
+	       mkhermitian(out + i*N, p->sz->rnk, p->sz->dims, 1);
+     }
+     else { /* HC2R */
+	  icpyhc2(ri, ii, probsz2, totalsz2, totalscale,
+		  &c_re(in[0]), &c_im(in[0]), pckdsz2);
+	  after_problem_hccopy_from(p, ri, ii);
+	  doit(1, p);
+	  after_problem_rcopy_to(p, ro);
+	  if (k->k.recopy_input)
+	       cpyhc2(ri, ii, probsz2_swap, totalsz2_swap, totalscale,
+		      &c_re(in[0]), &c_im(in[0]), pckdsz2_swap);
+	  /* mkreal runs before the copy, which then overwrites the
+	     real parts of OUT */
+	  mkreal(out, tensor_sz(pckdsz));
+	  cpyr(ro, totalsz, &c_re(out[0]), pckdsz);
+     }
+
+     tensor_destroy(totalsz);
+     tensor_destroy(pckdsz);
+     tensor_destroy(totalsz_swap);
+     tensor_destroy(pckdsz_swap);
+     tensor_destroy(probsz2);
+     tensor_destroy(totalsz2);
+     tensor_destroy(pckdsz2);
+     tensor_destroy(probsz2_swap);
+     tensor_destroy(totalsz2_swap);
+     tensor_destroy(pckdsz2_swap);
+}
+
+/*
+ * Verify an rdft2 (PROBLEM_REAL) problem P with the impulse,
+ * linearity and shift tests, storing the largest relative errors in
+ * e->i, e->l and e->s.  The shift test is a time shift for
+ * p->sign < 0 and a frequency shift otherwise.  ROUNDS == 0 selects
+ * the default of 20 rounds.  When the problem is neither in-place
+ * nor input-destroying, also checks that the input is preserved.
+ *
+ * Problems whose size or vector tensor does not have finite rank are
+ * not verified; all three errors are then reported as 0.
+ */
+void verify_rdft2(bench_problem *p, int rounds, double tol, errors *e)
+{
+     C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
+     int n, vecn, N;
+     dofft_rdft2_closure k;
+
+     BENCH_ASSERT(p->kind == PROBLEM_REAL);
+
+     if (!BENCH_FINITE_RNK(p->sz->rnk) || !BENCH_FINITE_RNK(p->vecsz->rnk)) {
+	  /* give up, but leave defined values for the caller to print */
+	  e->i = e->l = e->s = 0.0;
+	  return;
+     }
+
+     k.k.apply = rdft2_apply;
+     k.k.recopy_input = 0;
+     k.p = p;
+
+     if (rounds == 0)
+	  rounds = 20;  /* default value */
+
+     n = tensor_sz(p->sz);
+     vecn = tensor_sz(p->vecsz);
+     N = n * vecn;
+
+     /* seven work arrays of N complex numbers each */
+     inA = (C *) bench_malloc(N * sizeof(C));
+     inB = (C *) bench_malloc(N * sizeof(C));
+     inC = (C *) bench_malloc(N * sizeof(C));
+     outA = (C *) bench_malloc(N * sizeof(C));
+     outB = (C *) bench_malloc(N * sizeof(C));
+     outC = (C *) bench_malloc(N * sizeof(C));
+     tmp = (C *) bench_malloc(N * sizeof(C));
+
+     e->i = impulse(&k.k, n, vecn, inA, inB, inC, outA, outB, outC, 
+		    tmp, rounds, tol);
+     e->l = linear(&k.k, 1, N, inA, inB, inC, outA, outB, outC,
+		   tmp, rounds, tol);
+
+     e->s = 0.0;
+     if (p->sign < 0)
+	  e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
+				     inA, inB, outA, outB, 
+				     tmp, rounds, tol, TIME_SHIFT));
+     else
+	  e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
+				     inA, inB, outA, outB, 
+				     tmp, rounds, tol, FREQ_SHIFT));
+     
+     /* inputs are constrained to be real for sign < 0 and hermitian
+	otherwise */
+     if (!p->in_place && !p->destroy_input)
+	  preserves_input(&k.k, p->sign < 0 ? mkreal : mkhermitian1,
+			  N, inA, inB, outB, rounds);
+
+     bench_free(tmp);
+     bench_free(outC);
+     bench_free(outB);
+     bench_free(outA);
+     bench_free(inC);
+     bench_free(inB);
+     bench_free(inA);
+}
+
+/*
+ * Measure the accuracy of a rank-1, non-vector rdft2 problem P with
+ * accuracy_test(), filling T.  Inputs are constrained with mkreal for
+ * p->sign < 0 and with mkhermitian1 otherwise.
+ */
+void accuracy_rdft2(bench_problem *p, int rounds, int impulse_rounds,
+		    double t[6])
+{
+     dofft_rdft2_closure clo;
+     aconstrain constrain;
+     C *in, *out;
+     int len;
+
+     BENCH_ASSERT(p->kind == PROBLEM_REAL);
+     BENCH_ASSERT(p->sz->rnk == 1);
+     BENCH_ASSERT(p->vecsz->rnk == 0);
+
+     clo.k.apply = rdft2_apply;
+     clo.k.recopy_input = 0;
+     clo.p = p;
+
+     len = tensor_sz(p->sz);
+     constrain = p->sign < 0 ? mkreal : mkhermitian1;
+
+     in = (C *) bench_malloc(len * sizeof(C));
+     out = (C *) bench_malloc(len * sizeof(C));
+     accuracy_test(&clo.k, constrain, p->sign, 
+		   len, in, out, rounds, impulse_rounds, t);
+     bench_free(out);
+     bench_free(in);
+}
--- a/fftw-3.3.10/libbench2/verify.c
+++ b/fftw-3.3.10/libbench2/verify.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2000 Matteo Frigo
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "verify.h"
+
+/*
+ * Run the verification routine that matches p->kind and, when `verbose'
+ * is set, print one line: the problem string followed by the e.l, e.i
+ * and e.s fields of the error record the verifier was given.
+ *
+ *   p      : problem to verify; p->pstring may be null, in which case
+ *            "<unknown problem>" is printed instead
+ *   rounds : forwarded unchanged to the verifier
+ *   tol    : forwarded unchanged to the verifier
+ */
+void verify_problem(bench_problem *p, int rounds, double tol)
+{
+     errors e;
+     const char *pstring = p->pstring ? p->pstring : "<unknown problem>";
+
+     switch (p->kind) {
+	 case PROBLEM_COMPLEX: verify_dft(p, rounds, tol, &e); break;
+	 case PROBLEM_REAL: verify_rdft2(p, rounds, tol, &e); break;
+	 case PROBLEM_R2R: verify_r2r(p, rounds, tol, &e); break;
+	 default:
+	      /* No verifier ran, so `e' was never written.  Flag the
+		 unexpected kind and bail out instead of printing
+		 indeterminate values. */
+	      BENCH_ASSERT(0);
+	      return;
+     }
+
+     if (verbose)
+	  ovtpvt("%s %g %g %g\n", pstring, e.l, e.i, e.s);
+}
+
+/*
+ * Verify the problem described by the string `param'.
+ *
+ * Sequence: parse the description, allocate the problem, check can_do()
+ * (reporting and asserting when it says no), zero the arrays, call
+ * setup(), run verify_problem() with `rounds' and `tol', then call
+ * done() and destroy the problem.
+ */
+void verify(const char *param, int rounds, double tol)
+{
+     bench_problem *prob = problem_parse(param);
+
+     problem_alloc(prob);
+
+     if (!can_do(prob)) {
+	  /* report which problem was rejected before asserting */
+	  ovtpvt_err("No can_do for %s\n", prob->pstring);
+	  BENCH_ASSERT(0);
+     }
+
+     /* prepare, verify, tear down */
+     problem_zero(prob);
+     setup(prob);
+     verify_problem(prob, rounds, tol);
+     done(prob);
+
+     problem_destroy(prob);
+}
+
+
+/*
+ * Run the accuracy routine that matches p->kind and print the six
+ * values it leaves in t[] on a single line.
+ *
+ *   p              : problem to measure
+ *   rounds         : forwarded unchanged to the accuracy routine
+ *   impulse_rounds : forwarded unchanged to the accuracy routine
+ */
+static void do_accuracy(bench_problem *p, int rounds, int impulse_rounds)
+{
+     double t[6];
+
+     switch (p->kind) {
+	 case PROBLEM_COMPLEX:
+	      accuracy_dft(p, rounds, impulse_rounds, t); break;
+	 case PROBLEM_REAL:
+	      accuracy_rdft2(p, rounds, impulse_rounds, t); break;
+	 case PROBLEM_R2R:
+	      accuracy_r2r(p, rounds, impulse_rounds, t); break;
+	 default:
+	      /* No accuracy routine ran, so t[] was never written.  Flag
+		 the unexpected kind and bail out instead of printing
+		 indeterminate values. */
+	      BENCH_ASSERT(0);
+	      return;
+     }
+
+     /* t[0] : L1 error
+	t[1] : L2 error
+	t[2] : Linf error
+	t[3..5]: L1, L2, Linf backward error */
+     ovtpvt("%6.2e %6.2e %6.2e %6.2e %6.2e %6.2e\n", 
+	    t[0], t[1], t[2], t[3], t[4], t[5]);
+}
+
+/*
+ * Measure accuracy for the problem described by the string `param'.
+ *
+ * Sequence: parse the description, assert that can_do() accepts it,
+ * allocate and zero the problem, call setup(), run do_accuracy() with
+ * `rounds' and `impulse_rounds', then call done() and destroy the
+ * problem.
+ */
+void accuracy(const char *param, int rounds, int impulse_rounds)
+{
+     bench_problem *p = problem_parse(param);
+
+     /* the support check happens before anything is allocated */
+     BENCH_ASSERT(can_do(p));
+
+     /* prepare */
+     problem_alloc(p);
+     problem_zero(p);
+     setup(p);
+
+     /* measure and print */
+     do_accuracy(p, rounds, impulse_rounds);
+
+     /* tear down */
+     done(p);
+     problem_destroy(p);
+}
--- a/fftw-3.3.10/libbench2/verify.h
+++ b/fftw-3.3.10/libbench2/verify.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef LIBBENCH2_VERIFY_H
+#define LIBBENCH2_VERIFY_H
+
+/* Include guard: this header defines typedefs, struct types and an
+   anonymous enum, none of which may appear twice in one translation
+   unit. */
+
+#include "libbench2/bench.h"
+
+/* short aliases for the benchmark's real and complex element types */
+typedef bench_real R;
+typedef bench_complex C;
+
+/*
+ * Closure describing the transform under test.
+ *   apply        : callback receiving the closure itself plus an input
+ *                  and an output array
+ *   recopy_input : integer flag read by the test routines
+ *                  NOTE(review): exact semantics are not visible in this
+ *                  header -- confirm against the routines that read it
+ */
+typedef struct dofft_closure_s {
+     void (*apply)(struct dofft_closure_s *k,
+		   bench_complex *in, bench_complex *out);
+     int recopy_input;
+} dofft_closure;
+
+double dmax(double x, double y);
+
+/* signature shared by mkreal() and mkhermitian1(); values of this type
+   are passed to preserves_input() and accuracy_test() */
+typedef void (*aconstrain)(C *a, int n);
+
+/* helpers operating on arrays of complex values */
+void arand(C *a, int n);
+void mkreal(C *A, int n);
+void mkhermitian(C *A, int rank, const bench_iodim *dim, int stride);
+void mkhermitian1(C *a, int n);
+void aadd(C *c, C *a, C *b, int n);
+void asub(C *c, C *a, C *b, int n);
+void arol(C *b, C *a, int n, int nb, int na);
+void aphase_shift(C *b, C *a, int n, int nb, int na, double sign);
+void ascale(C *a, C alpha, int n);
+double acmp(C *a, C *b, int n, const char *test, double tol);
+double mydrand(void);
+
+/* individual verification tests; each takes the closure, caller-supplied
+   work arrays, a round count and (where present) a tolerance */
+double impulse(dofft_closure *k,
+	       int n, int vecn,
+	       C *inA, C *inB, C *inC,
+	       C *outA, C *outB, C *outC,
+	       C *tmp, int rounds, double tol);
+double linear(dofft_closure *k, int realp,
+	      int n, C *inA, C *inB, C *inC, C *outA,
+	      C *outB, C *outC, C *tmp, int rounds, double tol);
+void preserves_input(dofft_closure *k, aconstrain constrain,
+                     int n, C *inA, C *inB, C *outB, int rounds);
+
+/* values for the `which_shift' argument of tf_shift() */
+enum { TIME_SHIFT, FREQ_SHIFT };
+double tf_shift(dofft_closure *k, int realp, const bench_tensor *sz,
+		int n, int vecn, double sign,
+		C *inA, C *inB, C *outA, C *outB, C *tmp,
+		int rounds, double tol, int which_shift);
+
+/*
+ * Closure for bench_dotens2(): `apply' receives the closure and four
+ * integer indices (two input-side, two output-side, going by the
+ * parameter names).
+ */
+typedef struct dotens2_closure_s {
+     void (*apply)(struct dotens2_closure_s *k,
+		   int indx0, int ondx0, int indx1, int ondx1);
+} dotens2_closure;
+
+void bench_dotens2(const bench_tensor *sz0,
+		   const bench_tensor *sz1, dotens2_closure *k);
+
+/* accuracy measurement: generic driver followed by the per-kind entry
+   points; each receives a six-element array `t' */
+void accuracy_test(dofft_closure *k, aconstrain constrain,
+		   int sign, int n, C *a, C *b, int rounds, int impulse_rounds,
+		   double t[6]);
+
+void accuracy_dft(bench_problem *p, int rounds, int impulse_rounds,
+		  double t[6]);
+void accuracy_rdft2(bench_problem *p, int rounds, int impulse_rounds,
+		    double t[6]);
+void accuracy_r2r(bench_problem *p, int rounds, int impulse_rounds,
+		  double t[6]);
+
+/*
+ * Trigonometric type and function selection:
+ *   BENCHFFT_LDOUBLE with HAVE_COSL       -> long double, cosl/sinl/tanl
+ *   BENCHFFT_QUAD with HAVE_LIBQUADMATH   -> __float128, cosq/sinq/tanq
+ *   otherwise                             -> double, cos/sin/tan
+ * KTRIG(x) pastes the literal suffix matching the chosen type onto x.
+ */
+#if defined(BENCHFFT_LDOUBLE) && HAVE_COSL
+   typedef long double trigreal;
+#  define COS cosl
+#  define SIN sinl
+#  define TAN tanl
+#  define KTRIG(x) (x##L)
+#elif defined(BENCHFFT_QUAD) && HAVE_LIBQUADMATH
+   typedef __float128 trigreal;
+#  define COS cosq
+#  define SIN sinq
+#  define TAN tanq
+#  define KTRIG(x) (x##Q)
+extern trigreal cosq(trigreal);
+extern trigreal sinq(trigreal);
+extern trigreal tanq(trigreal);
+#else
+   typedef double trigreal;
+#  define COS cos
+#  define SIN sin
+#  define TAN tan
+#  define KTRIG(x) (x)
+#endif
+/* 2*pi written out as a literal of the selected trigreal precision */
+#define K2PI KTRIG(6.2831853071795864769252867665590057683943388)
+
+#endif /* LIBBENCH2_VERIFY_H */
--- a/fftw-3.3.10/libbench2/zero.c
+++ b/fftw-3.3.10/libbench2/zero.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2001 Matteo Frigo
+ * Copyright (c) 2001 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "libbench2/bench.h"
+
+/*
+ * Default routine that sets a problem's I/O arrays to zero.
+ *
+ * The element type used for each array depends on the problem:
+ *   PROBLEM_COMPLEX        : input and output both as complex
+ *   PROBLEM_R2R            : input and output both as real
+ *   PROBLEM_REAL, sign < 0 : input as real, output as complex
+ *   PROBLEM_REAL, sign > 0 : input as complex, output as real
+ * Any other combination falls through to the assertion at the end.
+ */
+void problem_zero(bench_problem *p)
+{
+     bench_complex czero = {0, 0};
+
+     switch (p->kind) {
+	 case PROBLEM_COMPLEX:
+	      caset((bench_complex *) p->inphys, p->iphyssz, czero);
+	      caset((bench_complex *) p->outphys, p->ophyssz, czero);
+	      return;
+
+	 case PROBLEM_R2R:
+	      aset((bench_real *) p->inphys, p->iphyssz, 0.0);
+	      aset((bench_real *) p->outphys, p->ophyssz, 0.0);
+	      return;
+
+	 case PROBLEM_REAL:
+	      if (p->sign < 0) {
+		   aset((bench_real *) p->inphys, p->iphyssz, 0.0);
+		   caset((bench_complex *) p->outphys, p->ophyssz, czero);
+		   return;
+	      }
+	      if (p->sign > 0) {
+		   caset((bench_complex *) p->inphys, p->iphyssz, czero);
+		   aset((bench_real *) p->outphys, p->ophyssz, 0.0);
+		   return;
+	      }
+	      break; /* sign == 0: not handled */
+
+	 default:
+	      break; /* unrecognised kind: not handled */
+     }
+
+     BENCH_ASSERT(0); /* TODO */
+}