Updates

2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions
--- a/fftw-3.3.10/mpi/Makefile.am
+++ b/fftw-3.3.10/mpi/Makefile.am
@@ -0,0 +1,101 @@
+# -I $(top_srcdir)/api is necessary because fftw3-mpi.h includes
+# "fftw3.h", and we cannot change the latter to "api/fftw3.h" because
+# fftw3-mpi.h is installed in /usr/include.
+AM_CPPFLAGS = -I $(top_srcdir) -I $(top_srcdir)/api
+
+if MPI
+lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@_mpi.la
+include_HEADERS = fftw3-mpi.h
+nodist_include_HEADERS = fftw3-mpi.f03 fftw3l-mpi.f03
+noinst_PROGRAMS = mpi-bench
+endif
+
+CC=@MPICC@
+
+EXTRA_DIST = testsched.c f03api.sh f03-wrap.sh genf03-wrap.pl fftw3-mpi.f03.in fftw3l-mpi.f03.in
+BUILT_SOURCES = fftw3-mpi.f03.in fftw3-mpi.f03 fftw3l-mpi.f03.in fftw3l-mpi.f03 f03-wrap.c
+CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
+
+TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-problem.c transpose-solve.c mpi-transpose.h
+DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
+RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
+RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h
+SRC = any-true.c api.c block.c choose-radix.c conf.c dtensor.c fftw3-mpi.h ifftw-mpi.h rearrange.c wisdom-api.c f03-wrap.c
+
+libfftw3@PREC_SUFFIX@_mpi_la_SOURCES = $(SRC) $(TRANSPOSE_SRC) $(DFT_SRC) $(RDFT_SRC) $(RDFT2_SRC)
+
+libfftw3@PREC_SUFFIX@_mpi_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
+libfftw3@PREC_SUFFIX@_mpi_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la @MPILIBS@
+
+if THREADS
+mpi_bench_CFLAGS = $(PTHREAD_CFLAGS)
+if !COMBINED_THREADS
+LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
+endif
+else
+if OPENMP
+mpi_bench_CFLAGS = $(OPENMP_CFLAGS)
+LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
+endif
+endif
+
+mpi_bench_SOURCES = mpi-bench.c $(top_srcdir)/tests/fftw-bench.c $(top_srcdir)/tests/hook.c
+mpi_bench_LDADD = libfftw3@PREC_SUFFIX@_mpi.la $(LIBFFTWTHREADS) $(top_builddir)/libfftw3@PREC_SUFFIX@.la $(top_builddir)/libbench2/libbench2.a $(MPILIBS) $(THREADLIBS)
+
+CHECK = $(top_srcdir)/tests/check.pl
+NUMCHECK=10
+CHECKSIZE=10000
+CHECKOPTS = --verbose --random --maxsize=$(CHECKSIZE) -c=$(NUMCHECK) $(CHECK_PL_OPTS)
+
+if MPI
+
+check-local: mpi-bench$(EXEEXT)
+	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 1 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "     MPI FFTW transforms passed "$(NUMCHECK)" tests, 1 CPU"
+	@echo "--------------------------------------------------------------"
+	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 2 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 2 CPUs"
+	@echo "--------------------------------------------------------------"
+	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 3 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 3 CPUs"
+	@echo "--------------------------------------------------------------"
+	perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 4 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "      MPI FFTW transforms passed "$(NUMCHECK)" tests, 4 CPUs"
+	@echo "--------------------------------------------------------------"
+if SMP
+	perl -w $(CHECK) $(CHECKOPTS) --mpi --nthreads=2 "$(MPIRUN) -np 3 `pwd`/mpi-bench"
+	@echo "--------------------------------------------------------------"
+	@echo "      MPI FFTW threaded transforms passed "$(NUMCHECK)" tests!"
+	@echo "--------------------------------------------------------------"
+endif
+
+bigcheck: mpi-bench$(EXEEXT)
+	$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=100 CHECKSIZE=60000 check-local
+
+smallcheck: mpi-bench$(EXEEXT)
+	$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=2 check-local
+
+endif
+
+fftw3-mpi.f03: fftw3-mpi.f03.in Makefile
+	sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3-mpi.f03.in > $@
+
+fftw3l-mpi.f03: fftw3l-mpi.f03.in Makefile
+	sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3l-mpi.f03.in > $@
+
+if MAINTAINER_MODE
+
+fftw3-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
+	sh $(srcdir)/f03api.sh d f > $@
+
+fftw3l-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
+	sh $(srcdir)/f03api.sh l | grep -v parameter | sed 's/fftw3.f03/fftw3l.f03/' > $@
+
+f03-wrap.c: fftw3-mpi.h f03-wrap.sh genf03-wrap.pl
+	sh $(srcdir)/f03-wrap.sh > $@
+
+endif
--- a/fftw-3.3.10/mpi/Makefile.in
+++ b/fftw-3.3.10/mpi/Makefile.in
--- a/fftw-3.3.10/mpi/any-true.c
+++ b/fftw-3.3.10/mpi/any-true.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* During planning, if any process fails to create a plan then
+   all of the processes must fail.  This synchronization is implemented
+   by the following routine.
+
+   Instead of 
+        if (failure) goto nada;
+   we instead do:
+        if (any_true(failure, comm)) goto nada;
+*/
+
+int XM(any_true)(int condition, MPI_Comm comm)
+{
+     int result;
+     MPI_Allreduce(&condition, &result, 1, MPI_INT, MPI_LOR, comm);
+     return result;
+}
+
+/***********************************************************************/
+
+#if defined(FFTW_DEBUG)
+/* for debugging, we include an assertion to make sure that
+   MPI problems all produce equal hashes, as checked by this routine: */
+
+int XM(md5_equal)(md5 m, MPI_Comm comm)
+{
+     unsigned long s0[4];
+     int i, eq_me, eq_all;
+
+     X(md5end)(&m);
+     for (i = 0; i < 4; ++i) s0[i] = m.s[i];
+     MPI_Bcast(s0, 4, MPI_UNSIGNED_LONG, 0, comm);
+     for (i = 0; i < 4 && s0[i] == m.s[i]; ++i) ;
+     eq_me = i == 4;
+     MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
+     return eq_all;
+}
+#endif
--- a/fftw-3.3.10/mpi/api.c
+++ b/fftw-3.3.10/mpi/api.c
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "api/api.h"
+#include "fftw3-mpi.h"
+#include "ifftw-mpi.h"
+#include "mpi-transpose.h"
+#include "mpi-dft.h"
+#include "mpi-rdft.h"
+#include "mpi-rdft2.h"
+
+/* Convert API flags to internal MPI flags. */
+#define MPI_FLAGS(f) ((f) >> 27)
+
+/*************************************************************************/
+
+static int mpi_inited = 0;
+
+static MPI_Comm problem_comm(const problem *p) {
+     switch (p->adt->problem_kind) {
+	 case PROBLEM_MPI_DFT:
+	      return ((const problem_mpi_dft *) p)->comm;
+	 case PROBLEM_MPI_RDFT:
+	      return ((const problem_mpi_rdft *) p)->comm;
+	 case PROBLEM_MPI_RDFT2:
+	      return ((const problem_mpi_rdft2 *) p)->comm;
+	 case PROBLEM_MPI_TRANSPOSE:
+	      return ((const problem_mpi_transpose *) p)->comm;
+	 default:
+	      return MPI_COMM_NULL;
+     }
+}
+
+/* used to synchronize cost measurements (timing or estimation)
+   across all processes for an MPI problem, which is critical to
+   ensure that all processes decide to use the same MPI plans
+   (whereas serial plans need not be syncronized). */
+static double cost_hook(const problem *p, double t, cost_kind k)
+{
+     MPI_Comm comm = problem_comm(p);
+     double tsum;
+     if (comm == MPI_COMM_NULL) return t;
+     MPI_Allreduce(&t, &tsum, 1, MPI_DOUBLE, 
+		   k == COST_SUM ? MPI_SUM : MPI_MAX, comm);
+     return tsum;
+}
+
+/* Used to reject wisdom that is not in sync across all processes
+   for an MPI problem, which is critical to ensure that all processes
+   decide to use the same MPI plans.  (Even though costs are synchronized,
+   above, out-of-sync wisdom may result from plans being produced
+   by communicators that do not span all processes, either from a
+   user-specified communicator or e.g. from transpose-recurse. */
+static int wisdom_ok_hook(const problem *p, flags_t flags)
+{
+     MPI_Comm comm = problem_comm(p);
+     int eq_me, eq_all;
+     /* unpack flags bitfield, since MPI communications may involve
+	byte-order changes and MPI cannot do this for bit fields */
+#if SIZEOF_UNSIGNED_INT >= 4 /* must be big enough to hold 20-bit fields */
+     unsigned int f[5];
+#else
+     unsigned long f[5]; /* at least 32 bits as per C standard */
+#endif
+
+     if (comm == MPI_COMM_NULL) return 1; /* non-MPI wisdom is always ok */
+
+     if (XM(any_true)(0, comm)) return 0; /* some process had nowisdom_hook */
+
+     /* otherwise, check that the flags and solver index are identical
+	on all processes in this problem's communicator.
+
+	TO DO: possibly we can relax strict equality, but it is
+	critical to ensure that any flags which affect what plan is
+	created (and whether the solver is applicable) are the same,
+	e.g. DESTROY_INPUT, NO_UGLY, etcetera.  (If the MPI algorithm
+	differs between processes, deadlocks/crashes generally result.) */
+     f[0] = flags.l;
+     f[1] = flags.hash_info;
+     f[2] = flags.timelimit_impatience;
+     f[3] = flags.u;
+     f[4] = flags.slvndx;
+     MPI_Bcast(f, 5, 
+	       SIZEOF_UNSIGNED_INT >= 4 ? MPI_UNSIGNED : MPI_UNSIGNED_LONG,
+	       0, comm);
+     eq_me = f[0] == flags.l && f[1] == flags.hash_info
+	  && f[2] == flags.timelimit_impatience
+	  && f[3] == flags.u && f[4] == flags.slvndx;
+     MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
+     return eq_all;
+}
+
+/* This hook is called when wisdom is not found.  The any_true here
+   matches up with the any_true in wisdom_ok_hook, in order to handle
+   the case where some processes had wisdom (and called wisdom_ok_hook)
+   and some processes didn't have wisdom (and called nowisdom_hook). */
+static void nowisdom_hook(const problem *p)
+{
+     MPI_Comm comm = problem_comm(p);
+     if (comm == MPI_COMM_NULL) return; /* nothing to do for non-MPI p */
+     XM(any_true)(1, comm); /* signal nowisdom to any wisdom_ok_hook */
+}
+
+/* needed to synchronize planner bogosity flag, in case non-MPI problems
+   on a subset of processes encountered bogus wisdom */
+static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p)
+{
+     MPI_Comm comm = problem_comm(p);
+     if (comm != MPI_COMM_NULL /* an MPI problem */
+	 && XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */
+	  return WISDOM_IS_BOGUS;
+     return state;
+}
+
+void XM(init)(void)
+{
+     if (!mpi_inited) {
+	  planner *plnr = X(the_planner)();
+	  plnr->cost_hook = cost_hook;
+	  plnr->wisdom_ok_hook = wisdom_ok_hook;
+	  plnr->nowisdom_hook = nowisdom_hook;
+	  plnr->bogosity_hook = bogosity_hook;
+          XM(conf_standard)(plnr);
+	  mpi_inited = 1;	  
+     }
+}
+
+void XM(cleanup)(void)
+{
+     X(cleanup)();
+     mpi_inited = 0;
+}
+
+/*************************************************************************/
+
+static dtensor *mkdtensor_api(int rnk, const XM(ddim) *dims0)
+{
+     dtensor *x = XM(mkdtensor)(rnk);
+     int i;
+     for (i = 0; i < rnk; ++i) {
+	  x->dims[i].n = dims0[i].n;
+	  x->dims[i].b[IB] = dims0[i].ib;
+	  x->dims[i].b[OB] = dims0[i].ob;
+     }
+     return x;
+}
+
+static dtensor *default_sz(int rnk, const XM(ddim) *dims0, int n_pes,
+			   int rdft2)
+{
+     dtensor *sz = XM(mkdtensor)(rnk);
+     dtensor *sz0 = mkdtensor_api(rnk, dims0);
+     block_kind k;
+     int i;
+
+     for (i = 0; i < rnk; ++i)
+	  sz->dims[i].n = dims0[i].n;
+
+     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
+
+     for (i = 0; i < rnk; ++i) {
+	  sz->dims[i].b[IB] = dims0[i].ib ? dims0[i].ib : sz->dims[i].n;
+	  sz->dims[i].b[OB] = dims0[i].ob ? dims0[i].ob : sz->dims[i].n;
+     }
+
+     /* If we haven't used all of the processes yet, and some of the
+	block sizes weren't specified (i.e. 0), then set the
+	unspecified blocks so as to use as many processes as
+	possible with as few distributed dimensions as possible. */
+     FORALL_BLOCK_KIND(k) {
+	  INT nb = XM(num_blocks_total)(sz, k);
+	  INT np = n_pes / nb;
+	  for (i = 0; i < rnk && np > 1; ++i)
+	       if (!sz0->dims[i].b[k]) {
+		    sz->dims[i].b[k] = XM(default_block)(sz->dims[i].n, np);
+		    nb *= XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]);
+		    np = n_pes / nb;
+	       }
+     }
+
+     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n;
+
+     /* punt for 1d prime */
+     if (rnk == 1 && X(is_prime)(sz->dims[0].n))
+	  sz->dims[0].b[IB] = sz->dims[0].b[OB] = sz->dims[0].n;
+
+     XM(dtensor_destroy)(sz0);
+     sz0 = XM(dtensor_canonical)(sz, 0);
+     XM(dtensor_destroy)(sz);
+     return sz0;
+}
+
+/* allocate simple local (serial) dims array corresponding to n[rnk] */
+static XM(ddim) *simple_dims(int rnk, const ptrdiff_t *n)
+{
+     XM(ddim) *dims = (XM(ddim) *) MALLOC(sizeof(XM(ddim)) * rnk,
+						TENSORS);
+     int i;
+     for (i = 0; i < rnk; ++i)
+	  dims[i].n = dims[i].ib = dims[i].ob = n[i];
+     return dims;
+}
+
+/*************************************************************************/
+
+static void local_size(int my_pe, const dtensor *sz, block_kind k,
+		       ptrdiff_t *local_n, ptrdiff_t *local_start)
+{
+     int i;
+     if (my_pe >= XM(num_blocks_total)(sz, k))
+	  for (i = 0; i < sz->rnk; ++i)
+	       local_n[i] = local_start[i] = 0;
+     else {
+	  XM(block_coords)(sz, k, my_pe, local_start);
+	  for (i = 0; i < sz->rnk; ++i) {
+	       local_n[i] = XM(block)(sz->dims[i].n, sz->dims[i].b[k],
+				      local_start[i]);
+	       local_start[i] *= sz->dims[i].b[k];
+	  }
+     }
+}
+
+static INT prod(int rnk, const ptrdiff_t *local_n) 
+{
+     int i;
+     INT N = 1;
+     for (i = 0; i < rnk; ++i) N *= local_n[i];
+     return N;
+}
+
+ptrdiff_t XM(local_size_guru)(int rnk, const XM(ddim) *dims0,
+			      ptrdiff_t howmany, MPI_Comm comm,
+			      ptrdiff_t *local_n_in,
+			      ptrdiff_t *local_start_in,
+			      ptrdiff_t *local_n_out, 
+			      ptrdiff_t *local_start_out,
+			      int sign, unsigned flags)
+{
+     INT N;
+     int my_pe, n_pes, i;
+     dtensor *sz;
+
+     if (rnk == 0)
+	  return howmany;
+
+     MPI_Comm_rank(comm, &my_pe);
+     MPI_Comm_size(comm, &n_pes);
+     sz = default_sz(rnk, dims0, n_pes, 0);
+
+     /* Now, we must figure out how much local space the user should
+	allocate (or at least an upper bound).  This depends strongly
+	on the exact algorithms we employ...ugh!  FIXME: get this info
+	from the solvers somehow? */
+     N = 1; /* never return zero allocation size */
+     if (rnk > 1 && XM(is_block1d)(sz, IB) && XM(is_block1d)(sz, OB)) {
+	  INT Nafter;
+	  ddim odims[2];
+
+	  /* dft-rank-geq2-transposed */
+	  odims[0] = sz->dims[0]; odims[1] = sz->dims[1]; /* save */
+	  /* we may need extra space for transposed intermediate data */
+	  for (i = 0; i < 2; ++i)
+	       if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[IB]) == 1 &&
+		   XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[OB]) == 1) {
+		    sz->dims[i].b[IB]
+			 = XM(default_block)(sz->dims[i].n, n_pes);
+		    sz->dims[1-i].b[IB] = sz->dims[1-i].n;
+		    local_size(my_pe, sz, IB, local_n_in, local_start_in);
+		    N = X(imax)(N, prod(rnk, local_n_in));
+		    sz->dims[i] = odims[i];
+		    sz->dims[1-i] = odims[1-i];
+		    break;
+	       }
+
+	  /* dft-rank-geq2 */
+	  Nafter = howmany;
+	  for (i = 1; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
+	  N = X(imax)(N, (sz->dims[0].n
+			  * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
+				      my_pe) + howmany - 1) / howmany);
+
+	  /* dft-rank-geq2 with dimensions swapped */
+	  Nafter = howmany * sz->dims[0].n;
+          for (i = 2; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
+          N = X(imax)(N, (sz->dims[1].n
+                          * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
+                                      my_pe) + howmany - 1) / howmany);
+     }
+     else if (rnk == 1) {
+	  if (howmany >= n_pes && !MPI_FLAGS(flags)) { /* dft-rank1-bigvec */
+	       ptrdiff_t n[2], start[2];
+	       dtensor *sz2 = XM(mkdtensor)(2);
+	       sz2->dims[0] = sz->dims[0];
+	       sz2->dims[0].b[IB] = sz->dims[0].n;
+	       sz2->dims[1].n = sz2->dims[1].b[OB] = howmany;
+	       sz2->dims[1].b[IB] = XM(default_block)(howmany, n_pes);
+	       local_size(my_pe, sz2, IB, n, start);
+	       XM(dtensor_destroy)(sz2);
+	       N = X(imax)(N, (prod(2, n) + howmany - 1) / howmany);
+	  }
+	  else { /* dft-rank1 */
+	       INT r, m, rblock[2], mblock[2];
+
+	       /* Since the 1d transforms are so different, we require
+		  the user to call local_size_1d for this case.  Ugh. */
+	       CK(sign == FFTW_FORWARD || sign == FFTW_BACKWARD);
+
+	       if ((r = XM(choose_radix)(sz->dims[0], n_pes, flags, sign,
+					 rblock, mblock))) {
+		    m = sz->dims[0].n / r;
+		    if (flags & FFTW_MPI_SCRAMBLED_IN)
+			 sz->dims[0].b[IB] = rblock[IB] * m;
+		    else { /* !SCRAMBLED_IN */
+			 sz->dims[0].b[IB] = r * mblock[IB];
+			 N = X(imax)(N, rblock[IB] * m);
+		    }
+		    if (flags & FFTW_MPI_SCRAMBLED_OUT)
+			 sz->dims[0].b[OB] = r * mblock[OB];
+		    else { /* !SCRAMBLED_OUT */
+			 N = X(imax)(N, r * mblock[OB]);
+			 sz->dims[0].b[OB] = rblock[OB] * m;
+		    }
+	       }
+	  }
+     }
+
+     local_size(my_pe, sz, IB, local_n_in, local_start_in);
+     local_size(my_pe, sz, OB, local_n_out, local_start_out);
+
+     /* at least, make sure we have enough space to store input & output */
+     N = X(imax)(N, X(imax)(prod(rnk, local_n_in), prod(rnk, local_n_out)));
+
+     XM(dtensor_destroy)(sz);
+     return N * howmany;
+}
+
+ptrdiff_t XM(local_size_many_transposed)(int rnk, const ptrdiff_t *n,
+					 ptrdiff_t howmany,
+					 ptrdiff_t xblock, ptrdiff_t yblock,
+					 MPI_Comm comm,
+					 ptrdiff_t *local_nx,
+					 ptrdiff_t *local_x_start,
+					 ptrdiff_t *local_ny,
+					 ptrdiff_t *local_y_start)
+{
+     ptrdiff_t N;
+     XM(ddim) *dims; 
+     ptrdiff_t *local;
+
+     if (rnk == 0) {
+	  *local_nx = *local_ny = 1;
+	  *local_x_start = *local_y_start = 0;
+	  return howmany;
+     }
+
+     dims = simple_dims(rnk, n);
+     local = (ptrdiff_t *) MALLOC(sizeof(ptrdiff_t) * rnk * 4, TENSORS);
+
+     /* default 1d block distribution, with transposed output
+        if yblock < n[1] */
+     dims[0].ib = xblock;
+     if (rnk > 1) {
+	  if (yblock < n[1])
+	       dims[1].ob = yblock;
+	  else
+	       dims[0].ob = xblock;
+     }
+     else
+	  dims[0].ob = xblock; /* FIXME: 1d not really supported here 
+				         since we don't have flags/sign */
+     
+     N = XM(local_size_guru)(rnk, dims, howmany, comm, 
+			     local, local + rnk,
+			     local + 2*rnk, local + 3*rnk,
+			     0, 0);
+     *local_nx = local[0];
+     *local_x_start = local[rnk];
+     if (rnk > 1) {
+	  *local_ny = local[2*rnk + 1];
+	  *local_y_start = local[3*rnk + 1];
+     }
+     else {
+	  *local_ny = *local_nx;
+	  *local_y_start = *local_x_start;
+     }
+     X(ifree)(local);
+     X(ifree)(dims);
+     return N;
+}
+
+ptrdiff_t XM(local_size_many)(int rnk, const ptrdiff_t *n,
+			      ptrdiff_t howmany, 
+			      ptrdiff_t xblock,
+			      MPI_Comm comm,
+			      ptrdiff_t *local_nx,
+			      ptrdiff_t *local_x_start)
+{
+     ptrdiff_t local_ny, local_y_start;
+     return XM(local_size_many_transposed)(rnk, n, howmany,
+					   xblock, rnk > 1 
+					   ? n[1] : FFTW_MPI_DEFAULT_BLOCK,
+					   comm,
+					   local_nx, local_x_start,
+					   &local_ny, &local_y_start);
+}
+
+
+ptrdiff_t XM(local_size_transposed)(int rnk, const ptrdiff_t *n,
+				    MPI_Comm comm,
+				    ptrdiff_t *local_nx,
+				    ptrdiff_t *local_x_start,
+				    ptrdiff_t *local_ny,
+				    ptrdiff_t *local_y_start)
+{
+     return XM(local_size_many_transposed)(rnk, n, 1,
+					   FFTW_MPI_DEFAULT_BLOCK,
+					   FFTW_MPI_DEFAULT_BLOCK,
+					   comm,
+					   local_nx, local_x_start,
+					   local_ny, local_y_start);
+}
+
+ptrdiff_t XM(local_size)(int rnk, const ptrdiff_t *n,
+			 MPI_Comm comm,
+			 ptrdiff_t *local_nx,
+			 ptrdiff_t *local_x_start)
+{
+     return XM(local_size_many)(rnk, n, 1, FFTW_MPI_DEFAULT_BLOCK, comm,
+				local_nx, local_x_start);
+}
+
+ptrdiff_t XM(local_size_many_1d)(ptrdiff_t nx, ptrdiff_t howmany, 
+				 MPI_Comm comm, int sign, unsigned flags,
+				 ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
+				 ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
+{
+     XM(ddim) d;
+     d.n = nx;
+     d.ib = d.ob = FFTW_MPI_DEFAULT_BLOCK;
+     return XM(local_size_guru)(1, &d, howmany, comm,
+				local_nx, local_x_start,
+				local_ny, local_y_start, sign, flags);
+}
+
+ptrdiff_t XM(local_size_1d)(ptrdiff_t nx,
+			    MPI_Comm comm, int sign, unsigned flags,
+			    ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
+			    ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
+{
+     return XM(local_size_many_1d)(nx, 1, comm, sign, flags,
+				   local_nx, local_x_start,
+				   local_ny, local_y_start);
+}
+
+ptrdiff_t XM(local_size_2d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
+				       MPI_Comm comm,
+				       ptrdiff_t *local_nx,
+				       ptrdiff_t *local_x_start,
+				       ptrdiff_t *local_ny, 
+				       ptrdiff_t *local_y_start)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(local_size_transposed)(2, n, comm,
+				      local_nx, local_x_start,
+				      local_ny, local_y_start);
+}
+
+ptrdiff_t XM(local_size_2d)(ptrdiff_t nx, ptrdiff_t ny, MPI_Comm comm,
+			       ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(local_size)(2, n, comm, local_nx, local_x_start);
+}
+
+ptrdiff_t XM(local_size_3d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
+				       ptrdiff_t nz,
+				       MPI_Comm comm,
+				       ptrdiff_t *local_nx,
+				       ptrdiff_t *local_x_start,
+				       ptrdiff_t *local_ny, 
+				       ptrdiff_t *local_y_start)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(local_size_transposed)(3, n, comm,
+				      local_nx, local_x_start,
+				      local_ny, local_y_start);
+}
+
+ptrdiff_t XM(local_size_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			    MPI_Comm comm,
+			    ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(local_size)(3, n, comm, local_nx, local_x_start);
+}
+
+/*************************************************************************/
+/* Transpose API */
+
+X(plan) XM(plan_many_transpose)(ptrdiff_t nx, ptrdiff_t ny, 
+				ptrdiff_t howmany,
+				ptrdiff_t xblock, ptrdiff_t yblock,
+				R *in, R *out, 
+				MPI_Comm comm, unsigned flags)
+{
+     int n_pes;
+     XM(init)();
+
+     if (howmany < 0 || xblock < 0 || yblock < 0 ||
+	 nx <= 0 || ny <= 0) return 0;
+
+     MPI_Comm_size(comm, &n_pes);
+     if (!xblock) xblock = XM(default_block)(nx, n_pes);
+     if (!yblock) yblock = XM(default_block)(ny, n_pes);
+     if (n_pes < XM(num_blocks)(nx, xblock)
+	 || n_pes < XM(num_blocks)(ny, yblock))
+	  return 0;
+
+     return 
+	  X(mkapiplan)(FFTW_FORWARD, flags,
+		       XM(mkproblem_transpose)(nx, ny, howmany,
+					       in, out, xblock, yblock,
+					       comm, MPI_FLAGS(flags)));
+}
+
+X(plan) XM(plan_transpose)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out, 
+			   MPI_Comm comm, unsigned flags)
+			      
+{
+     return XM(plan_many_transpose)(nx, ny, 1,
+				    FFTW_MPI_DEFAULT_BLOCK,
+				    FFTW_MPI_DEFAULT_BLOCK,
+				    in, out, comm, flags);
+}
+
+/*************************************************************************/
+/* Complex DFT API */
+
+X(plan) XM(plan_guru_dft)(int rnk, const XM(ddim) *dims0,
+			  ptrdiff_t howmany,
+			  C *in, C *out,
+			  MPI_Comm comm, int sign, unsigned flags)
+{
+     int n_pes, i;
+     dtensor *sz;
+     
+     XM(init)();
+
+     if (howmany < 0 || rnk < 1) return 0;
+     for (i = 0; i < rnk; ++i)
+	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
+	       return 0;
+
+     MPI_Comm_size(comm, &n_pes);
+     sz = default_sz(rnk, dims0, n_pes, 0);
+
+     if (XM(num_blocks_total)(sz, IB) > n_pes
+	 || XM(num_blocks_total)(sz, OB) > n_pes) {
+	  XM(dtensor_destroy)(sz);
+	  return 0;
+     }
+
+     return
+          X(mkapiplan)(sign, flags,
+                       XM(mkproblem_dft_d)(sz, howmany,
+					   (R *) in, (R *) out,
+					   comm, sign, 
+					   MPI_FLAGS(flags)));
+}
+
+X(plan) XM(plan_many_dft)(int rnk, const ptrdiff_t *n,
+			  ptrdiff_t howmany,
+			  ptrdiff_t iblock, ptrdiff_t oblock,
+			  C *in, C *out,
+			  MPI_Comm comm, int sign, unsigned flags)
+{
+     XM(ddim) *dims = simple_dims(rnk, n);
+     X(plan) pln;
+
+     if (rnk == 1) {
+	  dims[0].ib = iblock;
+	  dims[0].ob = oblock;
+     }
+     else if (rnk > 1) {
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
+     }
+
+     pln = XM(plan_guru_dft)(rnk,dims,howmany, in,out, comm, sign, flags);
+     X(ifree)(dims);
+     return pln;
+}
+
+X(plan) XM(plan_dft)(int rnk, const ptrdiff_t *n, C *in, C *out,
+		     MPI_Comm comm, int sign, unsigned flags)
+{
+     return XM(plan_many_dft)(rnk, n, 1,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      in, out, comm, sign, flags);
+}
+
+X(plan) XM(plan_dft_1d)(ptrdiff_t nx, C *in, C *out,
+			MPI_Comm comm, int sign, unsigned flags)
+{
+     return XM(plan_dft)(1, &nx, in, out, comm, sign, flags);
+}
+
+X(plan) XM(plan_dft_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, C *out,
+			MPI_Comm comm, int sign, unsigned flags)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(plan_dft)(2, n, in, out, comm, sign, flags);
+}
+
+X(plan) XM(plan_dft_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			C *in, C *out,
+			MPI_Comm comm, int sign, unsigned flags)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(plan_dft)(3, n, in, out, comm, sign, flags);
+}
+
+/*************************************************************************/
+/* R2R API */
+
+X(plan) XM(plan_guru_r2r)(int rnk, const XM(ddim) *dims0,
+			  ptrdiff_t howmany,
+			  R *in, R *out,
+			  MPI_Comm comm, const X(r2r_kind) *kind,
+			  unsigned flags)
+{
+     int n_pes, i;
+     dtensor *sz;
+     rdft_kind *k;
+     X(plan) pln;
+     
+     XM(init)();
+
+     if (howmany < 0 || rnk < 1) return 0;
+     for (i = 0; i < rnk; ++i)
+	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
+	       return 0;
+
+     k = X(map_r2r_kind)(rnk, kind);
+
+     MPI_Comm_size(comm, &n_pes);
+     sz = default_sz(rnk, dims0, n_pes, 0);
+
+     if (XM(num_blocks_total)(sz, IB) > n_pes
+	 || XM(num_blocks_total)(sz, OB) > n_pes) {
+	  XM(dtensor_destroy)(sz);
+	  return 0;
+     }
+
+     pln = X(mkapiplan)(0, flags,
+			XM(mkproblem_rdft_d)(sz, howmany,
+					     in, out,
+					     comm, k, MPI_FLAGS(flags)));
+     X(ifree0)(k);
+     return pln;
+}
+
+X(plan) XM(plan_many_r2r)(int rnk, const ptrdiff_t *n,
+			  ptrdiff_t howmany,
+			  ptrdiff_t iblock, ptrdiff_t oblock,
+			  R *in, R *out,
+			  MPI_Comm comm, const X(r2r_kind) *kind,
+			  unsigned flags)
+{
+     XM(ddim) *dims = simple_dims(rnk, n);
+     X(plan) pln;
+
+     if (rnk == 1) {
+	  dims[0].ib = iblock;
+	  dims[0].ob = oblock;
+     }
+     else if (rnk > 1) {
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
+     }
+
+     pln = XM(plan_guru_r2r)(rnk,dims,howmany, in,out, comm, kind, flags);
+     X(ifree)(dims);
+     return pln;
+}
+
+X(plan) XM(plan_r2r)(int rnk, const ptrdiff_t *n, R *in, R *out,
+		     MPI_Comm comm, 
+		     const X(r2r_kind) *kind,
+		     unsigned flags)
+{
+     return XM(plan_many_r2r)(rnk, n, 1,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      in, out, comm, kind, flags);
+}
+
+X(plan) XM(plan_r2r_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
+			MPI_Comm comm,
+			X(r2r_kind) kindx, X(r2r_kind) kindy,
+			unsigned flags)
+{
+     ptrdiff_t n[2];
+     X(r2r_kind) kind[2];
+     n[0] = nx; n[1] = ny;
+     kind[0] = kindx; kind[1] = kindy;
+     return XM(plan_r2r)(2, n, in, out, comm, kind, flags);
+}
+
+X(plan) XM(plan_r2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			R *in, R *out,
+			MPI_Comm comm, 
+			X(r2r_kind) kindx, X(r2r_kind) kindy,
+			X(r2r_kind) kindz,
+			unsigned flags)
+{
+     ptrdiff_t n[3];
+     X(r2r_kind) kind[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     kind[0] = kindx; kind[1] = kindy; kind[2] = kindz;
+     return XM(plan_r2r)(3, n, in, out, comm, kind, flags);
+}
+
+/*************************************************************************/
+/* R2C/C2R API */
+
+static X(plan) plan_guru_rdft2(int rnk, const XM(ddim) *dims0,
+			       ptrdiff_t howmany,
+			       R *r, C *c,
+			       MPI_Comm comm, rdft_kind kind, unsigned flags)
+{
+     int n_pes, i;
+     dtensor *sz;
+     R *cr = (R *) c;
+     
+     XM(init)();
+
+     if (howmany < 0 || rnk < 2) return 0;
+     for (i = 0; i < rnk; ++i)
+	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
+	       return 0;
+
+     MPI_Comm_size(comm, &n_pes);
+     sz = default_sz(rnk, dims0, n_pes, 1);
+
+     sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
+     if (XM(num_blocks_total)(sz, IB) > n_pes
+	 || XM(num_blocks_total)(sz, OB) > n_pes) {
+	  XM(dtensor_destroy)(sz);
+	  return 0;
+     }
+     sz->dims[rnk-1].n = dims0[rnk-1].n;
+
+     if (kind == R2HC)
+	  return X(mkapiplan)(0, flags,
+			      XM(mkproblem_rdft2_d)(sz, howmany,
+						    r, cr, comm, R2HC, 
+						    MPI_FLAGS(flags)));
+     else
+	  return X(mkapiplan)(0, flags,
+			      XM(mkproblem_rdft2_d)(sz, howmany,
+						    cr, r, comm, HC2R, 
+						    MPI_FLAGS(flags)));
+}
+
+X(plan) XM(plan_many_dft_r2c)(int rnk, const ptrdiff_t *n,
+			  ptrdiff_t howmany,
+			  ptrdiff_t iblock, ptrdiff_t oblock,
+			  R *in, C *out,
+			  MPI_Comm comm, unsigned flags)
+{
+     XM(ddim) *dims = simple_dims(rnk, n);
+     X(plan) pln;
+
+     if (rnk == 1) {
+	  dims[0].ib = iblock;
+	  dims[0].ob = oblock;
+     }
+     else if (rnk > 1) {
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
+     }
+
+     pln = plan_guru_rdft2(rnk,dims,howmany, in,out, comm, R2HC, flags);
+     X(ifree)(dims);
+     return pln;
+}
+
+X(plan) XM(plan_many_dft_c2r)(int rnk, const ptrdiff_t *n,
+			  ptrdiff_t howmany,
+			  ptrdiff_t iblock, ptrdiff_t oblock,
+			  C *in, R *out,
+			  MPI_Comm comm, unsigned flags)
+{
+     XM(ddim) *dims = simple_dims(rnk, n);
+     X(plan) pln;
+
+     if (rnk == 1) {
+	  dims[0].ib = iblock;
+	  dims[0].ob = oblock;
+     }
+     else if (rnk > 1) {
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
+	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
+     }
+
+     pln = plan_guru_rdft2(rnk,dims,howmany, out,in, comm, HC2R, flags);
+     X(ifree)(dims);
+     return pln;
+}
+
+X(plan) XM(plan_dft_r2c)(int rnk, const ptrdiff_t *n, R *in, C *out,
+		     MPI_Comm comm, unsigned flags)
+{
+     return XM(plan_many_dft_r2c)(rnk, n, 1,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_r2c_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, C *out,
+			MPI_Comm comm, unsigned flags)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(plan_dft_r2c)(2, n, in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_r2c_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			R *in, C *out, MPI_Comm comm, unsigned flags)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(plan_dft_r2c)(3, n, in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_c2r)(int rnk, const ptrdiff_t *n, C *in, R *out,
+		     MPI_Comm comm, unsigned flags)
+{
+     return XM(plan_many_dft_c2r)(rnk, n, 1,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      FFTW_MPI_DEFAULT_BLOCK,
+			      in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_c2r_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, R *out,
+			MPI_Comm comm, unsigned flags)
+{
+     ptrdiff_t n[2];
+     n[0] = nx; n[1] = ny;
+     return XM(plan_dft_c2r)(2, n, in, out, comm, flags);
+}
+
+X(plan) XM(plan_dft_c2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
+			C *in, R *out, MPI_Comm comm, unsigned flags)
+{
+     ptrdiff_t n[3];
+     n[0] = nx; n[1] = ny; n[2] = nz;
+     return XM(plan_dft_c2r)(3, n, in, out, comm, flags);
+}
+
+/*************************************************************************/
+/* New-array execute functions */
+
+void XM(execute_dft)(const X(plan) p, C *in, C *out) {
+     /* internally, MPI plans are just rdft plans */
+     X(execute_r2r)(p, (R*) in, (R*) out);
+}
+
+void XM(execute_dft_r2c)(const X(plan) p, R *in, C *out) {
+     /* internally, MPI plans are just rdft plans */
+     X(execute_r2r)(p, in, (R*) out);
+}
+
+void XM(execute_dft_c2r)(const X(plan) p, C *in, R *out) {
+     /* internally, MPI plans are just rdft plans */
+     X(execute_r2r)(p, (R*) in, out);
+}
+
+void XM(execute_r2r)(const X(plan) p, R *in, R *out) {
+     /* internally, MPI plans are just rdft plans */
+     X(execute_r2r)(p, in, out);
+}
--- a/fftw-3.3.10/mpi/block.c
+++ b/fftw-3.3.10/mpi/block.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+INT XM(num_blocks)(INT n, INT block)
+{
+     return (n + block - 1) / block;
+}
+
+int XM(num_blocks_ok)(INT n, INT block, MPI_Comm comm)
+{
+     int n_pes;
+     MPI_Comm_size(comm, &n_pes);
+     return n_pes >= XM(num_blocks)(n, block);
+}
+
+/* Pick a default block size for dividing a problem of size n among
+   n_pes processes.  Divide as equally as possible, while minimizing
+   the maximum block size among the processes as well as the number of
+   processes with nonzero blocks. */
+INT XM(default_block)(INT n, int n_pes)
+{
+     return ((n + n_pes - 1) / n_pes);
+}
+
+/* For a given block size and dimension n, compute the block size 
+   on the given process. */
+INT XM(block)(INT n, INT block, int which_block)
+{
+     INT d = n - which_block * block;
+     return d <= 0 ? 0 : (d > block ? block : d);
+}
+
+static INT num_blocks_kind(const ddim *dim, block_kind k)
+{
+     return XM(num_blocks)(dim->n, dim->b[k]);
+}
+
+INT XM(num_blocks_total)(const dtensor *sz, block_kind k)
+{
+     if (FINITE_RNK(sz->rnk)) {
+	  int i;
+	  INT ntot = 1;
+	  for (i = 0; i < sz->rnk; ++i)
+	       ntot *= num_blocks_kind(sz->dims + i, k);
+	  return ntot;
+     }
+     else
+	  return 0;
+}
+
+int XM(idle_process)(const dtensor *sz, block_kind k, int which_pe)
+{
+     return (which_pe >= XM(num_blocks_total)(sz, k));
+}
+
+/* Given a non-idle process which_pe, computes the coordinate
+   vector coords[rnk] giving the coordinates of a block in the
+   matrix of blocks.  k specifies whether we are talking about
+   the input or output data distribution. */
+void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe, 
+		     INT *coords)
+{
+     int i;
+     A(!XM(idle_process)(sz, k, which_pe) && FINITE_RNK(sz->rnk));
+     for (i = sz->rnk - 1; i >= 0; --i) {
+	  INT nb = num_blocks_kind(sz->dims + i, k);
+	  coords[i] = which_pe % nb;
+	  which_pe /= nb;
+     }
+}
+
+INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe)
+{
+     if (XM(idle_process)(sz, k, which_pe))
+	  return 0;
+     else {
+	  int i;
+	  INT N = 1, *coords;
+	  STACK_MALLOC(INT*, coords, sizeof(INT) * sz->rnk);
+	  XM(block_coords)(sz, k, which_pe, coords);
+	  for (i = 0; i < sz->rnk; ++i)
+	       N *= XM(block)(sz->dims[i].n, sz->dims[i].b[k], coords[i]);
+	  STACK_FREE(coords);
+	  return N;
+     }
+}
+
+/* returns whether sz is local for dims >= dim */
+int XM(is_local_after)(int dim, const dtensor *sz, block_kind k)
+{
+     if (FINITE_RNK(sz->rnk))
+	  for (; dim < sz->rnk; ++dim)
+	       if (XM(num_blocks)(sz->dims[dim].n, sz->dims[dim].b[k]) > 1)
+		    return 0;
+     return 1;
+}
+
+int XM(is_local)(const dtensor *sz, block_kind k)
+{
+     return XM(is_local_after)(0, sz, k);
+}
+
+/* Return whether sz is distributed for k according to a simple
+   1d block distribution in the first or second dimensions */
+int XM(is_block1d)(const dtensor *sz, block_kind k)
+{
+     int i;
+     if (!FINITE_RNK(sz->rnk)) return 0;
+     for (i = 0; i < sz->rnk && num_blocks_kind(sz->dims + i, k) == 1; ++i) ;
+     return(i < sz->rnk && i < 2 && XM(is_local_after)(i + 1, sz, k));
+
+}
--- a/fftw-3.3.10/mpi/choose-radix.c
+++ b/fftw-3.3.10/mpi/choose-radix.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* Return the radix r for a 1d MPI transform of a distributed dimension d,
+   with the given flags and transform size.   That is, decomposes d.n
+   as r * m, Cooley-Tukey style.  Also computes the block sizes rblock
+   and mblock.  Returns 0 if such a decomposition is not feasible.
+   This is unfortunately somewhat complicated.
+
+   A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c):
+
+   d.n is initially distributed as an m x r array with block size mblock[IB].
+   Then it is internally transposed to an r x m array with block size
+   rblock[IB].  Then it is internally transposed to m x r again with block
+   size mblock[OB].  Finally, it is transposed to r x m with block size
+   rblock[IB].
+
+   If flags & SCRAMBLED_IN, then the first transpose is skipped (the array
+   starts out as r x m).  If flags & SCRAMBLED_OUT, then the last transpose
+   is skipped (the array ends up as m x r).  To make sure the forward
+   and backward transforms use the same "scrambling" format, we swap r
+   and m when sign != FFT_SIGN.
+
+   There are some downsides to this, especially in the case where
+   either m or r is not divisible by n_pes.  For one thing, it means
+   that in general we can't use the same block size for the input and
+   output.  For another thing, it means that we can't in general honor
+   a user's "requested" block sizes in d.b[].  Therefore, for simplicity,
+   we simply ignore d.b[] for now.
+*/
+INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
+		     INT rblock[2], INT mblock[2])
+{
+     INT r, m;
+
+     UNUSED(flags); /* we would need this if we paid attention to d.b[*] */
+
+     /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes.
+        This not only ensures that the input (the m dimension) is
+        equally distributed if possible, and at the r dimension is
+        maximally equally distributed (if d.n/n_pes >= n_pes), it also
+        makes one of the local transpositions in the algorithm
+        trivial. */
+     if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/
+	 && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */)
+	  r = d.n / n_pes;
+     else {  /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */
+	  for (r = X(isqrt)(d.n); d.n % r != 0; ++r)
+	       ;
+     }
+     if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */
+
+     if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */
+	  m = r;
+	  r = d.n / m;
+     }
+     else
+	  m = d.n / r;
+
+     rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes);
+     mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes);
+
+     return r;
+}
--- a/fftw-3.3.10/mpi/conf.c
+++ b/fftw-3.3.10/mpi/conf.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+
+#include "mpi-transpose.h"
+#include "mpi-dft.h"
+#include "mpi-rdft.h"
+#include "mpi-rdft2.h"
+
+static const solvtab s =
+{
+     SOLVTAB(XM(transpose_pairwise_register)),
+     SOLVTAB(XM(transpose_alltoall_register)),
+     SOLVTAB(XM(transpose_recurse_register)),
+     SOLVTAB(XM(dft_rank_geq2_register)),
+     SOLVTAB(XM(dft_rank_geq2_transposed_register)),
+     SOLVTAB(XM(dft_serial_register)),
+     SOLVTAB(XM(dft_rank1_bigvec_register)),
+     SOLVTAB(XM(dft_rank1_register)),
+     SOLVTAB(XM(rdft_rank_geq2_register)),
+     SOLVTAB(XM(rdft_rank_geq2_transposed_register)),
+     SOLVTAB(XM(rdft_serial_register)),
+     SOLVTAB(XM(rdft_rank1_bigvec_register)),
+     SOLVTAB(XM(rdft2_rank_geq2_register)),
+     SOLVTAB(XM(rdft2_rank_geq2_transposed_register)),
+     SOLVTAB(XM(rdft2_serial_register)),
+     SOLVTAB_END
+};
+
+void XM(conf_standard)(planner *p)
+{
+     X(solvtab_exec)(s, p);
+}
--- a/fftw-3.3.10/mpi/dft-problem.c
+++ b/fftw-3.3.10/mpi/dft-problem.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-dft.h"
+
+static void destroy(problem *ego_)
+{
+     problem_mpi_dft *ego = (problem_mpi_dft *) ego_;
+     XM(dtensor_destroy)(ego->sz);
+     MPI_Comm_free(&ego->comm);
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     int i;
+     X(md5puts)(m, "mpi-dft");
+     X(md5int)(m, p->I == p->O);
+     /* don't include alignment -- may differ between processes
+	X(md5int)(m, X(ialignment_of)(p->I));
+	X(md5int)(m, X(ialignment_of)(p->O));
+	... note that applicability of MPI plans does not depend
+	    on alignment (although optimality may, in principle). */
+     XM(dtensor_md5)(m, p->sz);
+     X(md5INT)(m, p->vn);
+     X(md5int)(m, p->sign);
+     X(md5int)(m, p->flags);
+     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
+     A(XM(md5_equal)(*m, p->comm));
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
+     int i;
+     p->print(p, "(mpi-dft %d %d %d ", 
+	      ego->I == ego->O,
+	      X(ialignment_of)(ego->I),
+	      X(ialignment_of)(ego->O));
+     XM(dtensor_print)(ego->sz, p);
+     p->print(p, " %D %d %d", ego->vn, ego->sign, ego->flags);
+     MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
+     R *I = ego->I;
+     INT i, N;
+     int my_pe;
+
+     MPI_Comm_rank(ego->comm, &my_pe);
+     N = 2 * ego->vn * XM(total_block)(ego->sz, IB, my_pe);
+     for (i = 0; i < N; ++i) I[i] = K(0.0);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_MPI_DFT,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
+			   R *I, R *O,
+			   MPI_Comm comm,
+			   int sign,
+			   unsigned flags)
+{
+     problem_mpi_dft *ego =
+          (problem_mpi_dft *)X(mkproblem)(sizeof(problem_mpi_dft), &padt);
+     int n_pes;
+
+     A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
+     MPI_Comm_size(comm, &n_pes);
+     A(n_pes >= XM(num_blocks_total)(sz, IB)
+       && n_pes >= XM(num_blocks_total)(sz, OB));
+     A(vn >= 0);
+     A(sign == -1 || sign == 1);
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     ego->sz = XM(dtensor_canonical)(sz, 1);
+     ego->vn = vn;
+     ego->I = I;
+     ego->O = O;
+     ego->sign = sign;
+
+     /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
+        swapping the first two dimensions (for rnk > 1) */
+     if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
+	  ddim dim0 = ego->sz->dims[0];
+	  ego->sz->dims[0] = ego->sz->dims[1];
+	  ego->sz->dims[1] = dim0;
+	  flags &= ~TRANSPOSED_IN;
+	  flags ^= TRANSPOSED_OUT;
+     }
+     ego->flags = flags;
+
+     MPI_Comm_dup(comm, &ego->comm);
+
+     return &(ego->super);
+}
+
+problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
+			     R *I, R *O,
+			     MPI_Comm comm,
+			     int sign,
+			     unsigned flags)
+{
+     problem *p = XM(mkproblem_dft)(sz, vn, I, O, comm, sign, flags);
+     XM(dtensor_destroy)(sz);
+     return p;
+}
--- a/fftw-3.3.10/mpi/dft-rank-geq2-transposed.c
+++ b/fftw-3.3.10/mpi/dft-rank-geq2-transposed.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex DFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is transposed both
+   in data distribution and in ordering (for the first 2 dimensions).
+
+   (Note that we don't have to handle the case where the input is
+   transposed, since this is equivalent to transposed output with the
+   first two dimensions swapped, and is automatically canonicalized as
+   such by dft-problem.c. */
+
+#include "mpi-dft.h"
+#include "mpi-transpose.h"
+#include "dft/dft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_dft super;
+
+     plan *cld1, *cldt, *cld2;
+     INT roff, ioff;
+     int preserve_input;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld1, *cld2;
+     plan_rdft *cldt;
+     INT roff = ego->roff, ioff = ego->ioff;
+     
+     /* DFT local dimensions */
+     cld1 = (plan_dft *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I+roff, I+ioff, O+roff, O+ioff);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I+roff, I+ioff, I+roff, I+ioff);
+
+     /* global transpose */
+     cldt = (plan_rdft *) ego->cldt;
+     cldt->apply(ego->cldt, I, O);
+
+     /* DFT final local dimension */
+     cld2 = (plan_dft *) ego->cld2;
+     cld2->apply(ego->cld2, O+roff, O+ioff, O+roff, O+ioff);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == TRANSPOSED_OUT
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(2, p->sz, OB)
+	     && XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1
+	     && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
+		 || !XM(dft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cldt, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cldt);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-dft-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cldt, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_dft *p;
+     P *pln;
+     plan *cld1 = 0, *cldt = 0, *cld2 = 0;
+     R *ri, *ii, *ro, *io, *I, *O;
+     tensor *sz;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_dft *) p_;
+
+     X(extract_reim)(p->sign, I = p->I, &ri, &ii);
+     X(extract_reim)(p->sign, O = p->O, &ro, &io);
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+	  I = O; 
+     else { 
+	  ro = ri;
+	  io = ii;
+     }
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n;
+     sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
+     {
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_dft_d)(sz,
+                                                X(mktensor_2d)(b, is, is,
+                                                               p->vn, 2, 2),
+                                                ri, ii, ro, io));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     nrest *= p->vn;
+     cldt = X(mkplan_d)(plnr,
+			XM(mkproblem_transpose)(
+			     p->sz->dims[0].n, p->sz->dims[1].n, nrest * 2,
+			     I, O,
+			     p->sz->dims[0].b[IB], p->sz->dims[1].b[OB], 
+			     p->comm, 0));
+     if (XM(any_true)(!cldt, p->comm)) goto nada;
+
+     X(extract_reim)(p->sign, O, &ro, &io);
+     {
+	  INT is = p->sz->dims[0].n * nrest * 2;
+	  INT b = XM(block)(p->sz->dims[1].n, p->sz->dims[1].b[OB], my_pe);
+	  cld2 = X(mkplan_d)(plnr,
+			     X(mkproblem_dft_d)(X(mktensor_1d)(
+						     p->sz->dims[0].n,
+						     nrest * 2, nrest * 2),
+						X(mktensor_2d)(b, is, is,
+							       nrest, 2, 2),
+						ro, io, ro, io));
+	  if (XM(any_true)(!cld2, p->comm)) goto nada;
+     }
+
+     pln = MKPLAN_MPI_DFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cldt = cldt;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->roff = ri - p->I;
+     pln->ioff = ii - p->I;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cldt);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(dft_rank_geq2_transposed_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
--- a/fftw-3.3.10/mpi/dft-rank-geq2.c
+++ b/fftw-3.3.10/mpi/dft-rank-geq2.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex DFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is not transposed. */
+
+#include "mpi-dft.h"
+#include "dft/dft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_dft super;
+
+     plan *cld1, *cld2;
+     INT roff, ioff;
+     int preserve_input;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld1;
+     plan_rdft *cld2;
+     INT roff = ego->roff, ioff = ego->ioff;
+     
+     /* DFT local dimensions */
+     cld1 = (plan_dft *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I+roff, I+ioff, O+roff, O+ioff);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I+roff, I+ioff, I+roff, I+ioff);
+
+     /* DFT non-local dimension (via dft-rank1-bigvec, usually): */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(1, p->sz, OB)
+	     && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
+		 || !XM(dft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-dft-rank-geq2%s%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_dft *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0;
+     R *ri, *ii, *ro, *io, *I, *O;
+     tensor *sz;
+     dtensor *sz2;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_dft *) p_;
+
+     X(extract_reim)(p->sign, I = p->I, &ri, &ii);
+     X(extract_reim)(p->sign, O = p->O, &ro, &io);
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+	  I = O; 
+     else { 
+	  ro = ri;
+	  io = ii;
+     }
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n;
+     sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = X(tensor_sz)(sz);
+     {
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_dft_d)(sz,
+                                                X(mktensor_2d)(b, is, is,
+                                                               p->vn, 2, 2),
+                                                ri, ii, ro, io));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
+     sz2->dims[0] = p->sz->dims[0];
+     cld2 = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz2, nrest * p->vn,
+						  I, O, p->comm, p->sign, 
+						  RANK1_BIGVEC_ONLY));
+     if (XM(any_true)(!cld2, p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_DFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->roff = ri - p->I;
+     pln->ioff = ii - p->I;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(dft_rank_geq2_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
--- a/fftw-3.3.10/mpi/dft-rank1-bigvec.c
+++ b/fftw-3.3.10/mpi/dft-rank1-bigvec.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex DFTs of rank == 1 when the vector length vn is >= # processes.
+   In this case, we don't need to use a six-step type algorithm, and can
+   instead transpose the DFT dimension with the vector dimension to 
+   make the DFT local. */
+
+#include "mpi-dft.h"
+#include "mpi-transpose.h"
+#include "dft/dft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+     rearrangement rearrange;
+} S;
+
+typedef struct {
+     plan_mpi_dft super;
+
+     plan *cldt_before, *cld, *cldt_after;
+     INT roff, ioff;
+     int preserve_input;
+     rearrangement rearrange;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     plan_rdft *cldt_before, *cldt_after;
+     INT roff = ego->roff, ioff = ego->ioff;
+     
+     /* global transpose */
+     cldt_before = (plan_rdft *) ego->cldt_before;
+     cldt_before->apply(ego->cldt_before, I, O);
+     
+     if (ego->preserve_input) I = O;
+	  
+     /* 1d DFT(s) */
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, O+roff, O+ioff, I+roff, I+ioff);
+     
+     /* global transpose */
+     cldt_after = (plan_rdft *) ego->cldt_after;
+     cldt_after->apply(ego->cldt_after, I, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     int n_pes;
+     MPI_Comm_size(p->comm, &n_pes);
+     return (1
+	     && p->sz->rnk == 1
+	     && !(p->flags & ~RANK1_BIGVEC_ONLY)
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && (p->vn >= n_pes /* TODO: relax this, using more memory? */
+		 || (p->flags & RANK1_BIGVEC_ONLY))
+
+	     && XM(rearrange_applicable)(ego->rearrange,
+					 p->sz->dims[0], p->vn, n_pes)
+
+	     && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
+                 || !XM(dft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldt_before, wakefulness);
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldt_after, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldt_after);
+     X(plan_destroy_internal)(ego->cld);
+     X(plan_destroy_internal)(ego->cldt_before);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const char descrip[][16] = { "contig", "discontig", "square-after",
+				  "square-middle", "square-before" };
+     p->print(p, "(mpi-dft-rank1-bigvec/%s%s %(%p%) %(%p%) %(%p%))",
+	      descrip[ego->rearrange], ego->preserve_input==2 ?"/p":"",
+	      ego->cldt_before, ego->cld, ego->cldt_after);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_dft *p;
+     P *pln;
+     plan *cld = 0, *cldt_before = 0, *cldt_after = 0;
+     R *ri, *ii, *ro, *io, *I, *O;
+     INT yblock, yb, nx, ny, vn;
+     int my_pe, n_pes;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_dft *) p_;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+     
+     nx = p->sz->dims[0].n;
+     if (!(ny = XM(rearrange_ny)(ego->rearrange, p->sz->dims[0],p->vn,n_pes)))
+	  return (plan *) 0;
+     vn = p->vn / ny;
+     A(ny * vn == p->vn);
+
+     yblock = XM(default_block)(ny, n_pes);
+     cldt_before = X(mkplan_d)(plnr,
+			       XM(mkproblem_transpose)(
+				    nx, ny, vn*2,
+				    I = p->I, O = p->O,
+				    p->sz->dims[0].b[IB], yblock,
+				    p->comm, 0));
+     if (XM(any_true)(!cldt_before, p->comm)) goto nada;	  
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) { I = O; }
+     
+     X(extract_reim)(p->sign, I, &ri, &ii);
+     X(extract_reim)(p->sign, O, &ro, &io);
+
+     yb = XM(block)(ny, yblock, my_pe);
+     cld = X(mkplan_d)(plnr,
+		       X(mkproblem_dft_d)(X(mktensor_1d)(nx, vn*2, vn*2),
+					  X(mktensor_2d)(yb, vn*2*nx, vn*2*nx,
+							 vn, 2, 2),
+					  ro, io, ri, ii));
+     if (XM(any_true)(!cld, p->comm)) goto nada;	  
+     
+     cldt_after = X(mkplan_d)(plnr,
+			      XM(mkproblem_transpose)(
+				   ny, nx, vn*2,
+				   I, O,
+				   yblock, p->sz->dims[0].b[OB], 
+				   p->comm, 0));
+     if (XM(any_true)(!cldt_after, p->comm)) goto nada;	  
+
+     pln = MKPLAN_MPI_DFT(P, &padt, apply);
+
+     pln->cldt_before = cldt_before;
+     pln->cld = cld;
+     pln->cldt_after = cldt_after;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->roff = ro - p->O;
+     pln->ioff = io - p->O;
+     pln->rearrange = ego->rearrange;
+
+     X(ops_add)(&cldt_before->ops, &cld->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt_after->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldt_after);
+     X(plan_destroy_internal)(cld);
+     X(plan_destroy_internal)(cldt_before);
+     return (plan *) 0;
+}
+
+static solver *mksolver(rearrangement rearrange, int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->rearrange = rearrange;
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(dft_rank1_bigvec_register)(planner *p)
+{
+     rearrangement rearrange;
+     int preserve_input;
+     FORALL_REARRANGE(rearrange)
+	  for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	       REGISTER_SOLVER(p, mksolver(rearrange, preserve_input));
+}
--- a/fftw-3.3.10/mpi/dft-rank1.c
+++ b/fftw-3.3.10/mpi/dft-rank1.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex DFTs of rank == 1 via six-step algorithm. */
+
+#include "mpi-dft.h"
+#include "mpi-transpose.h"
+#include "dft/dft.h"
+
+typedef struct {
+     solver super;
+     rdftapply apply; /* apply_ddft_first or apply_ddft_last */
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_dft super;
+
+     triggen *t;
+     plan *cldt, *cld_ddft, *cld_dft;
+     INT roff, ioff;
+     int preserve_input;
+     INT vn, xmin, xmax, xs, m, r;
+} P;
+
+static void do_twiddle(triggen *t, INT ir, INT m, INT vn, R *xr, R *xi)
+{
+     void (*rotate)(triggen *, INT, R, R, R *) = t->rotate;
+     INT im, iv;
+     for (im = 0; im < m; ++im)
+	  for (iv = 0; iv < vn; ++iv) {
+	       /* TODO: modify/inline rotate function
+		  so that it can do whole vn vector at once? */
+	       R c[2];
+	       rotate(t, ir * im, *xr, *xi, c);
+	       *xr = c[0]; *xi = c[1];
+	       xr += 2; xi += 2;
+	  }
+}
+
+/* radix-r DFT of size r*m.  This is equivalent to an m x r 2d DFT,
+   plus twiddle factors between the size-m and size-r 1d DFTs, where
+   the m dimension is initially distributed.  The output is transposed
+   to r x m where the r dimension is distributed. 
+
+   This algorithm follows the general sequence:
+        global transpose (m x r -> r x m)
+        DFTs of size m
+	multiply by twiddles + global transpose (r x m -> m x r)
+	DFTs of size r
+	global transpose (m x r -> r x m)
+   where the multiplication by twiddles can come before or after
+   the middle transpose.  The first/last transposes are omitted
+   for SCRAMBLED_IN/OUT formats, respectively.
+
+   However, we wish to exploit our dft-rank1-bigvec solver, which
+   solves a vector of distributed DFTs via transpose+dft+transpose.
+   Therefore, we can group *either* the DFTs of size m *or* the
+   DFTs of size r with their surrounding transposes as a single
+   distributed-DFT (ddft) plan.  These two variations correspond to
+   apply_ddft_first or apply_ddft_last, respectively.
+*/
+
+static void apply_ddft_first(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld_dft;
+     plan_rdft *cldt, *cld_ddft;
+     INT roff, ioff, im, mmax, ms, r, vn;
+     triggen *t;
+     R *dI, *dO;
+
+     /* distributed size-m DFTs, with output in m x r format */
+     cld_ddft = (plan_rdft *) ego->cld_ddft;
+     cld_ddft->apply(ego->cld_ddft, I, O);
+
+     cldt = (plan_rdft *) ego->cldt;
+     if (ego->preserve_input || !cldt) I = O;
+
+     /* twiddle multiplications, followed by 1d DFTs of size-r */
+     cld_dft = (plan_dft *) ego->cld_dft;
+     roff = ego->roff; ioff = ego->ioff;
+     mmax = ego->xmax; ms = ego->xs;
+     t = ego->t; r = ego->r; vn = ego->vn;
+     dI = O; dO = I;
+     for (im = ego->xmin; im <= mmax; ++im) {
+	  do_twiddle(t, im, r, vn, dI+roff, dI+ioff);
+	  cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
+	  dI += ms; dO += ms;
+     }
+
+     /* final global transpose (m x r -> r x m), if not SCRAMBLED_OUT */
+     if (cldt) 
+	  cldt->apply((plan *) cldt, I, O);
+}
+
+static void apply_ddft_last(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld_dft;
+     plan_rdft *cldt, *cld_ddft;
+     INT roff, ioff, ir, rmax, rs, m, vn;
+     triggen *t;
+     R *dI, *dO0, *dO;
+
+     /* initial global transpose (m x r -> r x m), if not SCRAMBLED_IN */
+     cldt = (plan_rdft *) ego->cldt;
+     if (cldt) {
+	  cldt->apply((plan *) cldt, I, O);
+	  dI = O;
+     }
+     else 
+	  dI = I;
+     if (ego->preserve_input) dO = O; else dO = I;
+     dO0 = dO;
+
+     /* 1d DFTs of size m, followed by twiddle multiplications */
+     cld_dft = (plan_dft *) ego->cld_dft;
+     roff = ego->roff; ioff = ego->ioff;
+     rmax = ego->xmax; rs = ego->xs;
+     t = ego->t; m = ego->m; vn = ego->vn;
+     for (ir = ego->xmin; ir <= rmax; ++ir) {
+	  cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
+	  do_twiddle(t, ir, m, vn, dO+roff, dO+ioff);
+	  dI += rs; dO += rs;
+     }
+
+     /* distributed size-r DFTs, with output in r x m format */
+     cld_ddft = (plan_rdft *) ego->cld_ddft;
+     cld_ddft->apply(ego->cld_ddft, dO0, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr,
+		      INT *r, INT rblock[2], INT mblock[2])
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     int n_pes;
+     MPI_Comm_size(p->comm, &n_pes);
+     return (1
+	     && p->sz->rnk == 1
+
+	     && ONLY_SCRAMBLEDP(p->flags)
+
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+                                          && p->I != p->O))
+
+	     && (!(p->flags & SCRAMBLED_IN) || ego->apply == apply_ddft_last)
+	     && (!(p->flags & SCRAMBLED_OUT) || ego->apply == apply_ddft_first)
+
+	     && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
+                 || !XM(dft_serial_applicable)(p))
+
+	     /* disallow if dft-rank1-bigvec is applicable since the
+		data distribution may be slightly different (ugh!) */
+	     && (p->vn < n_pes || p->flags)
+
+	     && (*r = XM(choose_radix)(p->sz->dims[0], n_pes,
+				       p->flags, p->sign,
+				       rblock, mblock))
+
+	     /* ddft_first or last has substantial advantages in the
+		bigvec transpositions for the common case where
+		n_pes == n/r or r, respectively */
+	     && (!NO_UGLYP(plnr)
+		 || !(*r == n_pes && ego->apply == apply_ddft_first)
+		 || !(p->sz->dims[0].n / *r == n_pes 
+		      && ego->apply == apply_ddft_last))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldt, wakefulness);
+     X(plan_awake)(ego->cld_dft, wakefulness);
+     X(plan_awake)(ego->cld_ddft, wakefulness);
+
+     switch (wakefulness) {
+         case SLEEPY:
+              X(triggen_destroy)(ego->t); ego->t = 0;
+              break;
+         default:
+              ego->t = X(mktriggen)(AWAKE_SQRTN_TABLE, ego->r * ego->m);
+              break;
+     }
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldt);
+     X(plan_destroy_internal)(ego->cld_dft);
+     X(plan_destroy_internal)(ego->cld_ddft);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-dft-rank1/%D%s%s%(%p%)%(%p%)%(%p%))",
+	      ego->r,
+	      ego->super.apply == apply_ddft_first ? "/first" : "/last",
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld_ddft, ego->cld_dft, ego->cldt);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_dft *p;
+     P *pln;
+     plan *cld_dft = 0, *cld_ddft = 0, *cldt = 0;
+     R *ri, *ii, *ro, *io, *I, *O;
+     INT r, rblock[2], m, mblock[2], rp, mp, mpblock[2], mpb;
+     int my_pe, n_pes, preserve_input, ddft_first;
+     dtensor *sz;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr, &r, rblock, mblock))
+          return (plan *) 0;
+
+     p = (const problem_mpi_dft *) p_;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     m = p->sz->dims[0].n / r;
+
+     /* some hackery so that we can plan both ddft_first and ddft_last
+	as if they were ddft_first */
+     if ((ddft_first = (ego->apply == apply_ddft_first))) {
+	  rp = r; mp = m;
+	  mpblock[IB] = mblock[IB]; mpblock[OB] = mblock[OB];
+	  mpb = XM(block)(mp, mpblock[OB], my_pe);
+     }
+     else {
+	  rp = m; mp = r;
+	  mpblock[IB] = rblock[IB]; mpblock[OB] = rblock[OB];
+	  mpb = XM(block)(mp, mpblock[IB], my_pe);
+     }
+
+     preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     sz = XM(mkdtensor)(1);
+     sz->dims[0].n = mp;
+     sz->dims[0].b[IB] = mpblock[IB];
+     sz->dims[0].b[OB] = mpblock[OB];
+     I = (ddft_first || !preserve_input) ? p->I : p->O;
+     O = p->O;
+     cld_ddft = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz, rp * p->vn,
+						      I, O, p->comm, p->sign,
+						      RANK1_BIGVEC_ONLY));
+     if (XM(any_true)(!cld_ddft, p->comm)) goto nada;
+
+     I = TAINT((ddft_first || !p->flags) ? p->O : p->I, rp * p->vn * 2);
+     O = TAINT((preserve_input || (ddft_first && p->flags)) ? p->O : p->I, 
+	       rp * p->vn * 2);
+     X(extract_reim)(p->sign, I, &ri, &ii);
+     X(extract_reim)(p->sign, O, &ro, &io);
+     cld_dft = X(mkplan_d)(plnr,
+			X(mkproblem_dft_d)(X(mktensor_1d)(rp, p->vn*2,p->vn*2),
+					   X(mktensor_1d)(p->vn, 2, 2),
+					   ri, ii, ro, io));
+     if (XM(any_true)(!cld_dft, p->comm)) goto nada;
+     
+     if (!p->flags) { /* !(SCRAMBLED_IN or SCRAMBLED_OUT) */
+	  I = (ddft_first && preserve_input) ? p->O : p->I;
+	  O = p->O;
+	  cldt = X(mkplan_d)(plnr,
+			     XM(mkproblem_transpose)(
+				  m, r, p->vn * 2,
+				  I, O,
+				  ddft_first ? mblock[OB] : mblock[IB],
+				  ddft_first ? rblock[OB] : rblock[IB],
+				  p->comm, 0));
+	  if (XM(any_true)(!cldt, p->comm)) goto nada;	  
+     }
+
+     pln = MKPLAN_MPI_DFT(P, &padt, ego->apply);
+
+     pln->cld_ddft = cld_ddft;
+     pln->cld_dft = cld_dft;
+     pln->cldt = cldt;
+     pln->preserve_input = preserve_input;
+     X(extract_reim)(p->sign, p->O, &ro, &io);
+     pln->roff = ro - p->O;
+     pln->ioff = io - p->O;
+     pln->vn = p->vn;
+     pln->m = m;
+     pln->r = r;
+     pln->xmin = (ddft_first ? mblock[OB] : rblock[IB]) * my_pe;
+     pln->xmax = pln->xmin + mpb - 1;
+     pln->xs = rp * p->vn * 2;
+     pln->t = 0;
+
+     X(ops_add)(&cld_ddft->ops, &cld_dft->ops, &pln->super.super.ops);
+     if (cldt) X(ops_add2)(&cldt->ops, &pln->super.super.ops);
+     {
+          double n0 = (1 + pln->xmax - pln->xmin) * (mp - 1) * pln->vn;
+          pln->super.super.ops.mul += 8 * n0;
+          pln->super.super.ops.add += 4 * n0;
+          pln->super.super.ops.other += 8 * n0;
+     }
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldt);
+     X(plan_destroy_internal)(cld_dft);
+     X(plan_destroy_internal)(cld_ddft);
+     return (plan *) 0;
+}
+
+static solver *mksolver(rdftapply apply, int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->apply = apply;
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(dft_rank1_register)(planner *p)
+{
+     rdftapply apply[] = { apply_ddft_first, apply_ddft_last };
+     unsigned int iapply;
+     int preserve_input;
+     for (iapply = 0; iapply < sizeof(apply) / sizeof(apply[0]); ++iapply)
+	  for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	       REGISTER_SOLVER(p, mksolver(apply[iapply], preserve_input));
+}
--- a/fftw-3.3.10/mpi/dft-serial.c
+++ b/fftw-3.3.10/mpi/dft-serial.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* "MPI" DFTs where all of the data is on one processor...just
+   call through to serial API. */
+
+#include "mpi-dft.h"
+#include "dft/dft.h"
+
+typedef struct {
+     plan_mpi_dft super;
+     plan *cld;
+     INT roff, ioff;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_dft *cld;
+     INT roff = ego->roff, ioff = ego->ioff;
+     cld = (plan_dft *) ego->cld;
+     cld->apply(ego->cld, I+roff, I+ioff, O+roff, O+ioff);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-dft-serial %(%p%))", ego->cld);
+}
+
+int XM(dft_serial_applicable)(const problem_mpi_dft *p)
+{
+     return (1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
+		 || p->vn == 0));
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     P *pln;
+     plan *cld;
+     int my_pe;
+     R *ri, *ii, *ro, *io;
+     static const plan_adt padt = {
+          XM(dft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     /* check whether applicable: */
+     if (!XM(dft_serial_applicable)(p))
+          return (plan *) 0;
+
+     X(extract_reim)(p->sign, p->I, &ri, &ii);
+     X(extract_reim)(p->sign, p->O, &ro, &io);
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     if (my_pe == 0 && p->vn > 0) {
+	  int i, rnk = p->sz->rnk;
+	  tensor *sz = X(mktensor)(p->sz->rnk);
+	  sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
+	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
+	  for (i = rnk - 1; i > 0; --i) {
+	       sz->dims[i - 1].is = sz->dims[i - 1].os = 
+		    sz->dims[i].is * sz->dims[i].n;
+	       sz->dims[i - 1].n = p->sz->dims[i - 1].n;
+	  }
+	  
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_dft_d)(sz,
+					       X(mktensor_1d)(p->vn, 2, 2),
+					       ri, ii, ro, io));
+     }
+     else { /* idle process: make nop plan */
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_dft_d)(X(mktensor_0d)(),
+					       X(mktensor_1d)(0,0,0),
+					       ri, ii, ro, io));
+     }
+     if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
+
+     pln = MKPLAN_MPI_DFT(P, &padt, apply);
+     pln->cld = cld;
+     pln->roff = ro - p->O;
+     pln->ioff = io - p->O;
+     X(ops_cpy)(&cld->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void XM(dft_serial_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
--- a/fftw-3.3.10/mpi/dft-solve.c
+++ b/fftw-3.3.10/mpi/dft-solve.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-dft.h"
+
+/* use the apply() operation for MPI_DFT problems */
+void XM(dft_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_mpi_dft *ego = (const plan_mpi_dft *) ego_;
+     const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
--- a/fftw-3.3.10/mpi/dtensor.c
+++ b/fftw-3.3.10/mpi/dtensor.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+dtensor *XM(mkdtensor)(int rnk) 
+{
+     dtensor *x;
+
+     A(rnk >= 0);
+
+#if defined(STRUCT_HACK_KR)
+     if (FINITE_RNK(rnk) && rnk > 1)
+	  x = (dtensor *)MALLOC(sizeof(dtensor) + (rnk - 1) * sizeof(ddim),
+				    TENSORS);
+     else
+	  x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
+#elif defined(STRUCT_HACK_C99)
+     if (FINITE_RNK(rnk))
+	  x = (dtensor *)MALLOC(sizeof(dtensor) + rnk * sizeof(ddim),
+				    TENSORS);
+     else
+	  x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
+#else
+     x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
+     if (FINITE_RNK(rnk) && rnk > 0)
+          x->dims = (ddim *)MALLOC(sizeof(ddim) * rnk, TENSORS);
+     else
+          x->dims = 0;
+#endif
+
+     x->rnk = rnk;
+     return x;
+}
+
+void XM(dtensor_destroy)(dtensor *sz)
+{
+#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
+     X(ifree0)(sz->dims);
+#endif
+     X(ifree)(sz);
+}
+
+void XM(dtensor_md5)(md5 *p, const dtensor *t)
+{
+     int i;
+     X(md5int)(p, t->rnk);
+     if (FINITE_RNK(t->rnk)) {
+          for (i = 0; i < t->rnk; ++i) {
+               const ddim *q = t->dims + i;
+               X(md5INT)(p, q->n);
+               X(md5INT)(p, q->b[IB]);
+               X(md5INT)(p, q->b[OB]);
+          }
+     }
+}
+
+dtensor *XM(dtensor_copy)(const dtensor *sz)
+{
+     dtensor *x = XM(mkdtensor)(sz->rnk);
+     int i;
+     if (FINITE_RNK(sz->rnk))
+          for (i = 0; i < sz->rnk; ++i)
+               x->dims[i] = sz->dims[i];
+     return x;
+}
+
+dtensor *XM(dtensor_canonical)(const dtensor *sz, int compress)
+{
+     int i, rnk;
+     dtensor *x;
+     block_kind k;
+
+     if (!FINITE_RNK(sz->rnk))
+	  return XM(mkdtensor)(sz->rnk);
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+	  if (sz->dims[i].n <= 0)
+	       return XM(mkdtensor)(RNK_MINFTY);
+	  else if (!compress || sz->dims[i].n > 1)
+	       ++rnk;
+     }
+     x = XM(mkdtensor)(rnk);
+     for (i = rnk = 0; i < sz->rnk; ++i) {
+	  if (!compress || sz->dims[i].n > 1) {
+               x->dims[rnk].n = sz->dims[i].n;
+	       FORALL_BLOCK_KIND(k) {
+		    if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]) == 1)
+			 x->dims[rnk].b[k] = sz->dims[i].n;
+		    else
+			 x->dims[rnk].b[k] = sz->dims[i].b[k];
+	       }
+	       ++rnk;
+	  }
+     }
+     return x;
+}
+
+int XM(dtensor_validp)(const dtensor *sz)
+{
+     int i;
+     if (sz->rnk < 0) return 0;
+     if (FINITE_RNK(sz->rnk))
+	  for (i = 0; i < sz->rnk; ++i)
+	       if (sz->dims[i].n < 0
+		   || sz->dims[i].b[IB] <= 0
+		   || sz->dims[i].b[OB] <= 0)
+		    return 0;
+     return 1;
+}
+
+void XM(dtensor_print)(const dtensor *t, printer *p)
+{
+     if (FINITE_RNK(t->rnk)) {
+          int i;
+          int first = 1;
+          p->print(p, "(");
+          for (i = 0; i < t->rnk; ++i) {
+               const ddim *d = t->dims + i;
+               p->print(p, "%s(%D %D %D)",
+                        first ? "" : " ",
+                        d->n, d->b[IB], d->b[OB]);
+               first = 0;
+          }
+          p->print(p, ")");
+     } else {
+          p->print(p, "rank-minfty");
+     }
+
+}
--- a/fftw-3.3.10/mpi/f03-wrap.c
+++ b/fftw-3.3.10/mpi/f03-wrap.c
@@ -0,0 +1,284 @@
+/* Generated automatically.  DO NOT EDIT! */
+
+#include "fftw3-mpi.h"
+#include "ifftw-mpi.h"
+
+FFTW_EXTERN ptrdiff_t XM(local_size_many_transposed_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_many_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_transposed_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_many_1d_f03)(ptrdiff_t n0, ptrdiff_t howmany, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_1d_f03)(ptrdiff_t n0, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_2d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
+FFTW_EXTERN ptrdiff_t XM(local_size_3d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
+FFTW_EXTERN X(plan) XM(plan_many_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, R * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_many_dft_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_1d_f03)(ptrdiff_t n0, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_many_r2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_r2r_f03)(int rnk, const ptrdiff_t * n, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_r2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_r2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_many_dft_r2c_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_f03)(int rnk, const ptrdiff_t * n, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_many_dft_c2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
+FFTW_EXTERN void XM(gather_wisdom_f03)(MPI_Fint f_comm_);
+FFTW_EXTERN void XM(broadcast_wisdom_f03)(MPI_Fint f_comm_);
+
+ptrdiff_t XM(local_size_many_transposed_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_many_transposed)(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start,local_n1,local_1_start);
+}
+
+ptrdiff_t XM(local_size_many_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_many)(rnk,n,howmany,block0,comm,local_n0,local_0_start);
+}
+
+ptrdiff_t XM(local_size_transposed_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_transposed)(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start);
+}
+
+ptrdiff_t XM(local_size_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size)(rnk,n,comm,local_n0,local_0_start);
+}
+
+ptrdiff_t XM(local_size_many_1d_f03)(ptrdiff_t n0, ptrdiff_t howmany, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_many_1d)(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start);
+}
+
+ptrdiff_t XM(local_size_1d_f03)(ptrdiff_t n0, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_1d)(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start);
+}
+
+ptrdiff_t XM(local_size_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_2d)(n0,n1,comm,local_n0,local_0_start);
+}
+
+ptrdiff_t XM(local_size_2d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_2d_transposed)(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start);
+}
+
+ptrdiff_t XM(local_size_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_3d)(n0,n1,n2,comm,local_n0,local_0_start);
+}
+
+ptrdiff_t XM(local_size_3d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(local_size_3d_transposed)(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start);
+}
+
+X(plan) XM(plan_many_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, R * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_transpose)(n0,n1,howmany,block0,block1,in,out,comm,flags);
+}
+
+X(plan) XM(plan_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_transpose)(n0,n1,in,out,comm,flags);
+}
+
+X(plan) XM(plan_many_dft_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_dft)(rnk,n,howmany,block,tblock,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_dft_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft)(rnk,n,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_dft_1d_f03)(ptrdiff_t n0, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_1d)(n0,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_dft_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_2d)(n0,n1,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_dft_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_3d)(n0,n1,n2,in,out,comm,sign,flags);
+}
+
+X(plan) XM(plan_many_r2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_r2r)(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags);
+}
+
+X(plan) XM(plan_r2r_f03)(int rnk, const ptrdiff_t * n, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_r2r)(rnk,n,in,out,comm,kind,flags);
+}
+
+X(plan) XM(plan_r2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_r2r_2d)(n0,n1,in,out,comm,kind0,kind1,flags);
+}
+
+X(plan) XM(plan_r2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_r2r_3d)(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags);
+}
+
+X(plan) XM(plan_many_dft_r2c_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_dft_r2c)(rnk,n,howmany,iblock,oblock,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_r2c_f03)(int rnk, const ptrdiff_t * n, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_r2c)(rnk,n,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_r2c_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_r2c_2d)(n0,n1,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_r2c_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_r2c_3d)(n0,n1,n2,in,out,comm,flags);
+}
+
+X(plan) XM(plan_many_dft_c2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_many_dft_c2r)(rnk,n,howmany,iblock,oblock,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_c2r_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_c2r)(rnk,n,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_c2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_c2r_2d)(n0,n1,in,out,comm,flags);
+}
+
+X(plan) XM(plan_dft_c2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
+{
+     MPI_Comm comm;
+
+     comm = MPI_Comm_f2c(f_comm);
+     return XM(plan_dft_c2r_3d)(n0,n1,n2,in,out,comm,flags);
+}
+
+void XM(gather_wisdom_f03)(MPI_Fint f_comm_)
+{
+     MPI_Comm comm_;
+
+     comm_ = MPI_Comm_f2c(f_comm_);
+     XM(gather_wisdom)(comm_);
+}
+
+void XM(broadcast_wisdom_f03)(MPI_Fint f_comm_)
+{
+     MPI_Comm comm_;
+
+     comm_ = MPI_Comm_f2c(f_comm_);
+     XM(broadcast_wisdom)(comm_);
+}
--- a/fftw-3.3.10/mpi/f03-wrap.sh
+++ b/fftw-3.3.10/mpi/f03-wrap.sh
@@ -0,0 +1,22 @@
+#! /bin/sh
+
+# Script to generate Fortran 2003 wrappers for FFTW's MPI functions.  This
+# is necessary because MPI provides no way to deal with C MPI_Comm handles
+# from Fortran (where MPI_Comm == integer), but does provide a way to
+# deal with Fortran MPI_Comm handles from C (via MPI_Comm_f2c).  So,
+# every FFTW function that takes an MPI_Comm argument needs a wrapper
+# function that takes a Fortran integer and converts it to MPI_Comm.
+
+echo "/* Generated automatically.  DO NOT EDIT! */"
+echo
+
+echo "#include \"fftw3-mpi.h\""
+echo "#include \"ifftw-mpi.h\""
+echo
+
+# Declare prototypes using FFTW_EXTERN, important for Windows DLLs
+grep -v 'mpi.h' fftw3-mpi.h | gcc -E -I../api - |grep "fftw_mpi_init" |tr ';' '\n' | grep "MPI_Comm" | perl genf03-wrap.pl | grep "MPI_Fint" | sed 's/^/FFTW_EXTERN /;s/$/;/'
+
+grep -v 'mpi.h' fftw3-mpi.h | gcc -E -I../api - |grep "fftw_mpi_init" |tr ';' '\n' | grep "MPI_Comm" | perl genf03-wrap.pl
+
+
--- a/fftw-3.3.10/mpi/f03api.sh
+++ b/fftw-3.3.10/mpi/f03api.sh
@@ -0,0 +1,43 @@
+#! /bin/sh
+
+# Script to generate Fortran 2003 interface declarations for FFTW's MPI
+# interface from the fftw3-mpi.h header file.
+
+# This is designed so that the Fortran caller can do:
+#   use, intrinsic :: iso_c_binding
+#   implicit none
+#   include 'fftw3-mpi.f03'
+# and then call the C FFTW MPI functions directly, with type checking.
+#
+# One caveat: because there is no standard way to conver MPI_Comm objects
+# from Fortran (= integer) to C (= opaque type), the Fortran interface
+# technically calls C wrapper functions (also auto-generated) which
+# call MPI_Comm_f2c to convert the communicators as needed.
+
+echo "! Generated automatically.  DO NOT EDIT!"
+echo
+
+echo "  include 'fftw3.f03'"
+echo
+
+# Extract constants
+perl -pe 's/#define +([A-Z0-9_]+) +\(([+-]?[0-9]+)U?\)/\n  integer\(C_INTPTR_T\), parameter :: \1 = \2\n/g' < fftw3-mpi.h | grep 'integer(C_INTPTR_T)'
+perl -pe 'if (/#define +([A-Z0-9_]+) +\(([0-9]+)U? *<< *([0-9]+)\)/) { print "\n  integer\(C_INT\), parameter :: $1 = ",$2 << $3,"\n"; }' < fftw3-mpi.h | grep 'integer(C_INT)'
+
+# Extract function declarations
+for p in $*; do
+    if test "$p" = "d"; then p=""; fi
+
+    echo
+    cat <<EOF
+  type, bind(C) :: fftw${p}_mpi_ddim
+     integer(C_INTPTR_T) n, ib, ob
+  end type fftw${p}_mpi_ddim
+EOF
+
+    echo
+    echo "  interface"
+    grep -v 'mpi.h' fftw3-mpi.h | gcc -I../api -D__GNUC__=5 -D__i386__ -E - |grep "fftw${p}_mpi_init" |tr ';' '\n' | perl ../api/genf03.pl
+    echo "  end interface"
+
+done
--- a/fftw-3.3.10/mpi/fftw3-mpi.f03.in
+++ b/fftw-3.3.10/mpi/fftw3-mpi.f03.in
@@ -0,0 +1,810 @@
+! Generated automatically.  DO NOT EDIT!
+
+  include 'fftw3.f03'
+
+  integer(C_INTPTR_T), parameter :: FFTW_MPI_DEFAULT_BLOCK = 0
+  integer(C_INT), parameter :: FFTW_MPI_SCRAMBLED_IN = 134217728
+  integer(C_INT), parameter :: FFTW_MPI_SCRAMBLED_OUT = 268435456
+  integer(C_INT), parameter :: FFTW_MPI_TRANSPOSED_IN = 536870912
+  integer(C_INT), parameter :: FFTW_MPI_TRANSPOSED_OUT = 1073741824
+
+  type, bind(C) :: fftw_mpi_ddim
+     integer(C_INTPTR_T) n, ib, ob
+  end type fftw_mpi_ddim
+
+  interface
+    subroutine fftw_mpi_init() bind(C, name='fftw_mpi_init')
+      import
+    end subroutine fftw_mpi_init
+    
+    subroutine fftw_mpi_cleanup() bind(C, name='fftw_mpi_cleanup')
+      import
+    end subroutine fftw_mpi_cleanup
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
+                                                                     local_n1,local_1_start) &
+                                 bind(C, name='fftw_mpi_local_size_many_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftw_mpi_local_size_many_transposed
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftw_mpi_local_size_many_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftw_mpi_local_size_many
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftw_mpi_local_size_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftw_mpi_local_size_transposed
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftw_mpi_local_size_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftw_mpi_local_size
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
+                                                             local_o_start) bind(C, name='fftw_mpi_local_size_many_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftw_mpi_local_size_many_1d
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
+                                 bind(C, name='fftw_mpi_local_size_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftw_mpi_local_size_1d
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftw_mpi_local_size_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftw_mpi_local_size_2d
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftw_mpi_local_size_2d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftw_mpi_local_size_2d_transposed
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftw_mpi_local_size_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftw_mpi_local_size_3d
+    
+    integer(C_INTPTR_T) function fftw_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftw_mpi_local_size_3d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftw_mpi_local_size_3d_transposed
+    
+    type(C_PTR) function fftw_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
+                         bind(C, name='fftw_mpi_plan_many_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_transpose
+    
+    type(C_PTR) function fftw_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_transpose
+    
+    type(C_PTR) function fftw_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
+                         bind(C, name='fftw_mpi_plan_many_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block
+      integer(C_INTPTR_T), value :: tblock
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_dft
+    
+    type(C_PTR) function fftw_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft
+    
+    type(C_PTR) function fftw_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_1d
+    
+    type(C_PTR) function fftw_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_2d
+    
+    type(C_PTR) function fftw_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_3d
+    
+    type(C_PTR) function fftw_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
+                         bind(C, name='fftw_mpi_plan_many_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_r2r
+    
+    type(C_PTR) function fftw_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftw_mpi_plan_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_r2r
+    
+    type(C_PTR) function fftw_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftw_mpi_plan_r2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_r2r_2d
+    
+    type(C_PTR) function fftw_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) bind(C, name='fftw_mpi_plan_r2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_r2r_3d
+    
+    type(C_PTR) function fftw_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftw_mpi_plan_many_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_dft_r2c
+    
+    type(C_PTR) function fftw_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_r2c
+    
+    type(C_PTR) function fftw_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftw_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftw_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftw_mpi_plan_many_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_many_dft_c2r
+    
+    type(C_PTR) function fftw_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_c2r
+    
+    type(C_PTR) function fftw_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftw_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftw_mpi_plan_dft_c2r_3d
+    
+    subroutine fftw_mpi_gather_wisdom(comm_) bind(C, name='fftw_mpi_gather_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftw_mpi_gather_wisdom
+    
+    subroutine fftw_mpi_broadcast_wisdom(comm_) bind(C, name='fftw_mpi_broadcast_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftw_mpi_broadcast_wisdom
+    
+    subroutine fftw_mpi_execute_dft(p,in,out) bind(C, name='fftw_mpi_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftw_mpi_execute_dft
+    
+    subroutine fftw_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftw_mpi_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: in
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftw_mpi_execute_dft_r2c
+    
+    subroutine fftw_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftw_mpi_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftw_mpi_execute_dft_c2r
+    
+    subroutine fftw_mpi_execute_r2r(p,in,out) bind(C, name='fftw_mpi_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_DOUBLE), dimension(*), intent(inout) :: in
+      real(C_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftw_mpi_execute_r2r
+    
+  end interface
+
+  type, bind(C) :: fftwf_mpi_ddim
+     integer(C_INTPTR_T) n, ib, ob
+  end type fftwf_mpi_ddim
+
+  interface
+    subroutine fftwf_mpi_init() bind(C, name='fftwf_mpi_init')
+      import
+    end subroutine fftwf_mpi_init
+    
+    subroutine fftwf_mpi_cleanup() bind(C, name='fftwf_mpi_cleanup')
+      import
+    end subroutine fftwf_mpi_cleanup
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
+                                                                      local_n1,local_1_start) &
+                                 bind(C, name='fftwf_mpi_local_size_many_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwf_mpi_local_size_many_transposed
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwf_mpi_local_size_many_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwf_mpi_local_size_many
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwf_mpi_local_size_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwf_mpi_local_size_transposed
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftwf_mpi_local_size_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwf_mpi_local_size
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
+                                                              local_o_start) bind(C, name='fftwf_mpi_local_size_many_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftwf_mpi_local_size_many_1d
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
+                                 bind(C, name='fftwf_mpi_local_size_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftwf_mpi_local_size_1d
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwf_mpi_local_size_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwf_mpi_local_size_2d
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwf_mpi_local_size_2d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwf_mpi_local_size_2d_transposed
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwf_mpi_local_size_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwf_mpi_local_size_3d
+    
+    integer(C_INTPTR_T) function fftwf_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwf_mpi_local_size_3d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwf_mpi_local_size_3d_transposed
+    
+    type(C_PTR) function fftwf_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_transpose
+    
+    type(C_PTR) function fftwf_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_transpose
+    
+    type(C_PTR) function fftwf_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block
+      integer(C_INTPTR_T), value :: tblock
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_dft
+    
+    type(C_PTR) function fftwf_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_1d
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_2d
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_3d
+    
+    type(C_PTR) function fftwf_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_r2r
+    
+    type(C_PTR) function fftwf_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftwf_mpi_plan_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_r2r
+    
+    type(C_PTR) function fftwf_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftwf_mpi_plan_r2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_r2r_2d
+    
+    type(C_PTR) function fftwf_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) &
+                         bind(C, name='fftwf_mpi_plan_r2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_r2r_3d
+    
+    type(C_PTR) function fftwf_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_dft_r2c
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_r2c
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_FLOAT), dimension(*), intent(out) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftwf_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftwf_mpi_plan_many_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_many_dft_c2r
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_c2r
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftwf_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwf_mpi_plan_dft_c2r_3d
+    
+    subroutine fftwf_mpi_gather_wisdom(comm_) bind(C, name='fftwf_mpi_gather_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftwf_mpi_gather_wisdom
+    
+    subroutine fftwf_mpi_broadcast_wisdom(comm_) bind(C, name='fftwf_mpi_broadcast_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftwf_mpi_broadcast_wisdom
+    
+    subroutine fftwf_mpi_execute_dft(p,in,out) bind(C, name='fftwf_mpi_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwf_mpi_execute_dft
+    
+    subroutine fftwf_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftwf_mpi_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: in
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwf_mpi_execute_dft_r2c
+    
+    subroutine fftwf_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftwf_mpi_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+    end subroutine fftwf_mpi_execute_dft_c2r
+    
+    subroutine fftwf_mpi_execute_r2r(p,in,out) bind(C, name='fftwf_mpi_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_FLOAT), dimension(*), intent(inout) :: in
+      real(C_FLOAT), dimension(*), intent(out) :: out
+    end subroutine fftwf_mpi_execute_r2r
+    
+  end interface
--- a/fftw-3.3.10/mpi/fftw3-mpi.h
+++ b/fftw-3.3.10/mpi/fftw3-mpi.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * The following statement of license applies *only* to this header file,
+ * and *not* to the other files distributed with FFTW or derived therefrom:
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/***************************** NOTE TO USERS *********************************
+ *
+ *                 THIS IS A HEADER FILE, NOT A MANUAL
+ *
+ *    If you want to know how to use FFTW, please read the manual,
+ *    online at http://www.fftw.org/doc/ and also included with FFTW.
+ *    For a quick start, see the manual's tutorial section.
+ *
+ *   (Reading header files to learn how to use a library is a habit
+ *    stemming from code lacking a proper manual.  Arguably, it's a
+ *    *bad* habit in most cases, because header files can contain
+ *    interfaces that are not part of the public, stable API.)
+ *
+ ****************************************************************************/
+
+#ifndef FFTW3_MPI_H
+#define FFTW3_MPI_H
+
+#include <fftw3.h>
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+struct fftw_mpi_ddim_do_not_use_me {
+     ptrdiff_t n;                     /* dimension size */
+     ptrdiff_t ib;                    /* input block */
+     ptrdiff_t ob;                    /* output block */
+};
+
+/*
+  huge second-order macro that defines prototypes for all API
+  functions.  We expand this macro for each supported precision
+ 
+  XM: name-mangling macro (MPI)
+  X: name-mangling macro (serial)
+  R: real data type
+  C: complex data type
+*/
+
+#define FFTW_MPI_DEFINE_API(XM, X, R, C)			\
+								\
+typedef struct fftw_mpi_ddim_do_not_use_me XM(ddim);		\
+								\
+FFTW_EXTERN void XM(init)(void);				\
+FFTW_EXTERN void XM(cleanup)(void);				\
+								\
+FFTW_EXTERN ptrdiff_t XM(local_size_many_transposed)		\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t block0, ptrdiff_t block1, MPI_Comm comm,	\
+      ptrdiff_t *local_n0, ptrdiff_t *local_0_start,		\
+      ptrdiff_t *local_n1, ptrdiff_t *local_1_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_many)			\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t block0, MPI_Comm comm,				\
+      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_transposed)			\
+     (int rnk, const ptrdiff_t *n, MPI_Comm comm,		\
+      ptrdiff_t *local_n0, ptrdiff_t *local_0_start,		\
+      ptrdiff_t *local_n1, ptrdiff_t *local_1_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size)				\
+     (int rnk, const ptrdiff_t *n, MPI_Comm comm,		\
+      ptrdiff_t *local_n0, ptrdiff_t *local_0_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_many_1d)(			\
+     ptrdiff_t n0, ptrdiff_t howmany,				\
+     MPI_Comm comm, int sign, unsigned flags,			\
+     ptrdiff_t *local_ni, ptrdiff_t *local_i_start,		\
+     ptrdiff_t *local_no, ptrdiff_t *local_o_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_1d)(			\
+     ptrdiff_t n0, MPI_Comm comm, int sign, unsigned flags,	\
+     ptrdiff_t *local_ni, ptrdiff_t *local_i_start,		\
+     ptrdiff_t *local_no, ptrdiff_t *local_o_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_2d)(			\
+     ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,			\
+     ptrdiff_t *local_n0, ptrdiff_t *local_0_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_2d_transposed)(		\
+     ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm,			\
+     ptrdiff_t *local_n0, ptrdiff_t *local_0_start,		\
+     ptrdiff_t *local_n1, ptrdiff_t *local_1_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_3d)(			\
+     ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm,	\
+     ptrdiff_t *local_n0, ptrdiff_t *local_0_start);		\
+FFTW_EXTERN ptrdiff_t XM(local_size_3d_transposed)(		\
+     ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm,	\
+     ptrdiff_t *local_n0, ptrdiff_t *local_0_start,		\
+     ptrdiff_t *local_n1, ptrdiff_t *local_1_start);		\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_transpose)			\
+     (ptrdiff_t n0, ptrdiff_t n1,				\
+      ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1,	\
+      R *in, R *out, MPI_Comm comm, unsigned flags);		\
+FFTW_EXTERN X(plan) XM(plan_transpose)				\
+     (ptrdiff_t n0, ptrdiff_t n1,				\
+      R *in, R *out, MPI_Comm comm, unsigned flags);		\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_dft)				\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t block, ptrdiff_t tblock, C *in, C *out,		\
+      MPI_Comm comm, int sign, unsigned flags);			\
+FFTW_EXTERN X(plan) XM(plan_dft)				\
+     (int rnk, const ptrdiff_t *n, C *in, C *out,		\
+      MPI_Comm comm, int sign, unsigned flags);			\
+FFTW_EXTERN X(plan) XM(plan_dft_1d)				\
+     (ptrdiff_t n0, C *in, C *out,				\
+      MPI_Comm comm, int sign, unsigned flags);			\
+FFTW_EXTERN X(plan) XM(plan_dft_2d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, C *in, C *out,		\
+      MPI_Comm comm, int sign, unsigned flags);			\
+FFTW_EXTERN X(plan) XM(plan_dft_3d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, C *in, C *out,	\
+      MPI_Comm comm, int sign, unsigned flags);			\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_r2r)				\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t iblock, ptrdiff_t oblock, R *in, R *out,	\
+      MPI_Comm comm, const X(r2r_kind) *kind, unsigned flags);	\
+FFTW_EXTERN X(plan) XM(plan_r2r)				\
+     (int rnk, const ptrdiff_t *n, R *in, R *out,		\
+      MPI_Comm comm, const X(r2r_kind) *kind, unsigned flags);	\
+FFTW_EXTERN X(plan) XM(plan_r2r_2d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, R *in, R *out, MPI_Comm comm,	\
+      X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags);	\
+FFTW_EXTERN X(plan) XM(plan_r2r_3d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2,			\
+      R *in, R *out, MPI_Comm comm, X(r2r_kind) kind0,		\
+      X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags);	\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_dft_r2c)			\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t iblock, ptrdiff_t oblock, R *in, C *out,	\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_r2c)				\
+     (int rnk, const ptrdiff_t *n, R *in, C *out,		\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_2d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, R *in, C *out,		\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_r2c_3d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R *in, C *out,	\
+      MPI_Comm comm, unsigned flags);				\
+								\
+FFTW_EXTERN X(plan) XM(plan_many_dft_c2r)			\
+     (int rnk, const ptrdiff_t *n, ptrdiff_t howmany,		\
+      ptrdiff_t iblock, ptrdiff_t oblock, C *in, R *out,	\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_c2r)				\
+     (int rnk, const ptrdiff_t *n, C *in, R *out,		\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_2d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, C *in, R *out,		\
+      MPI_Comm comm, unsigned flags);				\
+FFTW_EXTERN X(plan) XM(plan_dft_c2r_3d)				\
+     (ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, C *in, R *out,	\
+      MPI_Comm comm, unsigned flags);				\
+								\
+FFTW_EXTERN void XM(gather_wisdom)(MPI_Comm comm_);		\
+FFTW_EXTERN void XM(broadcast_wisdom)(MPI_Comm comm_);          \
+								\
+FFTW_EXTERN void XM(execute_dft)(X(plan) p, C *in, C *out);	\
+FFTW_EXTERN void XM(execute_dft_r2c)(X(plan) p, R *in, C *out);	\
+FFTW_EXTERN void XM(execute_dft_c2r)(X(plan) p, C *in, R *out);	\
+FFTW_EXTERN void XM(execute_r2r)(X(plan) p, R *in, R *out); 
+
+
+
+/* end of FFTW_MPI_DEFINE_API macro */
+
+#define FFTW_MPI_MANGLE_DOUBLE(name) FFTW_MANGLE_DOUBLE(FFTW_CONCAT(mpi_,name))
+#define FFTW_MPI_MANGLE_FLOAT(name) FFTW_MANGLE_FLOAT(FFTW_CONCAT(mpi_,name))
+#define FFTW_MPI_MANGLE_LONG_DOUBLE(name) FFTW_MANGLE_LONG_DOUBLE(FFTW_CONCAT(mpi_,name))
+
+FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_DOUBLE, FFTW_MANGLE_DOUBLE, double, fftw_complex)
+FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_FLOAT, FFTW_MANGLE_FLOAT, float, fftwf_complex)
+FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_LONG_DOUBLE, FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
+
+#define FFTW_MPI_DEFAULT_BLOCK (0)
+
+/* MPI-specific flags */
+#define FFTW_MPI_SCRAMBLED_IN (1U << 27)
+#define FFTW_MPI_SCRAMBLED_OUT (1U << 28)
+#define FFTW_MPI_TRANSPOSED_IN (1U << 29)
+#define FFTW_MPI_TRANSPOSED_OUT (1U << 30)
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* FFTW3_MPI_H */
--- a/fftw-3.3.10/mpi/fftw3l-mpi.f03.in
+++ b/fftw-3.3.10/mpi/fftw3l-mpi.f03.in
@@ -0,0 +1,405 @@
+! Generated automatically.  DO NOT EDIT!
+
+  include 'fftw3l.f03'
+
+
+  type, bind(C) :: fftwl_mpi_ddim
+     integer(C_INTPTR_T) n, ib, ob
+  end type fftwl_mpi_ddim
+
+  interface
+    subroutine fftwl_mpi_init() bind(C, name='fftwl_mpi_init')
+      import
+    end subroutine fftwl_mpi_init
+    
+    subroutine fftwl_mpi_cleanup() bind(C, name='fftwl_mpi_cleanup')
+      import
+    end subroutine fftwl_mpi_cleanup
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
+                                                                      local_n1,local_1_start) &
+                                 bind(C, name='fftwl_mpi_local_size_many_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwl_mpi_local_size_many_transposed
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwl_mpi_local_size_many_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwl_mpi_local_size_many
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwl_mpi_local_size_transposed_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwl_mpi_local_size_transposed
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftwl_mpi_local_size_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwl_mpi_local_size
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
+                                                              local_o_start) bind(C, name='fftwl_mpi_local_size_many_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftwl_mpi_local_size_many_1d
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
+                                 bind(C, name='fftwl_mpi_local_size_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+      integer(C_INTPTR_T), intent(out) :: local_ni
+      integer(C_INTPTR_T), intent(out) :: local_i_start
+      integer(C_INTPTR_T), intent(out) :: local_no
+      integer(C_INTPTR_T), intent(out) :: local_o_start
+    end function fftwl_mpi_local_size_1d
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwl_mpi_local_size_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwl_mpi_local_size_2d
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwl_mpi_local_size_2d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwl_mpi_local_size_2d_transposed
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
+                                 bind(C, name='fftwl_mpi_local_size_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+    end function fftwl_mpi_local_size_3d
+    
+    integer(C_INTPTR_T) function fftwl_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
+                                 bind(C, name='fftwl_mpi_local_size_3d_transposed_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INTPTR_T), intent(out) :: local_n0
+      integer(C_INTPTR_T), intent(out) :: local_0_start
+      integer(C_INTPTR_T), intent(out) :: local_n1
+      integer(C_INTPTR_T), intent(out) :: local_1_start
+    end function fftwl_mpi_local_size_3d_transposed
+    
+    type(C_PTR) function fftwl_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block0
+      integer(C_INTPTR_T), value :: block1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_transpose
+    
+    type(C_PTR) function fftwl_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_transpose_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_transpose
+    
+    type(C_PTR) function fftwl_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: block
+      integer(C_INTPTR_T), value :: tblock
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_dft
+    
+    type(C_PTR) function fftwl_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_1d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_1d
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_2d
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: sign
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_3d
+    
+    type(C_PTR) function fftwl_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_r2r
+    
+    type(C_PTR) function fftwl_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftwl_mpi_plan_r2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_r2r
+    
+    type(C_PTR) function fftwl_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftwl_mpi_plan_r2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_r2r_2d
+    
+    type(C_PTR) function fftwl_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) &
+                         bind(C, name='fftwl_mpi_plan_r2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_FFTW_R2R_KIND), value :: kind0
+      integer(C_FFTW_R2R_KIND), value :: kind1
+      integer(C_FFTW_R2R_KIND), value :: kind2
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_r2r_3d
+    
+    type(C_PTR) function fftwl_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_dft_r2c
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_r2c
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_r2c_2d
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_r2c_3d
+    
+    type(C_PTR) function fftwl_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
+                         bind(C, name='fftwl_mpi_plan_many_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      integer(C_INTPTR_T), value :: howmany
+      integer(C_INTPTR_T), value :: iblock
+      integer(C_INTPTR_T), value :: oblock
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_many_dft_c2r
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_f03')
+      import
+      integer(C_INT), value :: rnk
+      integer(C_INTPTR_T), dimension(*), intent(in) :: n
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_c2r
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_2d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_c2r_2d
+    
+    type(C_PTR) function fftwl_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_3d_f03')
+      import
+      integer(C_INTPTR_T), value :: n0
+      integer(C_INTPTR_T), value :: n1
+      integer(C_INTPTR_T), value :: n2
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+      integer(C_MPI_FINT), value :: comm
+      integer(C_INT), value :: flags
+    end function fftwl_mpi_plan_dft_c2r_3d
+    
+    subroutine fftwl_mpi_gather_wisdom(comm_) bind(C, name='fftwl_mpi_gather_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftwl_mpi_gather_wisdom
+    
+    subroutine fftwl_mpi_broadcast_wisdom(comm_) bind(C, name='fftwl_mpi_broadcast_wisdom_f03')
+      import
+      integer(C_MPI_FINT), value :: comm_
+    end subroutine fftwl_mpi_broadcast_wisdom
+    
+    subroutine fftwl_mpi_execute_dft(p,in,out) bind(C, name='fftwl_mpi_execute_dft')
+      import
+      type(C_PTR), value :: p
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwl_mpi_execute_dft
+    
+    subroutine fftwl_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftwl_mpi_execute_dft_r2c')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
+    end subroutine fftwl_mpi_execute_dft_r2c
+    
+    subroutine fftwl_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftwl_mpi_execute_dft_c2r')
+      import
+      type(C_PTR), value :: p
+      complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftwl_mpi_execute_dft_c2r
+    
+    subroutine fftwl_mpi_execute_r2r(p,in,out) bind(C, name='fftwl_mpi_execute_r2r')
+      import
+      type(C_PTR), value :: p
+      real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
+      real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
+    end subroutine fftwl_mpi_execute_r2r
+    
+  end interface
--- a/fftw-3.3.10/mpi/genf03-wrap.pl
+++ b/fftw-3.3.10/mpi/genf03-wrap.pl
@@ -0,0 +1,78 @@
+#!/usr/bin/perl -w
+# Generate Fortran 2003 wrappers (which translate MPI_Comm from f2c) from
+# function declarations of the form (one per line):
+#     extern <type> fftw_mpi_<name>(...args...)
+#     extern <type> fftw_mpi_<name>(...args...)
+#     ...
+# with no line breaks within a given function.  (It's too much work to
+# write a general parser, since we just have to handle FFTW's header files.)
+# Each declaration has at least one MPI_Comm argument.
+
+sub canonicalize_type {
+    my($type);
+    ($type) = @_;
+    $type =~ s/ +/ /g;
+    $type =~ s/^ //;
+    $type =~ s/ $//;
+    $type =~ s/([^\* ])\*/$1 \*/g;
+    $type =~ s/double/R/;
+    $type =~ s/fftw_([A-Za-z0-9_]+)/X(\1)/;
+    return $type;
+}
+
+while (<>) {
+    next if /^ *$/;
+    if (/^ *extern +([a-zA-Z_0-9 ]+[ \*]) *fftw_mpi_([a-zA-Z_0-9]+) *\((.*)\) *$/) {
+	$ret = &canonicalize_type($1);
+	$name = $2;
+
+	$args = $3;
+
+	
+	print "\n$ret XM(${name}_f03)(";
+
+	$comma = "";
+	foreach $arg (split(/ *, */, $args)) {
+            $arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
+            $argtype = &canonicalize_type($1);
+            $argname = $2;
+	    print $comma;
+	    if ($argtype eq "MPI_Comm") {
+		print "MPI_Fint f_$argname";
+	    }
+	    else {
+		print "$argtype $argname";
+	    }
+	    $comma = ", ";
+        }
+	print ")\n{\n";
+
+	print "     MPI_Comm ";
+	$comma = "";
+	foreach $arg (split(/ *, */, $args)) {
+            $arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
+            $argtype = &canonicalize_type($1);
+            $argname = $2;
+	    if ($argtype eq "MPI_Comm") {
+		print "$comma$argname";
+		$comma = ", ";
+	    }
+        }
+	print ";\n\n";
+
+	foreach $arg (split(/ *, */, $args)) {
+            $arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
+            $argtype = &canonicalize_type($1);
+            $argname = $2;
+            if ($argtype eq "MPI_Comm") {
+                print "     $argname = MPI_Comm_f2c(f_$argname);\n";
+            }
+        }
+
+	$argnames = $args;
+	$argnames =~ s/([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) */$2/g;
+	print "     ";
+	print "return " if ($ret ne "void");
+	print "XM($name)($argnames);\n}\n";
+    }
+}
--- a/fftw-3.3.10/mpi/ifftw-mpi.h
+++ b/fftw-3.3.10/mpi/ifftw-mpi.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* FFTW-MPI internal header file */
+#ifndef __IFFTW_MPI_H__
+#define __IFFTW_MPI_H__
+
+#include "kernel/ifftw.h"
+#include "rdft/rdft.h"
+
+#include <mpi.h>
+
+/* mpi problem flags: problem-dependent meaning, but in general
+   SCRAMBLED means some reordering *within* the dimensions, while
+   TRANSPOSED means some reordering *of* the dimensions */
+#define SCRAMBLED_IN (1 << 0)
+#define SCRAMBLED_OUT (1 << 1)
+#define TRANSPOSED_IN (1 << 2)
+#define TRANSPOSED_OUT (1 << 3)
+#define RANK1_BIGVEC_ONLY (1 << 4) /* for rank=1, allow only bigvec solver */
+
+#define ONLY_SCRAMBLEDP(flags) (!((flags) & ~(SCRAMBLED_IN|SCRAMBLED_OUT)))
+#define ONLY_TRANSPOSEDP(flags) (!((flags) & ~(TRANSPOSED_IN|TRANSPOSED_OUT)))
+
+#if defined(FFTW_SINGLE)
+#  define FFTW_MPI_TYPE MPI_FLOAT
+#elif defined(FFTW_LDOUBLE)
+#  define FFTW_MPI_TYPE MPI_LONG_DOUBLE
+#elif defined(FFTW_QUAD)
+#  error MPI quad-precision type is unknown
+#else
+#  define FFTW_MPI_TYPE MPI_DOUBLE
+#endif
+
+/* all fftw-mpi identifiers start with fftw_mpi (or fftwf_mpi etc.) */
+#define XM(name) X(CONCAT(mpi_, name))
+
+/***********************************************************************/
+/* block distributions */
+
+/* a distributed dimension of length n with input and output block
+   sizes ib and ob, respectively. */
+typedef enum { IB = 0, OB } block_kind;
+typedef struct {
+     INT n;
+     INT b[2]; /* b[IB], b[OB] */
+} ddim;
+
+/* Loop over k in {IB, OB}.  Note: need explicit casts for C++. */
+#define FORALL_BLOCK_KIND(k) for (k = IB; k <= OB; k = (block_kind) (((int) k) + 1))
+
+/* unlike tensors in the serial FFTW, the ordering of the dtensor
+   dimensions matters - both the array and the block layout are
+   row-major order. */
+typedef struct {
+     int rnk;
+#if defined(STRUCT_HACK_KR)
+     ddim dims[1];
+#elif defined(STRUCT_HACK_C99)
+     ddim dims[];
+#else
+     ddim *dims;
+#endif
+} dtensor;
+
+
+/* dtensor.c: */
+dtensor *XM(mkdtensor)(int rnk);
+void XM(dtensor_destroy)(dtensor *sz);
+dtensor *XM(dtensor_copy)(const dtensor *sz);
+dtensor *XM(dtensor_canonical)(const dtensor *sz, int compress);
+int XM(dtensor_validp)(const dtensor *sz);
+void XM(dtensor_md5)(md5 *p, const dtensor *t);
+void XM(dtensor_print)(const dtensor *t, printer *p);
+
+/* block.c: */
+
+/* for a single distributed dimension: */
+INT XM(num_blocks)(INT n, INT block);
+int XM(num_blocks_ok)(INT n, INT block, MPI_Comm comm);
+INT XM(default_block)(INT n, int n_pes);
+INT XM(block)(INT n, INT block, int which_block);
+
+/* for multiple distributed dimensions: */
+INT XM(num_blocks_total)(const dtensor *sz, block_kind k);
+int XM(idle_process)(const dtensor *sz, block_kind k, int which_pe);
+void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe, 
+		     INT *coords);
+INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe);
+int XM(is_local_after)(int dim, const dtensor *sz, block_kind k);
+int XM(is_local)(const dtensor *sz, block_kind k);
+int XM(is_block1d)(const dtensor *sz, block_kind k);
+
+/* choose-radix.c */
+INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
+                     INT rblock[2], INT mblock[2]);
+
+/***********************************************************************/
+/* any_true.c */
+int XM(any_true)(int condition, MPI_Comm comm);
+int XM(md5_equal)(md5 m, MPI_Comm comm);
+
+/* conf.c */
+void XM(conf_standard)(planner *p);
+
+/***********************************************************************/
+/* rearrange.c */
+
+/* Different ways to rearrange the vector dimension vn during transposition,
+   reflecting different tradeoffs between ease of transposition and
+   contiguity during the subsequent DFTs.
+
+   TODO: can we pare this down to CONTIG and DISCONTIG, at least
+   in MEASURE mode?  SQUARE_MIDDLE is also used for 1d destroy-input DFTs. */
+typedef enum {
+     CONTIG = 0, /* vn x 1: make subsequent DFTs contiguous */
+     DISCONTIG, /* P x (vn/P) for P processes */
+     SQUARE_BEFORE, /* try to get square transpose at beginning */
+     SQUARE_MIDDLE, /* try to get square transpose in the middle */
+     SQUARE_AFTER /* try to get square transpose at end */
+} rearrangement;
+
+/* skipping SQUARE_AFTER since it doesn't seem to offer any advantage
+   over SQUARE_BEFORE */
+#define FORALL_REARRANGE(rearrange) for (rearrange = CONTIG; rearrange <= SQUARE_MIDDLE; rearrange = (rearrangement) (((int) rearrange) + 1))
+
+int XM(rearrange_applicable)(rearrangement rearrange, 
+			     ddim dim0, INT vn, int n_pes);
+INT XM(rearrange_ny)(rearrangement rearrange, ddim dim0, INT vn, int n_pes);
+
+/***********************************************************************/
+
+#endif /* __IFFTW_MPI_H__ */
+
--- a/fftw-3.3.10/mpi/mpi-bench.c
+++ b/fftw-3.3.10/mpi/mpi-bench.c
@@ -0,0 +1,844 @@
+/**************************************************************************/
+/* NOTE to users: this is the FFTW-MPI self-test and benchmark program.
+   It is probably NOT a good place to learn FFTW usage, since it has a
+   lot of added complexity in order to exercise and test the full API,
+   etcetera.  We suggest reading the manual. */
+/**************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include "fftw3-mpi.h"
+#include "tests/fftw-bench.h"
+
+#if defined(BENCHFFT_SINGLE)
+#  define BENCH_MPI_TYPE MPI_FLOAT
+#elif defined(BENCHFFT_LDOUBLE)
+#  define BENCH_MPI_TYPE MPI_LONG_DOUBLE
+#elif defined(BENCHFFT_QUAD)
+#  error MPI quad-precision type is unknown
+#else
+#  define BENCH_MPI_TYPE MPI_DOUBLE
+#endif
+
+#if SIZEOF_PTRDIFF_T == SIZEOF_INT
+#  define FFTW_MPI_PTRDIFF_T MPI_INT
+#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG
+#  define FFTW_MPI_PTRDIFF_T MPI_LONG
+#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG
+#  define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG
+#else
+#  error MPI type for ptrdiff_t is unknown
+#  define FFTW_MPI_PTRDIFF_T MPI_LONG
+#endif
+
+static const char *mkversion(void) { return FFTW(version); }
+static const char *mkcc(void) { return FFTW(cc); }
+static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
+static const char *mknproc(void) {
+     static char buf[32];
+     int ncpus;
+     MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
+#ifdef HAVE_SNPRINTF
+     snprintf(buf, 32, "%d", ncpus);
+#else
+     sprintf(buf, "%d", ncpus);
+#endif
+     return buf;
+}
+
+BEGIN_BENCH_DOC
+BENCH_DOC("name", "fftw3_mpi")
+BENCH_DOCF("version", mkversion)
+BENCH_DOCF("cc", mkcc)
+BENCH_DOCF("codelet-optim", mkcodelet_optim)
+BENCH_DOCF("nproc", mknproc)
+END_BENCH_DOC 
+
+static int n_pes = 1, my_pe = 0;
+
+/* global variables describing the shape of the data and its distribution */
+static int rnk;
+static ptrdiff_t vn, iNtot, oNtot;
+static ptrdiff_t *local_ni=0, *local_starti=0;
+static ptrdiff_t *local_no=0, *local_starto=0;
+static ptrdiff_t *all_local_ni=0, *all_local_starti=0; /* n_pes x rnk arrays */
+static ptrdiff_t *all_local_no=0, *all_local_starto=0; /* n_pes x rnk arrays */
+static ptrdiff_t *istrides = 0, *ostrides = 0;
+static ptrdiff_t *total_ni=0, *total_no=0;
+static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */
+static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */
+
+static bench_real *local_in = 0, *local_out = 0;
+static bench_real *all_local_in = 0, *all_local_out = 0;
+static int all_local_in_alloc = 0, all_local_out_alloc = 0;
+static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0;
+
+static void alloc_rnk(int rnk_) {
+     rnk = rnk_;
+     bench_free(local_ni);
+     if (rnk == 0)
+	  local_ni = 0;
+     else
+	  local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk
+						* (8 + n_pes * 4));
+
+     local_starti = local_ni + rnk;
+     local_no = local_ni + 2 * rnk;
+     local_starto = local_ni + 3 * rnk;
+     istrides = local_ni + 4 * rnk;
+     ostrides = local_ni + 5 * rnk;
+     total_ni = local_ni + 6 * rnk;
+     total_no = local_ni + 7 * rnk;
+     all_local_ni = local_ni + 8 * rnk;
+     all_local_starti = local_ni + (8 + n_pes) * rnk;
+     all_local_no = local_ni + (8 + 2 * n_pes) * rnk;
+     all_local_starto = local_ni + (8 + 3 * n_pes) * rnk;
+}
+
+static void setup_gather_scatter(void)
+{
+     int i, j;
+     ptrdiff_t off;
+
+     MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T,
+		all_local_ni, rnk, FFTW_MPI_PTRDIFF_T,
+		0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
+     MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T,
+		all_local_starti, rnk, FFTW_MPI_PTRDIFF_T,
+		0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
+
+     MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T,
+		all_local_no, rnk, FFTW_MPI_PTRDIFF_T,
+		0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
+     MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T,
+		all_local_starto, rnk, FFTW_MPI_PTRDIFF_T,
+		0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
+
+     off = 0;
+     for (i = 0; i < n_pes; ++i) {
+	  ptrdiff_t N = vn;
+	  for (j = 0; j < rnk; ++j)
+	       N *= all_local_ni[i * rnk + j];
+	  isend_cnt[i] = N;
+	  isend_off[i] = off;
+	  off += N;
+     }
+     iNtot = off;
+     all_local_in_alloc = 1;
+
+     istrides[rnk - 1] = vn;
+     for (j = rnk - 2; j >= 0; --j)
+	  istrides[j] = total_ni[j + 1] * istrides[j + 1];
+
+     off = 0;
+     for (i = 0; i < n_pes; ++i) {
+	  ptrdiff_t N = vn;
+	  for (j = 0; j < rnk; ++j)
+	       N *= all_local_no[i * rnk + j];
+	  orecv_cnt[i] = N;
+	  orecv_off[i] = off;
+	  off += N;
+     }
+     oNtot = off;
+     all_local_out_alloc = 1;
+
+     ostrides[rnk - 1] = vn;
+     for (j = rnk - 2; j >= 0; --j)
+	  ostrides[j] = total_no[j + 1] * ostrides[j + 1];
+}
+
+static void copy_block_out(const bench_real *in,
+			   int rnk, ptrdiff_t *n, ptrdiff_t *start, 
+			   ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
+			   bench_real *out)
+{
+     ptrdiff_t i;
+     if (rnk == 0) { 
+	  for (i = 0; i < vn; ++i)
+	       out[i] = in[i];
+     }
+     else if (rnk == 1) { /* this case is just an optimization */
+	  ptrdiff_t j;
+	  out += start[0] * os[0];
+	  for (j = 0; j < n[0]; ++j) {
+	       for (i = 0; i < vn; ++i)
+		    out[i] = in[i];
+	       in += is;
+	       out += os[0];
+	  }
+     }
+     else {
+	  /* we should do n[0] for locality, but this way is simpler to code */
+	  for (i = 0; i < n[rnk - 1]; ++i) 
+	       copy_block_out(in + i * is,
+			      rnk - 1, n, start, is * n[rnk - 1], os, vn,
+			      out + (start[rnk - 1] + i) * os[rnk - 1]);
+     }
+}
+
+static void copy_block_in(bench_real *in,
+			  int rnk, ptrdiff_t *n, ptrdiff_t *start, 
+			  ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
+			  const bench_real *out)
+{
+     ptrdiff_t i;
+     if (rnk == 0) { 
+	  for (i = 0; i < vn; ++i)
+	       in[i] = out[i];
+     }
+     else if (rnk == 1) { /* this case is just an optimization */
+	  ptrdiff_t j;
+	  out += start[0] * os[0];
+	  for (j = 0; j < n[0]; ++j) {
+	       for (i = 0; i < vn; ++i)
+		    in[i] = out[i];
+	       in += is;
+	       out += os[0];
+	  }
+     }
+     else {
+	  /* we should do n[0] for locality, but this way is simpler to code */
+	  for (i = 0; i < n[rnk - 1]; ++i) 
+	       copy_block_in(in + i * is,
+			     rnk - 1, n, start, is * n[rnk - 1], os, vn,
+			     out + (start[rnk - 1] + i) * os[rnk - 1]);
+     }
+}
+
+static void do_scatter_in(bench_real *in)
+{
+     bench_real *ali;
+     int i;
+     if (all_local_in_alloc) {
+          bench_free(all_local_in);
+	  all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real));
+	  all_local_in_alloc = 0;
+     }
+     ali = all_local_in;
+     for (i = 0; i < n_pes; ++i) {
+	  copy_block_in(ali,
+			rnk, all_local_ni + i * rnk, 
+			all_local_starti + i * rnk,
+			vn, istrides, vn,
+			in);
+	  ali += isend_cnt[i];
+     }
+     MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
+		  local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
+		  0, MPI_COMM_WORLD);
+}
+
+static void do_gather_out(bench_real *out)
+{
+     bench_real *alo;
+     int i;
+
+     if (all_local_out_alloc) {
+          bench_free(all_local_out);
+	  all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real));
+	  all_local_out_alloc = 0;
+     }
+     MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
+		 all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
+		 0, MPI_COMM_WORLD);
+     MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);
+     alo = all_local_out;
+     for (i = 0; i < n_pes; ++i) {
+	  copy_block_out(alo,
+			 rnk, all_local_no + i * rnk, 
+			 all_local_starto + i * rnk,
+			 vn, ostrides, vn,
+			 out);
+	  alo += orecv_cnt[i];
+     }
+}
+
+static void alloc_local(ptrdiff_t nreal, int inplace)
+{
+     bench_free(local_in);
+     if (local_out != local_in) bench_free(local_out);
+     local_in = local_out = 0;
+     if (nreal > 0) {
+	  ptrdiff_t i;
+	  local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
+	  if (inplace)
+	       local_out = local_in;
+	  else
+	       local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
+	  for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0;
+     }
+}
+
+void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
+{
+     UNUSED(p);
+     do_scatter_in(ri);
+     if (plan_scramble_in) FFTW(execute)(plan_scramble_in);
+}
+
+void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
+{
+     UNUSED(p);
+     if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out);
+     do_gather_out(ro);
+}
+
+void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
+{
+     UNUSED(ii);
+     after_problem_rcopy_from(p, ri);
+}
+
+void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
+{
+     UNUSED(io);
+     after_problem_rcopy_to(p, ro);
+}
+
+void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
+{
+     UNUSED(ii);
+     after_problem_rcopy_from(p, ri);
+}
+
+void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
+{
+     UNUSED(io);
+     after_problem_rcopy_to(p, ro);
+}
+
+static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny, 
+					 ptrdiff_t vn, 
+					 bench_real *in, bench_real *out)
+{
+     FFTW(iodim64) hdims[3];
+     FFTW(r2r_kind) k[3];
+     FFTW(plan) pln;
+
+     hdims[0].n = nx;
+     hdims[0].is = ny * vn;
+     hdims[0].os = vn;
+     hdims[1].n = ny;
+     hdims[1].is = vn;
+     hdims[1].os = nx * vn;
+     hdims[2].n = vn;
+     hdims[2].is = 1;
+     hdims[2].os = 1;
+     k[0] = k[1] = k[2] = FFTW_R2HC;
+     pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE);
+     BENCH_ASSERT(pln != 0);
+     return pln;
+}
+
+static int tensor_rowmajor_transposedp(bench_tensor *t)
+{
+     bench_iodim *d;
+     int i;
+
+     BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
+     if (t->rnk < 2)
+	  return 0;
+
+     d = t->dims;
+     if (d[0].is != d[1].is * d[1].n
+	 || d[0].os != d[1].is
+	 || d[1].os != d[0].os * d[0].n)
+	  return 0;
+     if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
+	  return 0;
+     for (i = 2; i + 1 < t->rnk; ++i) {
+          d = t->dims + i;
+          if (d[0].is != d[1].is * d[1].n
+	      || d[0].os != d[1].os * d[1].n)
+               return 0;
+     }
+
+     if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
+	  return 0;
+     return 1;
+}
+
+static int tensor_contiguousp(bench_tensor *t, int s)
+{
+     return (t->dims[t->rnk-1].is == s
+	     && ((tensor_rowmajorp(t) && 
+		  t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)
+		 || tensor_rowmajor_transposedp(t)));
+}
+
+static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln = 0;
+     int i; 
+     ptrdiff_t ntot;
+
+     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
+
+     if (p->sz->rnk < 1
+	 || p->split
+	 || !tensor_contiguousp(p->sz, vn)
+	 || tensor_rowmajor_transposedp(p->sz)
+	 || p->vecsz->rnk > 1
+	 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
+				    || p->vecsz->dims[0].os != 1)))
+	  return 0;
+
+     alloc_rnk(p->sz->rnk);
+     for (i = 0; i < rnk; ++i) {
+	  total_ni[i] = total_no[i] = p->sz->dims[i].n;
+	  local_ni[i] = local_no[i] = total_ni[i];
+	  local_starti[i] = local_starto[i] = 0;
+     }
+     if (rnk > 1) {
+	  ptrdiff_t n, start, nT, startT;
+	  ntot = FFTW(mpi_local_size_many_transposed)
+	       (p->sz->rnk, total_ni, vn,
+		FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+		MPI_COMM_WORLD,
+		&n, &start, &nT, &startT);
+	  if  (flags & FFTW_MPI_TRANSPOSED_IN) {
+	       local_ni[1] = nT;
+	       local_starti[1] = startT;
+	  }
+	  else {
+	       local_ni[0] = n;
+	       local_starti[0] = start;
+	  }
+	  if  (flags & FFTW_MPI_TRANSPOSED_OUT) {
+	       local_no[1] = nT;
+	       local_starto[1] = startT;
+	  }
+	  else {
+	       local_no[0] = n;
+	       local_starto[0] = start;
+	  }
+     }
+     else if (rnk == 1) {
+	  ntot = FFTW(mpi_local_size_many_1d)
+	       (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
+		local_ni, local_starti, local_no, local_starto);
+     }
+     alloc_local(ntot * 2, p->in == p->out);
+
+     pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn, 
+				   FFTW_MPI_DEFAULT_BLOCK,
+				   FFTW_MPI_DEFAULT_BLOCK,
+				   (FFTW(complex) *) local_in, 
+				   (FFTW(complex) *) local_out,
+				   MPI_COMM_WORLD, p->sign, flags);
+
+     vn *= 2;
+
+     if (rnk > 1) {
+	  ptrdiff_t nrest = 1;
+	  for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
+	  if (flags & FFTW_MPI_TRANSPOSED_IN)
+	       plan_scramble_in = mkplan_transpose_local(
+		    p->sz->dims[0].n, local_ni[1], vn * nrest,
+		    local_in, local_in);
+	  if (flags & FFTW_MPI_TRANSPOSED_OUT)
+	       plan_unscramble_out = mkplan_transpose_local(
+		    local_no[1], p->sz->dims[0].n, vn * nrest,
+		    local_out, local_out);
+     }
+     
+     return pln;
+}
+
+static int tensor_real_contiguousp(bench_tensor *t, int sign, int s)
+{
+     return (t->dims[t->rnk-1].is == s
+	     && ((tensor_real_rowmajorp(t, sign, 1) && 
+		  t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)));
+}
+
+static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln = 0;
+     int i; 
+     ptrdiff_t ntot;
+
+     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
+
+     if (p->sz->rnk < 2
+	 || p->split
+	 || !tensor_real_contiguousp(p->sz, p->sign, vn)
+	 || tensor_rowmajor_transposedp(p->sz)
+	 || p->vecsz->rnk > 1
+	 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
+				    || p->vecsz->dims[0].os != 1)))
+	  return 0;
+
+     alloc_rnk(p->sz->rnk);
+     for (i = 0; i < rnk; ++i) {
+	  total_ni[i] = total_no[i] = p->sz->dims[i].n;
+	  local_ni[i] = local_no[i] = total_ni[i];
+	  local_starti[i] = local_starto[i] = 0;
+     }
+     local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1] 
+	  = p->sz->dims[rnk-1].n / 2 + 1;
+     {
+	  ptrdiff_t n, start, nT, startT;
+	  ntot = FFTW(mpi_local_size_many_transposed)
+	       (p->sz->rnk, total_ni, vn,
+		FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+		MPI_COMM_WORLD,
+		&n, &start, &nT, &startT);
+	  if  (flags & FFTW_MPI_TRANSPOSED_IN) {
+	       local_ni[1] = nT;
+	       local_starti[1] = startT;
+	  }
+	  else {
+	       local_ni[0] = n;
+	       local_starti[0] = start;
+	  }
+	  if  (flags & FFTW_MPI_TRANSPOSED_OUT) {
+	       local_no[1] = nT;
+	       local_starto[1] = startT;
+	  }
+	  else {
+	       local_no[0] = n;
+	       local_starto[0] = start;
+	  }
+     }
+     alloc_local(ntot * 2, p->in == p->out);
+
+     total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
+     if (p->sign < 0)
+	  pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn, 
+					    FFTW_MPI_DEFAULT_BLOCK,
+					    FFTW_MPI_DEFAULT_BLOCK,
+					    local_in, 
+					    (FFTW(complex) *) local_out,
+					    MPI_COMM_WORLD, flags);
+     else
+	  pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn, 
+					    FFTW_MPI_DEFAULT_BLOCK,
+					    FFTW_MPI_DEFAULT_BLOCK,
+					    (FFTW(complex) *) local_in, 
+					    local_out,
+					    MPI_COMM_WORLD, flags);
+
+     total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1;
+     vn *= 2;
+
+     {
+	  ptrdiff_t nrest = 1;
+	  for (i = 2; i < rnk; ++i) nrest *= total_ni[i];
+	  if (flags & FFTW_MPI_TRANSPOSED_IN)
+	       plan_scramble_in = mkplan_transpose_local(
+		    total_ni[0], local_ni[1], vn * nrest,
+		    local_in, local_in);
+	  if (flags & FFTW_MPI_TRANSPOSED_OUT)
+	       plan_unscramble_out = mkplan_transpose_local(
+		    local_no[1], total_ni[0], vn * nrest,
+		    local_out, local_out);
+     }
+     
+     return pln;
+}
+
+static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags)
+{
+     ptrdiff_t ntot, nx, ny;
+     int ix=0, iy=1, i;
+     const bench_iodim *d = p->vecsz->dims;
+     FFTW(plan) pln;
+
+     if (p->vecsz->rnk == 3) {
+	  for (i = 0; i < 3; ++i)
+	       if (d[i].is == 1 && d[i].os == 1) {
+		    vn = d[i].n;
+		    ix = (i + 1) % 3;
+		    iy = (i + 2) % 3;
+		    break;
+	       }
+	  if (i == 3) return 0;
+     }
+     else {
+	  vn = 1;
+	  ix = 0;
+	  iy = 1;
+     }
+
+     if (d[ix].is == d[iy].n * vn && d[ix].os == vn
+	 && d[iy].os == d[ix].n * vn && d[iy].is == vn) {
+	  nx = d[ix].n;
+	  ny = d[iy].n;
+     }
+     else if (d[iy].is == d[ix].n * vn && d[iy].os == vn
+	      && d[ix].os == d[iy].n * vn && d[ix].is == vn) {
+	  nx = d[iy].n;
+	  ny = d[ix].n;
+     }
+     else
+	  return 0;
+
+     alloc_rnk(2);
+     ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD,
+						    &local_ni[0], 
+						    &local_starti[0],
+						    &local_no[0], 
+						    &local_starto[0]);
+     local_ni[1] = ny;
+     local_starti[1] = 0;
+     local_no[1] = nx;
+     local_starto[1] = 0;
+     total_ni[0] = nx; total_ni[1] = ny;
+     total_no[1] = nx; total_no[0] = ny;
+     alloc_local(ntot, p->in == p->out);
+
+     pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn,
+					 FFTW_MPI_DEFAULT_BLOCK,
+					 FFTW_MPI_DEFAULT_BLOCK,
+					 local_in, local_out,
+					 MPI_COMM_WORLD, flags);
+     
+     if (flags & FFTW_MPI_TRANSPOSED_IN)
+	  plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn,
+						    local_in, local_in);
+     if (flags & FFTW_MPI_TRANSPOSED_OUT)
+	  plan_unscramble_out = mkplan_transpose_local
+	       (nx, local_no[0], vn, local_out, local_out);
+     
+#if 0
+     if (pln && vn == 1) {
+	  int i, j;
+	  bench_real *ri = (bench_real *) p->in;
+	  bench_real *ro = (bench_real *) p->out;
+	  if (!ri || !ro) return pln;
+	  setup_gather_scatter();
+	  for (i = 0; i < nx * ny; ++i)
+	       ri[i] = i;
+	  after_problem_rcopy_from(p, ri);
+	  FFTW(execute)(pln);
+	  after_problem_rcopy_to(p, ro);
+	  if (my_pe == 0) {
+	       for (i = 0; i < nx; ++i) {
+		    for (j = 0; j < ny; ++j)
+			 printf("  %3g", ro[j * nx + i]);
+		    printf("\n");
+	       }
+	  }
+     }
+#endif
+
+     return pln;
+}
+
+static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln = 0;
+     int i; 
+     ptrdiff_t ntot;
+     FFTW(r2r_kind) *k;
+
+     if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1))
+	 && p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3)
+	  return mkplan_transpose(p, flags);
+
+     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
+
+     if (p->sz->rnk < 1
+	 || p->split
+	 || !tensor_contiguousp(p->sz, vn)
+	 || tensor_rowmajor_transposedp(p->sz)
+	 || p->vecsz->rnk > 1
+	 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
+				    || p->vecsz->dims[0].os != 1)))
+	  return 0;
+
+     alloc_rnk(p->sz->rnk);
+     for (i = 0; i < rnk; ++i) {
+	  total_ni[i] = total_no[i] = p->sz->dims[i].n;
+	  local_ni[i] = local_no[i] = total_ni[i];
+	  local_starti[i] = local_starto[i] = 0;
+     }
+     if (rnk > 1) {
+	  ptrdiff_t n, start, nT, startT;
+	  ntot = FFTW(mpi_local_size_many_transposed)
+	       (p->sz->rnk, total_ni, vn,
+		FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+		MPI_COMM_WORLD,
+		&n, &start, &nT, &startT);
+	  if  (flags & FFTW_MPI_TRANSPOSED_IN) {
+	       local_ni[1] = nT;
+	       local_starti[1] = startT;
+	  }
+	  else {
+	       local_ni[0] = n;
+	       local_starti[0] = start;
+	  }
+	  if  (flags & FFTW_MPI_TRANSPOSED_OUT) {
+	       local_no[1] = nT;
+	       local_starto[1] = startT;
+	  }
+	  else {
+	       local_no[0] = n;
+	       local_starto[0] = start;
+	  }
+     }
+     else if (rnk == 1) {
+	  ntot = FFTW(mpi_local_size_many_1d)
+	       (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
+		local_ni, local_starti, local_no, local_starto);
+     }
+     alloc_local(ntot, p->in == p->out);
+
+     k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk);
+     for (i = 0; i < p->sz->rnk; ++i)
+	  switch (p->k[i]) {
+	      case R2R_R2HC: k[i] = FFTW_R2HC; break;
+	      case R2R_HC2R: k[i] = FFTW_HC2R; break;
+	      case R2R_DHT: k[i] = FFTW_DHT; break;
+	      case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
+	      case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
+	      case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
+	      case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
+	      case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
+	      case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
+	      case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
+	      case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
+	      default: BENCH_ASSERT(0);
+	  }
+
+     pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn, 
+				   FFTW_MPI_DEFAULT_BLOCK,
+				   FFTW_MPI_DEFAULT_BLOCK,
+				   local_in, local_out,
+				   MPI_COMM_WORLD, k, flags);
+     bench_free(k);
+
+     if (rnk > 1) {
+	  ptrdiff_t nrest = 1;
+	  for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
+	  if (flags & FFTW_MPI_TRANSPOSED_IN)
+	       plan_scramble_in = mkplan_transpose_local(
+		    p->sz->dims[0].n, local_ni[1], vn * nrest,
+		    local_in, local_in);
+	  if (flags & FFTW_MPI_TRANSPOSED_OUT)
+	       plan_unscramble_out = mkplan_transpose_local(
+		    local_no[1], p->sz->dims[0].n, vn * nrest,
+		    local_out, local_out);
+     }
+     
+     return pln;
+}
+
+FFTW(plan) mkplan(bench_problem *p, unsigned flags)
+{
+     FFTW(plan) pln = 0;
+     FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
+     FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
+     if (p->scrambled_in) {
+	  if (p->sz->rnk == 1 && p->sz->dims[0].n != 1) 
+	       flags |= FFTW_MPI_SCRAMBLED_IN;
+	  else
+	       flags |= FFTW_MPI_TRANSPOSED_IN;
+     }
+     if (p->scrambled_out) {
+	  if (p->sz->rnk == 1 && p->sz->dims[0].n != 1) 
+	       flags |= FFTW_MPI_SCRAMBLED_OUT;
+	  else
+	       flags |= FFTW_MPI_TRANSPOSED_OUT;
+     }
+     switch (p->kind) {
+         case PROBLEM_COMPLEX: 
+	      pln =mkplan_complex(p, flags);
+	      break;
+         case PROBLEM_REAL: 
+	      pln = mkplan_real(p, flags);
+	      break;
+         case PROBLEM_R2R:
+	      pln = mkplan_r2r(p, flags);
+	      break;
+         default: BENCH_ASSERT(0);
+     }
+     if (pln) setup_gather_scatter();
+     return pln;
+}
+
+void main_init(int *argc, char ***argv)
+{
+#ifdef HAVE_SMP
+# if MPI_VERSION >= 2 /* for MPI_Init_thread */
+     int provided;
+     MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided);
+     threads_ok = provided >= MPI_THREAD_FUNNELED;
+# else
+     MPI_Init(argc, argv);
+     threads_ok = 0;
+# endif
+#else
+     MPI_Init(argc, argv);
+#endif
+     MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
+     MPI_Comm_size(MPI_COMM_WORLD, &n_pes);
+     if (my_pe != 0) verbose = -999;
+     no_speed_allocation = 1; /* so we can benchmark transforms > memory */
+     always_pad_real = 1; /* out-of-place real transforms are padded */
+     isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
+     isend_off = (int *) bench_malloc(sizeof(int) * n_pes);
+     orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
+     orecv_off = (int *) bench_malloc(sizeof(int) * n_pes);
+
+     /* init_threads must be called before any other FFTW function,
+	including mpi_init, because it has to register the threads hooks
+	before the planner is initalized */
+#ifdef HAVE_SMP
+     if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); }
+#endif
+     FFTW(mpi_init)();
+}
+
+void initial_cleanup(void)
+{
+     alloc_rnk(0);
+     alloc_local(0, 0);
+     bench_free(all_local_in); all_local_in = 0;
+     bench_free(all_local_out); all_local_out = 0;
+     bench_free(isend_off); isend_off = 0;
+     bench_free(isend_cnt); isend_cnt = 0;
+     bench_free(orecv_off); orecv_off = 0;
+     bench_free(orecv_cnt); orecv_cnt = 0;
+     FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
+     FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
+}
+
+void final_cleanup(void)
+{
+     MPI_Finalize();
+}
+
+void bench_exit(int status)
+{
+     MPI_Abort(MPI_COMM_WORLD, status);
+}
+
+double bench_cost_postprocess(double cost)
+{
+     double cost_max;
+     MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+     return cost_max;
+}
+
+
+int import_wisdom(FILE *f)
+{
+     int success = 1, sall;
+     if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f);
+     FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD);
+     MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
+     return sall;
+}
+
+void export_wisdom(FILE *f)
+{
+     FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD);
+     if (my_pe == 0) FFTW(export_wisdom_to_file)(f);
+}
--- a/fftw-3.3.10/mpi/mpi-dft.h
+++ b/fftw-3.3.10/mpi/mpi-dft.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* problem.c: */
+typedef struct {
+     problem super;
+     dtensor *sz;
+     INT vn; /* vector length (vector stride 1) */
+     R *I, *O; /* contiguous interleaved arrays */
+
+     int sign; /* FFTW_FORWARD / FFTW_BACKWARD */
+     unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
+			SCRAMBLED_IN/OUT meaningful for 1d transforms only */
+
+     MPI_Comm comm;
+} problem_mpi_dft;
+
+problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
+			      R *I, R *O, MPI_Comm comm,
+			      int sign, unsigned flags);
+problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
+			     R *I, R *O, MPI_Comm comm,
+			     int sign, unsigned flags);
+
+/* solve.c: */
+void XM(dft_solve)(const plan *ego_, const problem *p_);
+
+/* plans have same operands as rdft plans, so just re-use */
+typedef plan_rdft plan_mpi_dft;
+#define MKPLAN_MPI_DFT(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+int XM(dft_serial_applicable)(const problem_mpi_dft *p);
+
+/* various solvers */
+void XM(dft_rank_geq2_register)(planner *p);
+void XM(dft_rank_geq2_transposed_register)(planner *p);
+void XM(dft_serial_register)(planner *p);
+void XM(dft_rank1_bigvec_register)(planner *p);
+void XM(dft_rank1_register)(planner *p);
--- a/fftw-3.3.10/mpi/mpi-rdft.h
+++ b/fftw-3.3.10/mpi/mpi-rdft.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* problem.c: */
+typedef struct {
+     problem super;
+     dtensor *sz;
+     INT vn; /* vector length (vector stride 1) */
+     R *I, *O; /* contiguous interleaved arrays */
+
+     
+     unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
+			SCRAMBLED_IN/OUT meaningful for 1d transforms only */
+
+     MPI_Comm comm;
+
+#if defined(STRUCT_HACK_KR)
+     rdft_kind kind[1];
+#elif defined(STRUCT_HACK_C99)
+     rdft_kind kind[];
+#else
+     rdft_kind *kind;
+#endif
+} problem_mpi_rdft;
+
+problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
+			    R *I, R *O, MPI_Comm comm, 
+			    const rdft_kind *kind, unsigned flags);
+problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
+			      R *I, R *O, MPI_Comm comm, 
+			      const rdft_kind *kind, unsigned flags);
+
+/* solve.c: */
+void XM(rdft_solve)(const plan *ego_, const problem *p_);
+
+/* plans have same operands as rdft plans, so just re-use */
+typedef plan_rdft plan_mpi_rdft;
+#define MKPLAN_MPI_RDFT(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+int XM(rdft_serial_applicable)(const problem_mpi_rdft *p);
+
+/* various solvers */
+void XM(rdft_rank_geq2_register)(planner *p);
+void XM(rdft_rank_geq2_transposed_register)(planner *p);
+void XM(rdft_serial_register)(planner *p);
+void XM(rdft_rank1_bigvec_register)(planner *p);
--- a/fftw-3.3.10/mpi/mpi-rdft2.h
+++ b/fftw-3.3.10/mpi/mpi-rdft2.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* r2c and c2r transforms.  The sz dtensor, as usual, gives the size
+   of the "logical" complex array.  For the last dimension N, however,
+   only N/2+1 complex numbers are stored for the complex data.  Moreover,
+   for the real data, the last dimension is *always* padded to a size
+   2*(N/2+1).  (Contrast this with the serial API, where there is only
+   padding for in-place plans.) */
+
+/* problem.c: */
+typedef struct {
+     problem super;
+     dtensor *sz;
+     INT vn; /* vector length (vector stride 1) */
+     R *I, *O; /* contiguous interleaved arrays */
+
+     rdft_kind kind; /* assert(kind < DHT) */
+     unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
+			SCRAMBLED_IN/OUT meaningful for 1d transforms only */
+
+     MPI_Comm comm;
+} problem_mpi_rdft2;
+
+problem *XM(mkproblem_rdft2)(const dtensor *sz, INT vn,
+			     R *I, R *O, MPI_Comm comm,
+			     rdft_kind kind, unsigned flags);
+problem *XM(mkproblem_rdft2_d)(dtensor *sz, INT vn,
+			       R *I, R *O, MPI_Comm comm,
+			       rdft_kind kind, unsigned flags);
+
+/* solve.c: */
+void XM(rdft2_solve)(const plan *ego_, const problem *p_);
+
+/* plans have same operands as rdft plans, so just re-use */
+typedef plan_rdft plan_mpi_rdft2;
+#define MKPLAN_MPI_RDFT2(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+int XM(rdft2_serial_applicable)(const problem_mpi_rdft2 *p);
+
+/* various solvers */
+void XM(rdft2_rank_geq2_register)(planner *p);
+void XM(rdft2_rank_geq2_transposed_register)(planner *p);
+void XM(rdft2_serial_register)(planner *p);
--- a/fftw-3.3.10/mpi/mpi-transpose.h
+++ b/fftw-3.3.10/mpi/mpi-transpose.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* tproblem.c: */
+typedef struct {
+     problem super;
+     INT vn; /* vector length (vector stride 1) */
+     INT nx, ny; /* nx x ny transposed to ny x nx */
+     R *I, *O; /* contiguous real arrays (both same size!) */
+
+     unsigned flags; /* TRANSPOSED_IN: input is *locally* transposed
+			TRANSPOSED_OUT: output is *locally* transposed */
+
+     INT block, tblock; /* block size, slab decomposition;
+			   tblock is for transposed blocks on output */
+
+     MPI_Comm comm;
+} problem_mpi_transpose;
+
+problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
+				 R *I, R *O,
+				 INT block, INT tblock,
+				 MPI_Comm comm,
+				 unsigned flags);
+
+/* tsolve.c: */
+void XM(transpose_solve)(const plan *ego_, const problem *p_);
+
+/* plans have same operands as rdft plans, so just re-use */
+typedef plan_rdft plan_mpi_transpose;
+#define MKPLAN_MPI_TRANSPOSE(type, adt, apply) \
+  (type *)X(mkplan_rdft)(sizeof(type), adt, apply)
+
+/* transpose-pairwise.c: */
+int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
+			      R *I, R *O, int my_pe,
+			      plan **cld2, plan **cld2rest, plan **cld3,
+			      INT *rest_Ioff, INT *rest_Ooff);
+/* various solvers */
+void XM(transpose_pairwise_register)(planner *p);
+void XM(transpose_alltoall_register)(planner *p);
+void XM(transpose_recurse_register)(planner *p);
--- a/fftw-3.3.10/mpi/rdft-problem.c
+++ b/fftw-3.3.10/mpi/rdft-problem.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-rdft.h"
+
+static void destroy(problem *ego_)
+{
+     problem_mpi_rdft *ego = (problem_mpi_rdft *) ego_;
+     XM(dtensor_destroy)(ego->sz);
+     MPI_Comm_free(&ego->comm);
+#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
+     X(ifree0)(ego->kind);
+#endif
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     int i;
+     X(md5puts)(m, "mpi-dft");
+     X(md5int)(m, p->I == p->O);
+     /* don't include alignment -- may differ between processes
+	X(md5int)(m, X(ialignment_of)(p->I));
+	X(md5int)(m, X(ialignment_of)(p->O));
+	... note that applicability of MPI plans does not depend
+	    on alignment (although optimality may, in principle). */
+     XM(dtensor_md5)(m, p->sz);
+     X(md5INT)(m, p->vn);
+     for (i = 0; i < p->sz->rnk; ++i)
+	  X(md5int)(m, p->kind[i]);
+     X(md5int)(m, p->flags);
+     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
+     A(XM(md5_equal)(*m, p->comm));
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
+     int i;
+     p->print(p, "(mpi-rdft %d %d %d ", 
+	      ego->I == ego->O,
+	      X(ialignment_of)(ego->I),
+	      X(ialignment_of)(ego->O));
+     XM(dtensor_print)(ego->sz, p);
+     for (i = 0; i < ego->sz->rnk; ++i)
+          p->print(p, " %d", (int)ego->kind[i]);
+     p->print(p, " %D %d", ego->vn, ego->flags);
+     MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
+     R *I = ego->I;
+     INT i, N;
+     int my_pe;
+
+     MPI_Comm_rank(ego->comm, &my_pe);
+     N = ego->vn * XM(total_block)(ego->sz, IB, my_pe);
+     for (i = 0; i < N; ++i) I[i] = K(0.0);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_MPI_RDFT,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
+			    R *I, R *O,
+			    MPI_Comm comm,
+			    const rdft_kind *kind, unsigned flags)
+{
+     problem_mpi_rdft *ego;
+     int i, rnk = sz->rnk;
+     int n_pes;
+
+     A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
+     MPI_Comm_size(comm, &n_pes);
+     A(n_pes >= XM(num_blocks_total)(sz, IB)
+       && n_pes >= XM(num_blocks_total)(sz, OB));
+     A(vn >= 0);
+
+#if defined(STRUCT_HACK_KR)
+     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
+					     + sizeof(rdft_kind)
+					     * (rnk > 0 ? rnk - 1 : 0), &padt);
+#elif defined(STRUCT_HACK_C99)
+     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
+					     + sizeof(rdft_kind) * rnk, &padt);
+#else
+     ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft), &padt);
+     ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
+#endif
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     ego->sz = XM(dtensor_canonical)(sz, 0);
+     ego->vn = vn;
+     ego->I = I;
+     ego->O = O;
+     for (i = 0; i< ego->sz->rnk; ++i)
+	  ego->kind[i] = kind[i];
+
+     /* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
+        swapping the first two dimensions (for rnk > 1) */
+     if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
+	  rdft_kind k = ego->kind[0];
+	  ddim dim0 = ego->sz->dims[0];
+	  ego->sz->dims[0] = ego->sz->dims[1];
+	  ego->sz->dims[1] = dim0;
+	  ego->kind[0] = ego->kind[1];
+	  ego->kind[1] = k;
+	  flags &= ~TRANSPOSED_IN;
+	  flags ^= TRANSPOSED_OUT;
+     }
+     ego->flags = flags;
+
+     MPI_Comm_dup(comm, &ego->comm);
+
+     return &(ego->super);
+}
+
+problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
+			      R *I, R *O, 
+			      MPI_Comm comm,
+			      const rdft_kind *kind, unsigned flags)
+{
+     problem *p = XM(mkproblem_rdft)(sz, vn, I, O, comm, kind, flags);
+     XM(dtensor_destroy)(sz);
+     return p;
+}
--- a/fftw-3.3.10/mpi/rdft-rank-geq2-transposed.c
+++ b/fftw-3.3.10/mpi/rdft-rank-geq2-transposed.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex RDFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is transposed both
+   in data distribution and in ordering (for the first 2 dimensions).
+
+   (Note that we don't have to handle the case where the input is
+   transposed, since this is equivalent to transposed output with the
+   first two dimensions swapped, and is automatically canonicalized as
+   such by rdft-problem.c. */
+
+#include "mpi-rdft.h"
+#include "mpi-transpose.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_rdft super;
+
+     plan *cld1, *cldt, *cld2;
+     INT roff, ioff;
+     int preserve_input;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2, *cldt;
+     
+     /* RDFT local dimensions */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I, O);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I, I);
+
+     /* global transpose */
+     cldt = (plan_rdft *) ego->cldt;
+     cldt->apply(ego->cldt, I, O);
+
+     /* RDFT final local dimension */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, O, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == TRANSPOSED_OUT
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(2, p->sz, OB)
+	     && XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1
+	     && (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
+		 || !XM(rdft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cldt, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cldt);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cldt, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft *p;
+     P *pln;
+     plan *cld1 = 0, *cldt = 0, *cld2 = 0;
+     R *I, *O, *I2;
+     tensor *sz;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(rdft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft *) p_;
+
+     I2 = I = p->I;
+     O = p->O;
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+	  I = O; 
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n;
+     sz->dims[i].is = sz->dims[i].os = p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
+     {
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_rdft_d)(sz,
+						 X(mktensor_2d)(b, is, is,
+								p->vn, 1, 1),
+						 I2, I, p->kind + 1));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     nrest *= p->vn;
+     cldt = X(mkplan_d)(plnr,
+			XM(mkproblem_transpose)(
+			     p->sz->dims[0].n, p->sz->dims[1].n, nrest,
+			     I, O,
+			     p->sz->dims[0].b[IB], p->sz->dims[1].b[OB], 
+			     p->comm, 0));
+     if (XM(any_true)(!cldt, p->comm)) goto nada;
+
+     {
+	  INT is = p->sz->dims[0].n * nrest;
+	  INT b = XM(block)(p->sz->dims[1].n, p->sz->dims[1].b[OB], my_pe);
+	  cld2 = X(mkplan_d)(plnr,
+			     X(mkproblem_rdft_1_d)(X(mktensor_1d)(
+							p->sz->dims[0].n,
+							nrest, nrest),
+						   X(mktensor_2d)(b, is, is,
+								  nrest, 1, 1),
+						   O, O, p->kind[0]));
+	  if (XM(any_true)(!cld2, p->comm)) goto nada;
+     }
+
+     pln = MKPLAN_MPI_RDFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cldt = cldt;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cldt);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft_rank_geq2_transposed_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
--- a/fftw-3.3.10/mpi/rdft-rank-geq2.c
+++ b/fftw-3.3.10/mpi/rdft-rank-geq2.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex RDFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is not transposed. */
+
+#include "mpi-rdft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_rdft super;
+
+     plan *cld1, *cld2;
+     int preserve_input;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2;
+     
+     /* RDFT local dimensions */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I, O);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I, I);
+
+     /* RDFT non-local dimension (via rdft-rank1-bigvec, usually): */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(1, p->sz, OB)
+	     && (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
+		 || !XM(rdft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft-rank-geq2%s%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0;
+     R *I, *O, *I2;
+     tensor *sz;
+     dtensor *sz2;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(rdft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft *) p_;
+
+     I2 = I = p->I;
+     O = p->O;
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+	  I = O; 
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n;
+     sz->dims[i].is = sz->dims[i].os = p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = X(tensor_sz)(sz);
+     {
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_rdft_d)(sz,
+						 X(mktensor_2d)(b, is, is,
+								p->vn, 1, 1),
+						 I2, I, p->kind + 1));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
+     sz2->dims[0] = p->sz->dims[0];
+     cld2 = X(mkplan_d)(plnr, XM(mkproblem_rdft_d)(sz2, nrest * p->vn,
+						   I, O,
+						   p->comm, p->kind,
+						   RANK1_BIGVEC_ONLY));
+     if (XM(any_true)(!cld2, p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_RDFT(P, &padt, apply);
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft_rank_geq2_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
--- a/fftw-3.3.10/mpi/rdft-rank1-bigvec.c
+++ b/fftw-3.3.10/mpi/rdft-rank1-bigvec.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex RDFTs of rank == 1 when the vector length vn is >= # processes.
+   In this case, we don't need to use a six-step type algorithm, and can
+   instead transpose the RDFT dimension with the vector dimension to 
+   make the RDFT local. */
+
+#include "mpi-rdft.h"
+#include "mpi-transpose.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+     rearrangement rearrange;
+} S;
+
+typedef struct {
+     plan_mpi_rdft super;
+
+     plan *cldt_before, *cld, *cldt_after;
+     int preserve_input;
+     rearrangement rearrange;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld, *cldt_before, *cldt_after;
+     
+     /* global transpose */
+     cldt_before = (plan_rdft *) ego->cldt_before;
+     cldt_before->apply(ego->cldt_before, I, O);
+     
+     if (ego->preserve_input) I = O;
+	  
+     /* 1d RDFT(s) */
+     cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, O, I);
+     
+     /* global transpose */
+     cldt_after = (plan_rdft *) ego->cldt_after;
+     cldt_after->apply(ego->cldt_after, I, O);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     int n_pes;
+     MPI_Comm_size(p->comm, &n_pes);
+     return (1
+	     && p->sz->rnk == 1
+	     && !(p->flags & ~RANK1_BIGVEC_ONLY)
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+
+#if 0 /* don't need this check since no other rank-1 rdft solver */
+	     && (p->vn >= n_pes /* TODO: relax this, using more memory? */
+		 || (p->flags & RANK1_BIGVEC_ONLY))
+#endif
+
+	     && XM(rearrange_applicable)(ego->rearrange,
+					 p->sz->dims[0], p->vn, n_pes)
+
+	     && (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
+                 || !XM(rdft_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cldt_before, wakefulness);
+     X(plan_awake)(ego->cld, wakefulness);
+     X(plan_awake)(ego->cldt_after, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldt_after);
+     X(plan_destroy_internal)(ego->cld);
+     X(plan_destroy_internal)(ego->cldt_before);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     const char descrip[][16] = { "contig", "discontig", "square-after",
+				  "square-middle", "square-before" };
+     p->print(p, "(mpi-rdft-rank1-bigvec/%s%s %(%p%) %(%p%) %(%p%))",
+	      descrip[ego->rearrange], ego->preserve_input==2 ?"/p":"",
+	      ego->cldt_before, ego->cld, ego->cldt_after);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft *p;
+     P *pln;
+     plan *cld = 0, *cldt_before = 0, *cldt_after = 0;
+     R *I, *O;
+     INT yblock, yb, nx, ny, vn;
+     int my_pe, n_pes;
+     static const plan_adt padt = {
+          XM(rdft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft *) p_;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+     
+     nx = p->sz->dims[0].n;
+     if (!(ny = XM(rearrange_ny)(ego->rearrange, p->sz->dims[0],p->vn,n_pes)))
+	  return (plan *) 0;
+     vn = p->vn / ny;
+     A(ny * vn == p->vn);
+
+     yblock = XM(default_block)(ny, n_pes);
+     cldt_before = X(mkplan_d)(plnr,
+			       XM(mkproblem_transpose)(
+				    nx, ny, vn,
+				    I = p->I, O = p->O,
+				    p->sz->dims[0].b[IB], yblock,
+				    p->comm, 0));
+     if (XM(any_true)(!cldt_before, p->comm)) goto nada;	  
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) { I = O; }
+     
+     yb = XM(block)(ny, yblock, my_pe);
+     cld = X(mkplan_d)(plnr,
+		       X(mkproblem_rdft_1_d)(X(mktensor_1d)(nx, vn, vn),
+					     X(mktensor_2d)(yb, vn*nx, vn*nx,
+							    vn, 1, 1),
+					     O, I, p->kind[0]));
+     if (XM(any_true)(!cld, p->comm)) goto nada;	  
+     
+     cldt_after = X(mkplan_d)(plnr,
+			      XM(mkproblem_transpose)(
+				   ny, nx, vn,
+				   I, O,
+				   yblock, p->sz->dims[0].b[OB], 
+				   p->comm, 0));
+     if (XM(any_true)(!cldt_after, p->comm)) goto nada;	  
+
+     pln = MKPLAN_MPI_RDFT(P, &padt, apply);
+
+     pln->cldt_before = cldt_before;
+     pln->cld = cld;
+     pln->cldt_after = cldt_after;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->rearrange = ego->rearrange;
+
+     X(ops_add)(&cldt_before->ops, &cld->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt_after->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldt_after);
+     X(plan_destroy_internal)(cld);
+     X(plan_destroy_internal)(cldt_before);
+     return (plan *) 0;
+}
+
+static solver *mksolver(rearrangement rearrange, int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->rearrange = rearrange;
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft_rank1_bigvec_register)(planner *p)
+{
+     rearrangement rearrange;
+     int preserve_input;
+     FORALL_REARRANGE(rearrange)
+	  for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	       REGISTER_SOLVER(p, mksolver(rearrange, preserve_input));
+}
--- a/fftw-3.3.10/mpi/rdft-serial.c
+++ b/fftw-3.3.10/mpi/rdft-serial.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* "MPI" RDFTs where all of the data is on one processor...just
+   call through to serial API. */
+
+#include "mpi-rdft.h"
+
+typedef struct {
+     plan_mpi_rdft super;
+     plan *cld;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld = (plan_rdft *) ego->cld;
+     cld->apply(ego->cld, I, O);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft-serial %(%p%))", ego->cld);
+}
+
+int XM(rdft_serial_applicable)(const problem_mpi_rdft *p)
+{
+     return (1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
+		 || p->vn == 0));
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     P *pln;
+     plan *cld;
+     int my_pe;
+     static const plan_adt padt = {
+          XM(rdft_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     /* check whether applicable: */
+     if (!XM(rdft_serial_applicable)(p))
+          return (plan *) 0;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     if (my_pe == 0 && p->vn > 0) {
+	  int i, rnk = p->sz->rnk;
+	  tensor *sz = X(mktensor)(rnk);
+	  rdft_kind *kind 
+	       = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
+	  sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = p->vn;
+	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
+	  for (i = rnk - 1; i > 0; --i) {
+	       sz->dims[i - 1].is = sz->dims[i - 1].os = 
+		    sz->dims[i].is * sz->dims[i].n;
+	       sz->dims[i - 1].n = p->sz->dims[i - 1].n;
+	  }
+	  for (i = 0; i < rnk; ++i)
+	       kind[i] = p->kind[i];
+	  
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_rdft_d)(sz,
+						X(mktensor_1d)(p->vn, 1, 1),
+						p->I, p->O, kind));
+	  X(ifree0)(kind);
+     }
+     else { /* idle process: make nop plan */
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_rdft_0_d)(X(mktensor_1d)(0,0,0),
+						  p->I, p->O));
+     }
+     if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
+
+     pln = MKPLAN_MPI_RDFT(P, &padt, apply);
+     pln->cld = cld;
+     X(ops_cpy)(&cld->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void XM(rdft_serial_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
--- a/fftw-3.3.10/mpi/rdft-solve.c
+++ b/fftw-3.3.10/mpi/rdft-solve.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-rdft.h"
+
+/* use the apply() operation for MPI_RDFT problems */
+void XM(rdft_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_mpi_rdft *ego = (const plan_mpi_rdft *) ego_;
+     const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
--- a/fftw-3.3.10/mpi/rdft2-problem.c
+++ b/fftw-3.3.10/mpi/rdft2-problem.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-rdft2.h"
+
+static void destroy(problem *ego_)
+{
+     problem_mpi_rdft2 *ego = (problem_mpi_rdft2 *) ego_;
+     XM(dtensor_destroy)(ego->sz);
+     MPI_Comm_free(&ego->comm);
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     int i;
+     X(md5puts)(m, "mpi-rdft2");
+     X(md5int)(m, p->I == p->O);
+     /* don't include alignment -- may differ between processes
+	X(md5int)(m, X(ialignment_of)(p->I));
+	X(md5int)(m, X(ialignment_of)(p->O));
+	... note that applicability of MPI plans does not depend
+	    on alignment (although optimality may, in principle). */
+     XM(dtensor_md5)(m, p->sz);
+     X(md5INT)(m, p->vn);
+     X(md5int)(m, p->kind);
+     X(md5int)(m, p->flags);
+     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
+     A(XM(md5_equal)(*m, p->comm));
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_mpi_rdft2 *ego = (const problem_mpi_rdft2 *) ego_;
+     int i;
+     p->print(p, "(mpi-rdft2 %d %d %d ", 
+	      ego->I == ego->O,
+	      X(ialignment_of)(ego->I),
+	      X(ialignment_of)(ego->O));
+     XM(dtensor_print)(ego->sz, p);
+     p->print(p, " %D %d %d", ego->vn, (int) ego->kind, ego->flags);
+     MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_mpi_rdft2 *ego = (const problem_mpi_rdft2 *) ego_;
+     R *I = ego->I;
+     dtensor *sz;
+     INT i, N;
+     int my_pe;
+
+     sz = XM(dtensor_copy)(ego->sz);
+     sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n / 2 + 1;
+     MPI_Comm_rank(ego->comm, &my_pe);
+     N = 2 * ego->vn * XM(total_block)(sz, IB, my_pe);
+     XM(dtensor_destroy)(sz);
+     for (i = 0; i < N; ++i) I[i] = K(0.0);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_MPI_RDFT2,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *XM(mkproblem_rdft2)(const dtensor *sz, INT vn,
+			   R *I, R *O,
+			   MPI_Comm comm,
+			   rdft_kind kind,
+			   unsigned flags)
+{
+     problem_mpi_rdft2 *ego =
+          (problem_mpi_rdft2 *)X(mkproblem)(sizeof(problem_mpi_rdft2), &padt);
+     int n_pes;
+
+     A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk) && sz->rnk > 1);
+     MPI_Comm_size(comm, &n_pes);
+     A(vn >= 0);
+     A(kind == R2HC || kind == HC2R);
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     ego->sz = XM(dtensor_canonical)(sz, 0);
+#ifdef FFTW_DEBUG
+     ego->sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n / 2 + 1;
+     A(n_pes >= XM(num_blocks_total)(ego->sz, IB)
+       && n_pes >= XM(num_blocks_total)(ego->sz, OB));
+     ego->sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n;
+#endif
+
+     ego->vn = vn;
+     ego->I = I;
+     ego->O = O;
+     ego->kind = kind;
+
+     /* We only support TRANSPOSED_OUT for r2c and TRANSPOSED_IN for
+	c2r transforms. */
+
+     ego->flags = flags;
+
+     MPI_Comm_dup(comm, &ego->comm);
+
+     return &(ego->super);
+}
+
+problem *XM(mkproblem_rdft2_d)(dtensor *sz, INT vn,
+			     R *I, R *O,
+			     MPI_Comm comm,
+			     rdft_kind kind,
+			     unsigned flags)
+{
+     problem *p = XM(mkproblem_rdft2)(sz, vn, I, O, comm, kind, flags);
+     XM(dtensor_destroy)(sz);
+     return p;
+}
--- a/fftw-3.3.10/mpi/rdft2-rank-geq2-transposed.c
+++ b/fftw-3.3.10/mpi/rdft2-rank-geq2-transposed.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Real-input (r2c) DFTs of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is transposed both
+   in data distribution and in ordering (for the first 2 dimensions).
+
+   Conversely, real-output (c2r) DFTs where the input is transposed.
+
+   We don't currently support transposed-input r2c or transposed-output
+   c2r transforms. */
+
+#include "mpi-rdft2.h"
+#include "mpi-transpose.h"
+#include "rdft/rdft.h"
+#include "dft/dft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_rdft2 super;
+
+     plan *cld1, *cldt, *cld2;
+     INT vn;
+     int preserve_input;
+} P;
+
+static void apply_r2c(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld1;
+     plan_dft *cld2;
+     plan_rdft *cldt;
+     
+     /* RDFT2 local dimensions */
+     cld1 = (plan_rdft2 *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I, I+ego->vn, O, O+1);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I, I+ego->vn, I, I+1);
+
+     /* global transpose */
+     cldt = (plan_rdft *) ego->cldt;
+     cldt->apply(ego->cldt, I, O);
+
+     /* DFT final local dimension */
+     cld2 = (plan_dft *) ego->cld2;
+     cld2->apply(ego->cld2, O, O+1, O, O+1);
+}
+
+static void apply_c2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld1;
+     plan_dft *cld2;
+     plan_rdft *cldt;
+     
+     /* IDFT local dimensions */
+     cld2 = (plan_dft *) ego->cld2;
+     if (ego->preserve_input) {
+	  cld2->apply(ego->cld2, I+1, I, O+1, O);
+	  I = O;
+     }
+     else
+	  cld2->apply(ego->cld2, I+1, I, I+1, I);
+
+     /* global transpose */
+     cldt = (plan_rdft *) ego->cldt;
+     cldt->apply(ego->cldt, I, O);
+
+     /* RDFT2 final local dimension */
+     cld1 = (plan_rdft2 *) ego->cld1;
+     cld1->apply(ego->cld1, O, O+ego->vn, O, O+1);
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && ((p->flags == TRANSPOSED_OUT && p->kind == R2HC
+		  && XM(is_local_after)(1, p->sz, IB)
+		  && XM(is_local_after)(2, p->sz, OB)
+		  && XM(num_blocks)(p->sz->dims[0].n, 
+				    p->sz->dims[0].b[OB]) == 1)
+		 || 
+		 (p->flags == TRANSPOSED_IN && p->kind == HC2R
+		  && XM(is_local_after)(1, p->sz, OB)
+		  && XM(is_local_after)(2, p->sz, IB)
+		  && XM(num_blocks)(p->sz->dims[0].n, 
+				    p->sz->dims[0].b[IB]) == 1))
+	     && (!NO_SLOWP(plnr) /* slow if rdft2-serial is applicable */
+		 || !XM(rdft2_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cldt, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cldt);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft2-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cldt, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft2 *p;
+     P *pln;
+     plan *cld1 = 0, *cldt = 0, *cld2 = 0;
+     R *r0, *r1, *cr, *ci, *ri, *ii, *ro, *io, *I, *O;
+     tensor *sz;
+     int i, my_pe, n_pes;
+     INT nrest, n1, b1;
+     static const plan_adt padt = {
+          XM(rdft2_solve), awake, print, destroy
+     };
+     block_kind k1, k2;
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft2 *) p_;
+
+     I = p->I; O = p->O;
+     if (p->kind == R2HC) {
+	  k1 = IB; k2 = OB;
+          r1 = (r0 = I) + p->vn;
+	  if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
+	       ci = (cr = O) + 1;
+	       I = O; 
+	  }
+	  else 
+	       ci = (cr = I) + 1;
+	  io = ii = (ro = ri = O) + 1;
+     }
+     else {
+	  k1 = OB; k2 = IB;
+	  r1 = (r0 = O) + p->vn;
+	  ci = (cr = O) + 1;
+	  if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
+	       ri = (ii = I) + 1;
+	       ro = (io = O) + 1;
+	       I = O;
+	  }
+	  else
+	       ro = ri = (io = ii = I) + 1;
+     }
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].n = p->sz->dims[i+1].n / 2 + 1;
+     sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
+     {
+	  INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[k1], my_pe);
+	  sz->dims[p->sz->rnk - 2].n = p->sz->dims[p->sz->rnk - 1].n;
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_rdft2_d)(sz,
+						  X(mktensor_2d)(b, is, is,
+								p->vn,ivs,ovs),
+						  r0, r1, cr, ci, p->kind));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     nrest *= p->vn;
+     n1 = p->sz->dims[1].n;
+     b1 = p->sz->dims[1].b[k2];
+     if (p->sz->rnk == 2) { /* n1 dimension is cut in ~half */
+	  n1 = n1 / 2 + 1;
+	  b1 = b1 == p->sz->dims[1].n ? n1 : b1;
+     }
+
+     if (p->kind == R2HC)
+	  cldt = X(mkplan_d)(plnr,
+			     XM(mkproblem_transpose)(
+				  p->sz->dims[0].n, n1, nrest * 2,
+				  I, O,
+				  p->sz->dims[0].b[IB], b1,
+				  p->comm, 0));
+     else
+	  cldt = X(mkplan_d)(plnr,
+			     XM(mkproblem_transpose)(
+				  n1, p->sz->dims[0].n, nrest * 2,
+				  I, O,
+				  b1, p->sz->dims[0].b[OB], 
+				  p->comm, 0));
+     if (XM(any_true)(!cldt, p->comm)) goto nada;
+
+     {
+	  INT is = p->sz->dims[0].n * nrest * 2;
+	  INT b = XM(block)(n1, b1, my_pe);
+	  cld2 = X(mkplan_d)(plnr,
+			     X(mkproblem_dft_d)(X(mktensor_1d)(
+						     p->sz->dims[0].n,
+						     nrest * 2, nrest * 2),
+						X(mktensor_2d)(b, is, is,
+							       nrest, 2, 2),
+						ri, ii, ro, io));
+	  if (XM(any_true)(!cld2, p->comm)) goto nada;
+     }
+
+     pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
+     pln->cld1 = cld1;
+     pln->cldt = cldt;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->vn = p->vn;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+     X(ops_add2)(&cldt->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cldt);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft2_rank_geq2_transposed_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
--- a/fftw-3.3.10/mpi/rdft2-rank-geq2.c
+++ b/fftw-3.3.10/mpi/rdft2-rank-geq2.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Complex RDFT2s of rank >= 2, for the case where we are distributed
+   across the first dimension only, and the output is not transposed. */
+
+#include "mpi-dft.h"
+#include "mpi-rdft2.h"
+#include "rdft/rdft.h"
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_rdft2 super;
+
+     plan *cld1, *cld2;
+     INT vn;
+     int preserve_input;
+} P;
+
+static void apply_r2c(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld1;
+     plan_rdft *cld2;
+     
+     /* RDFT2 local dimensions */
+     cld1 = (plan_rdft2 *) ego->cld1;
+     if (ego->preserve_input) {
+	  cld1->apply(ego->cld1, I, I+ego->vn, O, O+1);
+	  I = O;
+     }
+     else
+	  cld1->apply(ego->cld1, I, I+ego->vn, I, I+1);
+
+     /* DFT non-local dimension (via dft-rank1-bigvec, usually): */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+}
+
+static void apply_c2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld1;
+     plan_rdft *cld2;
+     
+     /* DFT non-local dimension (via dft-rank1-bigvec, usually): */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+
+     /* RDFT2 local dimensions */
+     cld1 = (plan_rdft2 *) ego->cld1;
+     cld1->apply(ego->cld1, O, O+ego->vn, O, O+1);
+
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     return (1
+	     && p->sz->rnk > 1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O
+					  && p->kind == R2HC))
+	     && XM(is_local_after)(1, p->sz, IB)
+	     && XM(is_local_after)(1, p->sz, OB)
+	     && (!NO_SLOWP(plnr) /* slow if rdft2-serial is applicable */
+		 || !XM(rdft2_serial_applicable)(p))
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft2-rank-geq2%s%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_rdft2 *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0;
+     R *r0, *r1, *cr, *ci, *I, *O;
+     tensor *sz;
+     dtensor *sz2;
+     int i, my_pe, n_pes;
+     INT nrest;
+     static const plan_adt padt = {
+          XM(rdft2_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_rdft2 *) p_;
+
+     I = p->I; O = p->O;
+     if (p->kind == R2HC) {
+          r1 = (r0 = p->I) + p->vn;
+	  if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
+	       ci = (cr = p->O) + 1;
+	       I = O; 
+	  }
+	  else 
+	       ci = (cr = p->I) + 1;
+     }
+     else {
+          r1 = (r0 = p->O) + p->vn;
+          ci = (cr = p->O) + 1;
+     }
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
+     i = p->sz->rnk - 2; A(i >= 0);
+     sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
+     sz->dims[i].n = p->sz->dims[i+1].n / 2 + 1;
+     for (--i; i >= 0; --i) {
+	  sz->dims[i].n = p->sz->dims[i+1].n;
+	  sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
+     }
+     nrest = X(tensor_sz)(sz);
+     {
+	  INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
+          INT is = sz->dims[0].n * sz->dims[0].is;
+          INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
+	  sz->dims[p->sz->rnk - 2].n = p->sz->dims[p->sz->rnk - 1].n;
+	  cld1 = X(mkplan_d)(plnr,
+                             X(mkproblem_rdft2_d)(sz,
+						  X(mktensor_2d)(b, is, is,
+							        p->vn,ivs,ovs),
+						  r0, r1, cr, ci, p->kind));
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
+     sz2->dims[0] = p->sz->dims[0];
+     cld2 = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz2, nrest * p->vn,
+						  I, O, p->comm, 
+						  p->kind == R2HC ?
+						  FFT_SIGN : -FFT_SIGN,
+						  RANK1_BIGVEC_ONLY));
+     if (XM(any_true)(!cld2, p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->vn = p->vn;
+
+     X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(rdft2_rank_geq2_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
--- a/fftw-3.3.10/mpi/rdft2-serial.c
+++ b/fftw-3.3.10/mpi/rdft2-serial.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* "MPI" DFTs where all of the data is on one processor...just
+   call through to serial API. */
+
+#include "mpi-rdft2.h"
+#include "rdft/rdft.h"
+
+typedef struct {
+     plan_mpi_rdft2 super;
+     plan *cld;
+     INT vn;
+} P;
+
+static void apply_r2c(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld;
+     cld = (plan_rdft2 *) ego->cld;
+     cld->apply(ego->cld, I, I+ego->vn, O, O+1);
+}
+
+static void apply_c2r(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft2 *cld;
+     cld = (plan_rdft2 *) ego->cld;
+     cld->apply(ego->cld, O, O+ego->vn, I, I+1);
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cld);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-rdft2-serial %(%p%))", ego->cld);
+}
+
+int XM(rdft2_serial_applicable)(const problem_mpi_rdft2 *p)
+{
+     return (1
+	     && p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
+	     && ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
+		 || p->vn == 0));
+}
+
+static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
+{
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     P *pln;
+     plan *cld;
+     int my_pe;
+     R *r0, *r1, *cr, *ci;
+     static const plan_adt padt = {
+          XM(rdft2_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     /* check whether applicable: */
+     if (!XM(rdft2_serial_applicable)(p))
+          return (plan *) 0;
+
+     if (p->kind == R2HC) {
+	  r1 = (r0 = p->I) + p->vn;
+	  ci = (cr = p->O) + 1;
+     }
+     else {
+	  r1 = (r0 = p->O) + p->vn;
+	  ci = (cr = p->I) + 1;
+     }
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     if (my_pe == 0 && p->vn > 0) {
+	  INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
+	  int i, rnk = p->sz->rnk;
+	  tensor *sz = X(mktensor)(p->sz->rnk);
+	  sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
+	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n / 2 + 1;
+	  for (i = rnk - 1; i > 0; --i) {
+	       sz->dims[i - 1].is = sz->dims[i - 1].os = 
+		    sz->dims[i].is * sz->dims[i].n;
+	       sz->dims[i - 1].n = p->sz->dims[i - 1].n;
+	  }
+	  sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
+
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_rdft2_d)(sz,
+						 X(mktensor_1d)(p->vn,ivs,ovs),
+						 r0, r1, cr, ci, p->kind));
+     }
+     else { /* idle process: make nop plan */
+	  cld = X(mkplan_d)(plnr,
+			    X(mkproblem_rdft2_d)(X(mktensor_0d)(),
+						 X(mktensor_1d)(0,0,0),
+						 cr, ci, cr, ci, HC2R));
+     }
+     if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
+
+     pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
+     pln->cld = cld;
+     pln->vn = p->vn;
+     X(ops_cpy)(&cld->ops, &pln->super.super.ops);
+     return &(pln->super.super);
+}
+
+static solver *mksolver(void)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
+     return MKSOLVER(solver, &sadt);
+}
+
+void XM(rdft2_serial_register)(planner *p)
+{
+     REGISTER_SOLVER(p, mksolver());
+}
--- a/fftw-3.3.10/mpi/rdft2-solve.c
+++ b/fftw-3.3.10/mpi/rdft2-solve.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-rdft2.h"
+
+/* use the apply() operation for MPI_RDFT2 problems */
+void XM(rdft2_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_mpi_rdft2 *ego = (const plan_mpi_rdft2 *) ego_;
+     const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
--- a/fftw-3.3.10/mpi/rearrange.c
+++ b/fftw-3.3.10/mpi/rearrange.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "ifftw-mpi.h"
+
+/* common functions for rearrangements of the data for the *-rank1-bigvec
+   solvers */
+
+static int div_mult(INT b, INT a) { 
+     return (a > b && a % b == 0);
+}
+static int div_mult2(INT b, INT a, INT n) { 
+     return (div_mult(b, a) && div_mult(n, b));
+}
+
+int XM(rearrange_applicable)(rearrangement rearrange, 
+			     ddim dim0, INT vn, int n_pes)
+{
+     /* note: it is important that cases other than CONTIG be
+	applicable only when the resulting transpose dimension
+	is divisible by n_pes; otherwise, the allocation size
+	returned by the API will be incorrect */
+     return ((rearrange != DISCONTIG || div_mult(n_pes, vn))
+	     && (rearrange != SQUARE_BEFORE 
+		 || div_mult2(dim0.b[IB], vn, n_pes))
+	     && (rearrange != SQUARE_AFTER
+		 || (dim0.b[IB] != dim0.b[OB]
+		     && div_mult2(dim0.b[OB], vn, n_pes)))
+	     && (rearrange != SQUARE_MIDDLE
+		 || div_mult(dim0.n * n_pes, vn)));
+}
+
+INT XM(rearrange_ny)(rearrangement rearrange, ddim dim0, INT vn, int n_pes)
+{
+     switch (rearrange) {
+	 case CONTIG:
+	      return vn;
+	 case DISCONTIG:
+	      return n_pes;
+	 case SQUARE_BEFORE:
+	      return dim0.b[IB];
+	 case SQUARE_AFTER:
+	      return dim0.b[OB];
+	 case SQUARE_MIDDLE:
+	      return dim0.n * n_pes;
+     }
+     return 0;
+}
--- a/fftw-3.3.10/mpi/testsched.c
+++ b/fftw-3.3.10/mpi/testsched.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/**********************************************************************/
+/* This is a modified and combined version of the sched.c and
+   test_sched.c files shipped with FFTW 2, written to implement and
+   test various all-to-all communications scheduling patterns.
+
+   It is not used in FFTW 3, but I keep it around in case we ever want
+   to play with this again or to change algorithms.  In particular, I
+   used it to implement and test the fill1_comm_sched routine in
+   transpose-pairwise.c, which allows us to create a schedule for one
+   process at a time and is much more compact than the FFTW 2 code.
+
+   Note that the scheduling algorithm is somewhat modified from that
+   of FFTW 2.  Originally, I thought that one "stall" in the schedule
+   was unavoidable for odd numbers of processes, since this is the
+   case for the soccer-timetabling problem.  However, because of the
+   self-communication step, we can use the self-communication to fill
+   in the stalls.  (Thanks to Ralf Wildenhues for pointing this out.)
+   This greatly simplifies the process re-sorting algorithm. */
+
+/**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* This file contains routines to compute communications schedules for
+   all-to-all communications (complete exchanges) that are performed
+   in-place.  (That is, the block that processor x sends to processor
+   y gets replaced on processor x by a block received from processor y.)
+
+   A schedule, int **sched, is a two-dimensional array where
+   sched[pe][i] is the processor that pe expects to exchange a message
+   with on the i-th step of the exchange.  sched[pe][i] == -1 for the
+   i after the last exchange scheduled on pe.
+
+   Here, processors (pe's, for processing elements), are numbered from
+   0 to npes-1.
+
+   There are a couple of constraints that a schedule should satisfy
+   (besides the obvious one that every processor has to communicate
+   with every other processor exactly once).
+   
+   * First, and most importantly, there must be no deadlocks.
+   
+   * Second, we would like to overlap communications as much as possible,
+   so that all exchanges occur in parallel.  It turns out that perfect
+   overlap is possible for all number of processes (npes).
+
+   It turns out that this scheduling problem is actually well-studied,
+   and good solutions are known.  The problem is known as a
+   "time-tabling" problem, and is specifically the problem of
+   scheduling a sports competition (where n teams must compete exactly
+   once with every other team).  The problem is discussed and
+   algorithms are presented in:
+
+   [1] J. A. M. Schreuder, "Constructing Timetables for Sport
+   Competitions," Mathematical Programming Study 13, pp. 58-67 (1980).
+
+   [2] A. Schaerf, "Scheduling Sport Tournaments using Constraint
+   Logic Programming," Proc. of 12th Europ. Conf. on
+   Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996).
+   http://hermes.dis.uniromal.it/~aschaerf/publications.html
+
+   (These people actually impose a lot of additional constraints that
+   we don't care about, so they are solving harder problems. [1] gives
+   a simple enough algorithm for our purposes, though.)
+   
+   In the timetabling problem, N teams can all play one another in N-1
+   steps if N is even, and N steps if N is odd.  Here, however,
+   there is a "self-communication" step (a team must also "play itself")
+   and so we can always make an optimal N-step schedule regardless of N.
+
+   However, we have to do more: for a particular processor, the
+   communications schedule must be sorted in ascending or descending
+   order of processor index.  (This is necessary so that the data
+   coming in for the transpose does not overwrite data that will be
+   sent later; for that processor the incoming and outgoing blocks are
+   of different non-zero sizes.)  Fortunately, because the schedule
+   is stall free, each parallel step of the schedule is independent
+   of every other step, and we can reorder the steps arbitrarily
+   to achieve any desired order on a particular process.
+*/
+
+void free_comm_schedule(int **sched, int npes)
+{
+     if (sched) {
+	  int i;
+
+	  for (i = 0; i < npes; ++i)
+	       free(sched[i]);
+	  free(sched);
+     }
+}
+
+void empty_comm_schedule(int **sched, int npes)
+{
+     int i;
+     for (i = 0; i < npes; ++i)
+	  sched[i][0] = -1;
+}
+
+extern void fill_comm_schedule(int **sched, int npes);
+
+/* Create a new communications schedule for a given number of processors.
+   The schedule is initialized to a deadlock-free, maximum overlap
+   schedule.  Returns NULL on an error (may print a message to
+   stderr if there is a program bug detected).  */
+int **make_comm_schedule(int npes)
+{
+     int **sched;
+     int i;
+
+     sched = (int **) malloc(sizeof(int *) * npes);
+     if (!sched)
+	  return NULL;
+
+     for (i = 0; i < npes; ++i)
+	  sched[i] = NULL;
+
+     for (i = 0; i < npes; ++i) {
+	  sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1));
+	  if (!sched[i]) {
+	       free_comm_schedule(sched,npes);
+	       return NULL;
+	  }
+     }
+     
+     empty_comm_schedule(sched,npes);
+     fill_comm_schedule(sched,npes);
+
+     if (!check_comm_schedule(sched,npes)) {
+	  free_comm_schedule(sched,npes);
+	  return NULL;
+     }
+
+     return sched;
+}
+
+static void add_dest_to_comm_schedule(int **sched, int pe, int dest)
+{
+     int i;
+     
+     for (i = 0; sched[pe][i] != -1; ++i)
+	  ;
+
+     sched[pe][i] = dest;
+     sched[pe][i+1] = -1;
+}
+
+static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2)
+{
+     add_dest_to_comm_schedule(sched, pe1, pe2);
+     if (pe1 != pe2)
+	  add_dest_to_comm_schedule(sched, pe2, pe1);
+}
+
+/* Simplification of algorithm presented in [1] (we have fewer
+   constraints).  Produces a perfect schedule (npes steps).  */
+
+void fill_comm_schedule(int **sched, int npes)
+{
+     int pe, i, n;
+
+     if (npes % 2 == 0) {
+	  n = npes;
+	  for (pe = 0; pe < npes; ++pe)
+	       add_pair_to_comm_schedule(sched,pe,pe);
+     }
+     else
+	  n = npes + 1;
+
+     for (pe = 0; pe < n - 1; ++pe) {
+	  add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe);
+	  
+	  for (i = 1; i < n/2; ++i) {
+	       int pe_a, pe_b;
+
+	       pe_a = pe - i;
+	       if (pe_a < 0)
+		    pe_a += n - 1;
+
+	       pe_b = (pe + i) % (n - 1);
+
+	       add_pair_to_comm_schedule(sched,pe_a,pe_b);
+	  }
+     }
+}
+
+/* given an array sched[npes], fills it with the communications
+   schedule for process pe. */
+void fill1_comm_sched(int *sched, int which_pe, int npes)
+{
+     int pe, i, n, s = 0;
+     if (npes % 2 == 0) {
+	  n = npes;
+	  sched[s++] = which_pe;
+     }
+     else
+	  n = npes + 1;
+     for (pe = 0; pe < n - 1; ++pe) {
+	  if (npes % 2 == 0) {
+	       if (pe == which_pe) sched[s++] = npes - 1;
+	       else if (npes - 1 == which_pe) sched[s++] = pe;
+	  }
+	  else if (pe == which_pe) sched[s++] = pe;
+
+	  if (pe != which_pe && which_pe < n - 1) {
+	       i = (pe - which_pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe + i) % (n - 1);
+	       
+	       i = (which_pe - pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe - i + (n - 1)) % (n - 1);
+	  }
+     }
+     if (s != npes) {
+	  fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n", 
+		  s, which_pe, npes);
+	  exit(EXIT_FAILURE);
+     }
+}
+
+/* sort the communication schedule sched for npes so that the schedule
+   on process sortpe is ascending or descending (!ascending). */
+static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
+{
+     int *sortsched, i;
+     sortsched = (int *) malloc(npes * sizeof(int) * 2);
+     fill1_comm_sched(sortsched, sortpe, npes);
+     if (ascending)
+          for (i = 0; i < npes; ++i)
+               sortsched[npes + sortsched[i]] = sched[i];
+     else
+          for (i = 0; i < npes; ++i)
+               sortsched[2*npes - 1 - sortsched[i]] = sched[i];
+     for (i = 0; i < npes; ++i)
+          sched[i] = sortsched[npes + i];
+     free(sortsched);
+}
+
+/* Below, we have various checks in case of bugs: */
+
+/* check for deadlocks by simulating the schedule and looking for
+   cycles in the dependency list; returns 0 if there are deadlocks
+   (or other errors) */
+static int check_schedule_deadlock(int **sched, int npes)
+{
+     int *step, *depend, *visited, pe, pe2, period, done = 0;
+     int counter = 0;
+
+     /* step[pe] is the step in the schedule that a given pe is on */
+     step = (int *) malloc(sizeof(int) * npes);
+
+     /* depend[pe] is the pe' that pe is currently waiting for a message
+	from (-1 if none) */
+     depend = (int *) malloc(sizeof(int) * npes);
+
+     /* visited[pe] tells whether we have visited the current pe already
+	when we are looking for cycles. */
+     visited = (int *) malloc(sizeof(int) * npes);
+
+     if (!step || !depend || !visited) {
+	  free(step); free(depend); free(visited);
+	  return 0;
+     }
+
+     for (pe = 0; pe < npes; ++pe)
+	  step[pe] = 0;
+
+     while (!done) {
+	  ++counter;
+
+	  for (pe = 0; pe < npes; ++pe)
+	       depend[pe] = sched[pe][step[pe]];
+	  
+	  /* now look for cycles in the dependencies with period > 2: */
+	  for (pe = 0; pe < npes; ++pe)
+	       if (depend[pe] != -1) {
+		    for (pe2 = 0; pe2 < npes; ++pe2)
+			 visited[pe2] = 0;
+
+		    period = 0;
+		    pe2 = pe;
+		    do {
+			 visited[pe2] = period + 1;
+			 pe2 = depend[pe2];
+			 period++;
+		    } while (pe2 != -1 && !visited[pe2]);
+
+		    if (pe2 == -1) {
+			 fprintf(stderr,
+				 "BUG: unterminated cycle in schedule!\n");
+			 free(step); free(depend);
+			 free(visited);
+			 return 0;
+		    }
+		    if (period - (visited[pe2] - 1) > 2) {
+			 fprintf(stderr,"BUG: deadlock in schedule!\n");
+			 free(step); free(depend);
+			 free(visited);
+			 return 0;
+		    }
+
+		    if (pe2 == pe)
+			 step[pe]++;
+	       }
+
+	  done = 1;
+	  for (pe = 0; pe < npes; ++pe)
+	       if (sched[pe][step[pe]] != -1) {
+		    done = 0;
+		    break;
+	       }
+     }
+
+     free(step); free(depend); free(visited);
+     return (counter > 0 ? counter : 1);
+}
+
+/* sanity checks; prints message and returns 0 on failure.
+   undocumented feature: the return value on success is actually the
+   number of steps required for the schedule to complete, counting
+   stalls. */
+int check_comm_schedule(int **sched, int npes)
+{
+     int pe, i, comm_pe;
+     
+     for (pe = 0; pe < npes; ++pe) {
+	  for (comm_pe = 0; comm_pe < npes; ++comm_pe) {
+	       for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i)
+		    ;
+	       if (sched[pe][i] == -1) {
+		    fprintf(stderr,"BUG: schedule never sends message from "
+			    "%d to %d.\n",pe,comm_pe);
+		    return 0;  /* never send message to comm_pe */
+	       }
+	  }
+	  for (i = 0; sched[pe][i] != -1; ++i)
+	       ;
+	  if (i != npes) {
+	       fprintf(stderr,"BUG: schedule sends too many messages from "
+		       "%d\n",pe);
+	       return 0;
+	  }
+     }
+     return check_schedule_deadlock(sched,npes);
+}
+
+/* invert the order of all the schedules; this has no effect on
+   its required properties. */
+void invert_comm_schedule(int **sched, int npes)
+{
+     int pe, i;
+
+     for (pe = 0; pe < npes; ++pe)
+	  for (i = 0; i < npes/2; ++i) {
+	       int dummy = sched[pe][i];
+	       sched[pe][i] = sched[pe][npes-1-i];
+	       sched[pe][npes-1-i] = dummy;
+	  }
+}
+
+/* Sort the schedule for sort_pe in ascending order of processor
+   index.  Unfortunately, for odd npes (when schedule has a stall
+   to begin with) this will introduce an extra stall due to
+   the motion of the self-communication past a stall.  We could
+   fix this if it were really important.  Actually, we don't
+   get an extra stall when sort_pe == 0 or npes-1, which is sufficient
+   for our purposes. */
+void sort_comm_schedule(int **sched, int npes, int sort_pe)
+{
+     int i,j,pe;
+
+     /* Note that we can do this sort in O(npes) swaps because we know
+	that the numbers we are sorting are just 0...npes-1.   But we'll
+	just do a bubble sort for simplicity here. */
+
+     for (i = 0; i < npes - 1; ++i)
+	  for (j = i + 1; j < npes; ++j)
+	       if (sched[sort_pe][i] > sched[sort_pe][j]) {
+		    for (pe = 0; pe < npes; ++pe) {
+			 int s = sched[pe][i];
+			 sched[pe][i] = sched[pe][j];
+			 sched[pe][j] = s;
+		    }
+	       }
+}
+
+/* print the schedule (for debugging purposes) */
+void print_comm_schedule(int **sched, int npes)
+{
+     int pe, i, width;
+
+     if (npes < 10)
+	  width = 1;
+     else if (npes < 100)
+	  width = 2;
+     else
+	  width = 3;
+
+     for (pe = 0; pe < npes; ++pe) {
+	  printf("pe %*d schedule:", width, pe);
+	  for (i = 0; sched[pe][i] != -1; ++i)
+	       printf("  %*d",width,sched[pe][i]);
+	  printf("\n");
+     }
+}
+
+int main(int argc, char **argv)
+{
+     int **sched;
+     int npes = -1, sortpe = -1, steps, i;
+
+     if (argc >= 2) {
+	  npes = atoi(argv[1]);
+	  if (npes <= 0) {
+	       fprintf(stderr,"npes must be positive!");
+	       return 1;
+	  }
+     }
+     if (argc >= 3) {
+	  sortpe = atoi(argv[2]);
+	  if (sortpe < 0 || sortpe >= npes) {
+	       fprintf(stderr,"sortpe must be between 0 and npes-1.\n");
+	       return 1;
+	  }
+     }
+
+     if (npes != -1) {
+	  printf("Computing schedule for npes = %d:\n",npes);
+	  sched = make_comm_schedule(npes);
+	  if (!sched) {
+	       fprintf(stderr,"Out of memory!");
+	       return 6;
+	  }
+	  
+	  if (steps = check_comm_schedule(sched,npes))
+	       printf("schedule OK (takes %d steps to complete).\n", steps);
+	  else
+	       printf("schedule not OK.\n");
+
+	  print_comm_schedule(sched, npes);
+	  
+	  if (sortpe != -1) {
+	       printf("\nRe-creating schedule for pe = %d...\n", sortpe);
+	       int *sched1 = (int*) malloc(sizeof(int) * npes);
+	       for (i = 0; i < npes; ++i) sched1[i] = -1;
+	       fill1_comm_sched(sched1, sortpe, npes);
+	       printf("  =");
+	       for (i = 0; i < npes; ++i) 
+		    printf("  %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3),
+			   sched1[i]);
+	       printf("\n");
+
+	       printf("\nSorting schedule for sortpe = %d...\n", sortpe);
+	       sort_comm_schedule(sched,npes,sortpe);
+	       
+	       if (steps = check_comm_schedule(sched,npes))
+		    printf("schedule OK (takes %d steps to complete).\n", 
+			   steps);
+	       else
+		    printf("schedule not OK.\n");
+
+	       print_comm_schedule(sched, npes);
+
+	       printf("\nInverting schedule...\n");
+	       invert_comm_schedule(sched,npes);
+	       
+	       if (steps = check_comm_schedule(sched,npes))
+		    printf("schedule OK (takes %d steps to complete).\n", 
+			   steps);
+	       else
+		    printf("schedule not OK.\n");
+
+	       print_comm_schedule(sched, npes);
+	       
+	       free_comm_schedule(sched,npes);
+
+	       free(sched1);
+	  }
+     }
+     else {
+	  printf("Doing infinite tests...\n");
+	  for (npes = 1; ; ++npes) {
+	       int *sched1 = (int*) malloc(sizeof(int) * npes);
+	       printf("npes = %d...",npes);
+	       sched = make_comm_schedule(npes);
+	       if (!sched) {
+		    fprintf(stderr,"Out of memory!\n");
+		    return 5;
+	       }
+	       for (sortpe = 0; sortpe < npes; ++sortpe) {
+		    empty_comm_schedule(sched,npes);
+		    fill_comm_schedule(sched,npes);
+		    if (!check_comm_schedule(sched,npes)) {
+			 fprintf(stderr,
+				 "\n -- fill error for sortpe = %d!\n",sortpe);
+			 return 2;
+		    }
+
+		    for (i = 0; i < npes; ++i) sched1[i] = -1;
+		    fill1_comm_sched(sched1, sortpe, npes);
+		    for (i = 0; i < npes; ++i)
+			 if (sched1[i] != sched[sortpe][i])
+			      fprintf(stderr,
+				      "\n -- fill1 error for pe = %d!\n",
+				      sortpe);
+
+		    sort_comm_schedule(sched,npes,sortpe);
+		    if (!check_comm_schedule(sched,npes)) {
+			 fprintf(stderr,
+				 "\n -- sort error for sortpe = %d!\n",sortpe);
+			 return 3;
+		    }
+		    invert_comm_schedule(sched,npes);
+		    if (!check_comm_schedule(sched,npes)) {
+			 fprintf(stderr,
+				 "\n -- invert error for sortpe = %d!\n",
+				 sortpe);
+			 return 4;
+		    }
+	       }
+	       free_comm_schedule(sched,npes);
+	       printf("OK\n");
+	       if (npes % 50 == 0)
+		    printf("(...Hit Ctrl-C to stop...)\n");
+	       free(sched1);
+	  }
+     }
+
+     return 0;
+}
--- a/fftw-3.3.10/mpi/transpose-alltoall.c
+++ b/fftw-3.3.10/mpi/transpose-alltoall.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* plans for distributed out-of-place transpose using MPI_Alltoall,
+   and which destroy the input array (unless TRANSPOSED_IN is used) */
+
+#include "mpi-transpose.h"
+#include <string.h>
+
+typedef struct {
+     solver super;
+     int copy_transposed_in; /* whether to copy the input for TRANSPOSED_IN,
+				which makes the final transpose out-of-place
+				but costs an extra copy and requires us
+				to destroy the input */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cld2, *cld2rest, *cld3;
+
+     MPI_Comm comm;
+     int *send_block_sizes, *send_block_offsets;
+     int *recv_block_sizes, *recv_block_offsets;
+
+     INT rest_Ioff, rest_Ooff;
+
+     int equal_blocks;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2, *cld2rest, *cld3;
+
+     /* transpose locally to get contiguous chunks */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (cld1) {
+	  cld1->apply(ego->cld1, I, O);
+	  
+	  /* transpose chunks globally */
+	  if (ego->equal_blocks)
+	       MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE,
+			    I, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
+			    ego->comm);
+	  else
+	       MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets,
+			     FFTW_MPI_TYPE,
+			     I, ego->recv_block_sizes, ego->recv_block_offsets,
+			     FFTW_MPI_TYPE,
+			     ego->comm);
+     }
+     else { /* TRANSPOSED_IN, no need to destroy input */
+	  /* transpose chunks globally */
+	  if (ego->equal_blocks)
+	       MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE,
+			    O, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
+			    ego->comm);
+	  else
+	       MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets,
+			     FFTW_MPI_TYPE,
+			     O, ego->recv_block_sizes, ego->recv_block_offsets,
+			     FFTW_MPI_TYPE,
+			     ego->comm);
+	  I = O; /* final transpose (if any) is in-place */
+     }
+     
+     /* transpose locally, again, to get ordinary row-major */
+     cld2 = (plan_rdft *) ego->cld2;
+     if (cld2) {
+	  cld2->apply(ego->cld2, I, O);
+	  cld2rest = (plan_rdft *) ego->cld2rest;
+	  if (cld2rest) { /* leftover from unequal block sizes */
+	       cld2rest->apply(ego->cld2rest,
+			       I + ego->rest_Ioff, O + ego->rest_Ooff);
+	       cld3 = (plan_rdft *) ego->cld3;
+	       if (cld3)
+		    cld3->apply(ego->cld3, O, O);
+	       /* else TRANSPOSED_OUT is true and user wants O transposed */
+	  }
+     }
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     return (1
+	     && p->I != p->O
+	     && (!NO_DESTROY_INPUTP(plnr) || 
+		 ((p->flags & TRANSPOSED_IN) && !ego->copy_transposed_in))
+	     && ((p->flags & TRANSPOSED_IN) || !ego->copy_transposed_in)
+	     && ONLY_TRANSPOSEDP(p->flags)
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld2rest, wakefulness);
+     X(plan_awake)(ego->cld3, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(ifree0)(ego->send_block_sizes);
+     MPI_Comm_free(&ego->comm);
+     X(plan_destroy_internal)(ego->cld3);
+     X(plan_destroy_internal)(ego->cld2rest);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-alltoall%s%(%p%)%(%p%)%(%p%)%(%p%))",
+	      ego->equal_blocks ? "/e" : "",
+	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
+     INT b, bt, vn, rest_Ioff, rest_Ooff;
+     R *I;
+     int *sbs, *sbo, *rbs, *rbo;
+     int pe, my_pe, n_pes;
+     int equal_blocks = 1;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+     vn = p->vn;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     b = XM(block)(p->nx, p->block, my_pe);
+
+     if (p->flags & TRANSPOSED_IN) { /* I is already transposed */
+	  if (ego->copy_transposed_in) {
+	       cld1 = X(mkplan_f_d)(plnr,
+				  X(mkproblem_rdft_0_d)(X(mktensor_1d)
+							(b * p->ny * vn, 1, 1),
+							I = p->I, p->O),
+				    0, 0, NO_SLOW);
+	       if (XM(any_true)(!cld1, p->comm)) goto nada;
+	  }
+	  else
+	       I = p->O; /* final transpose is in-place */
+     }
+     else { /* transpose b x ny x vn -> ny x b x vn */
+	  cld1 = X(mkplan_f_d)(plnr, 
+			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						     (b, p->ny * vn, vn,
+						      p->ny, vn, b * vn,
+						      vn, 1, 1),
+						     I = p->I, p->O),
+			       0, 0, NO_SLOW);
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+	  
+     if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, p->O, my_pe,
+						 &cld2, &cld2rest, &cld3,
+						 &rest_Ioff, &rest_Ooff),
+		      p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld2rest = cld2rest;
+     pln->rest_Ioff = rest_Ioff;
+     pln->rest_Ooff = rest_Ooff;
+     pln->cld3 = cld3;
+
+     MPI_Comm_dup(p->comm, &pln->comm);
+
+     /* Compute sizes/offsets of blocks to send for all-to-all command. */
+     sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS);
+     sbo = sbs + n_pes;
+     rbs = sbo + n_pes;
+     rbo = rbs + n_pes;
+     b = XM(block)(p->nx, p->block, my_pe);
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+     for (pe = 0; pe < n_pes; ++pe) {
+	  INT db, dbt; /* destination block sizes */
+	  db = XM(block)(p->nx, p->block, pe);
+	  dbt = XM(block)(p->ny, p->tblock, pe);
+	  if (db != p->block || dbt != p->tblock)
+	       equal_blocks = 0;
+
+	  /* MPI requires type "int" here; apparently it
+	     has no 64-bit API?  Grrr. */
+	  sbs[pe] = (int) (b * dbt * vn);
+	  sbo[pe] = (int) (pe * (b * p->tblock) * vn);
+	  rbs[pe] = (int) (db * bt * vn);
+	  rbo[pe] = (int) (pe * (p->block * bt) * vn);
+     }
+     pln->send_block_sizes = sbs;
+     pln->send_block_offsets = sbo;
+     pln->recv_block_sizes = rbs;
+     pln->recv_block_offsets = rbo;
+     pln->equal_blocks = equal_blocks;
+
+     X(ops_zero)(&pln->super.super.ops);
+     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
+     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
+     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
+     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
+     /* FIXME: should MPI operations be counted in "other" somehow? */
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld3);
+     X(plan_destroy_internal)(cld2rest);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int copy_transposed_in)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->copy_transposed_in = copy_transposed_in;
+     return &(slv->super);
+}
+
+void XM(transpose_alltoall_register)(planner *p)
+{
+     int cti;
+     for (cti = 0; cti <= 1; ++cti)
+	  REGISTER_SOLVER(p, mksolver(cti));
+}
--- a/fftw-3.3.10/mpi/transpose-pairwise.c
+++ b/fftw-3.3.10/mpi/transpose-pairwise.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Distributed transposes using a sequence of carefully scheduled
+   pairwise exchanges.  This has the advantage that it can be done
+   in-place, or out-of-place while preserving the input, using buffer
+   space proportional to the local size divided by the number of
+   processes (i.e. to the total array size divided by the number of
+   processes squared). */
+
+#include "mpi-transpose.h"
+#include <string.h>
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cld2, *cld2rest, *cld3;
+     INT rest_Ioff, rest_Ooff;
+     
+     int n_pes, my_pe, *sched;
+     INT *send_block_sizes, *send_block_offsets;
+     INT *recv_block_sizes, *recv_block_offsets;
+     MPI_Comm comm;
+     int preserve_input;
+} P;
+
+static void transpose_chunks(int *sched, int n_pes, int my_pe,
+			     INT *sbs, INT *sbo, INT *rbs, INT *rbo,
+			     MPI_Comm comm,
+			     R *I, R *O)
+{
+     if (sched) {
+	  int i;
+	  MPI_Status status;
+
+	  /* TODO: explore non-synchronous send/recv? */
+
+	  if (I == O) {
+	       R *buf = (R*) MALLOC(sizeof(R) * sbs[0], BUFFERS);
+	       
+	       for (i = 0; i < n_pes; ++i) {
+		    int pe = sched[i];
+		    if (my_pe == pe) {
+			 if (rbo[pe] != sbo[pe])
+			      memmove(O + rbo[pe], O + sbo[pe],
+				      sbs[pe] * sizeof(R));
+		    }
+		    else {
+			 memcpy(buf, O + sbo[pe], sbs[pe] * sizeof(R));
+			 MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
+				      pe, (my_pe * n_pes + pe) & 0x7fff,
+				      O + rbo[pe], (int) (rbs[pe]),
+				      FFTW_MPI_TYPE,
+				      pe, (pe * n_pes + my_pe) & 0x7fff,
+				      comm, &status);
+		    }
+	       }
+
+	       X(ifree)(buf);
+	  }
+	  else { /* I != O */
+	       for (i = 0; i < n_pes; ++i) {
+		    int pe = sched[i];
+		    if (my_pe == pe)
+			 memcpy(O + rbo[pe], I + sbo[pe], sbs[pe] * sizeof(R));
+		    else
+			 MPI_Sendrecv(I + sbo[pe], (int) (sbs[pe]),
+				      FFTW_MPI_TYPE,
+				      pe, (my_pe * n_pes + pe) & 0x7fff,
+				      O + rbo[pe], (int) (rbs[pe]),
+				      FFTW_MPI_TYPE,
+				      pe, (pe * n_pes + my_pe) & 0x7fff,
+				      comm, &status);
+	       }
+	  }
+     }
+}
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cld2, *cld2rest, *cld3;
+
+     /* transpose locally to get contiguous chunks */
+     cld1 = (plan_rdft *) ego->cld1;
+     if (cld1) {
+	  cld1->apply(ego->cld1, I, O);
+	  
+	  if (ego->preserve_input) I = O;
+
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, O, I);
+     }
+     else if (ego->preserve_input) {
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, I, O);
+
+	  I = O;
+     }
+     else {
+	  /* transpose chunks globally */
+	  transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+			   ego->send_block_sizes, ego->send_block_offsets,
+			   ego->recv_block_sizes, ego->recv_block_offsets,
+			   ego->comm, I, I);
+     }
+
+     /* transpose locally, again, to get ordinary row-major;
+	this may take two transposes if the block sizes are unequal
+	(3 subplans, two of which operate on disjoint data) */
+     cld2 = (plan_rdft *) ego->cld2;
+     cld2->apply(ego->cld2, I, O);
+     cld2rest = (plan_rdft *) ego->cld2rest;
+     if (cld2rest) {
+	  cld2rest->apply(ego->cld2rest,
+			  I + ego->rest_Ioff, O + ego->rest_Ooff);
+	  cld3 = (plan_rdft *) ego->cld3;
+	  if (cld3)
+	       cld3->apply(ego->cld3, O, O);
+	  /* else TRANSPOSED_OUT is true and user wants O transposed */
+     }
+}
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     /* Note: this is *not* UGLY for out-of-place, destroy-input plans;
+	the planner often prefers transpose-pairwise to transpose-alltoall,
+	at least with LAM MPI on my machine. */
+     return (1
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+					  && p->I != p->O))
+	     && ONLY_TRANSPOSEDP(p->flags));
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld2rest, wakefulness);
+     X(plan_awake)(ego->cld3, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(ifree0)(ego->sched);
+     X(ifree0)(ego->send_block_sizes);
+     MPI_Comm_free(&ego->comm);
+     X(plan_destroy_internal)(ego->cld3);
+     X(plan_destroy_internal)(ego->cld2rest);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-pairwise%s%(%p%)%(%p%)%(%p%)%(%p%))", 
+	      ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
+}
+
+/* Given a process which_pe and a number of processes npes, fills
+   the array sched[npes] with a sequence of processes to communicate
+   with for a deadlock-free, optimum-overlap all-to-all communication.
+   (All processes must call this routine to get their own schedules.)
+   The schedule can be re-ordered arbitrarily as long as all processes
+   apply the same permutation to their schedules.
+
+   The algorithm here is based upon the one described in:
+       J. A. M. Schreuder, "Constructing timetables for sport
+       competitions," Mathematical Programming Study 13, pp. 58-67 (1980). 
+   In a sport competition, you have N teams and want every team to
+   play every other team in as short a time as possible (maximum overlap
+   between games).  This timetabling problem is therefore identical
+   to that of an all-to-all communications problem.  In our case, there
+   is one wrinkle: as part of the schedule, the process must do
+   some data transfer with itself (local data movement), analogous
+   to a requirement that each team "play itself" in addition to other
+   teams.  With this wrinkle, it turns out that an optimal timetable
+   (N parallel games) can be constructed for any N, not just for even
+   N as in the original problem described by Schreuder.
+*/
+static void fill1_comm_sched(int *sched, int which_pe, int npes)
+{
+     int pe, i, n, s = 0;
+     A(which_pe >= 0 && which_pe < npes);
+     if (npes % 2 == 0) {
+	  n = npes;
+	  sched[s++] = which_pe;
+     }
+     else
+	  n = npes + 1;
+     for (pe = 0; pe < n - 1; ++pe) {
+	  if (npes % 2 == 0) {
+	       if (pe == which_pe) sched[s++] = npes - 1;
+	       else if (npes - 1 == which_pe) sched[s++] = pe;
+	  }
+	  else if (pe == which_pe) sched[s++] = pe;
+
+	  if (pe != which_pe && which_pe < n - 1) {
+	       i = (pe - which_pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe + i) % (n - 1);
+	       
+	       i = (which_pe - pe + (n - 1)) % (n - 1);
+	       if (i < n/2)
+		    sched[s++] = (pe - i + (n - 1)) % (n - 1);
+	  }
+     }
+     A(s == npes);
+}
+
+/* Sort the communication schedule sched for npes so that the schedule
+   on process sortpe is ascending or descending (!ascending).  This is
+   necessary to allow in-place transposes when the problem does not
+   divide equally among the processes.  In this case there is one
+   process where the incoming blocks are bigger/smaller than the
+   outgoing blocks and thus have to be received in
+   descending/ascending order, respectively, to avoid overwriting data
+   before it is sent. */
+static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
+{
+     int *sortsched, i;
+     sortsched = (int *) MALLOC(npes * sizeof(int) * 2, OTHER);
+     fill1_comm_sched(sortsched, sortpe, npes);
+     if (ascending)
+	  for (i = 0; i < npes; ++i)
+	       sortsched[npes + sortsched[i]] = sched[i];
+     else
+	  for (i = 0; i < npes; ++i)
+	       sortsched[2*npes - 1 - sortsched[i]] = sched[i];
+     for (i = 0; i < npes; ++i)
+	  sched[i] = sortsched[npes + i];
+     X(ifree)(sortsched);
+}
+
+/* make the plans to do the post-MPI transpositions (shared with
+   transpose-alltoall) */
+int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
+			      R *I, R *O, int my_pe,
+			      plan **cld2, plan **cld2rest, plan **cld3,
+			      INT *rest_Ioff, INT *rest_Ooff)
+{
+     INT vn = p->vn;
+     INT b = p->block;
+     INT bt = XM(block)(p->ny, p->tblock, my_pe);
+     INT nxb = p->nx / b; /* number of equal-sized blocks */
+     INT nxr = p->nx - nxb * b; /* leftover rows after equal blocks */
+
+     *cld2 = *cld2rest = *cld3 = NULL;
+     *rest_Ioff = *rest_Ooff = 0;
+
+     if (!(p->flags & TRANSPOSED_OUT) && (nxr == 0 || I != O)) {
+	  INT nx = p->nx * vn;
+	  b *= vn;
+	  *cld2 = X(mkplan_f_d)(plnr, 
+				X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						      (nxb, bt * b, b,
+						       bt, b, nx,
+						       b, 1, 1),
+						      I, O),
+				0, 0, NO_SLOW);
+	  if (!*cld2) goto nada;
+
+	  if (nxr > 0) {
+	       *rest_Ioff = nxb * bt * b;
+	       *rest_Ooff = nxb * b;
+	       b = nxr * vn;
+	       *cld2rest = X(mkplan_f_d)(plnr,
+					 X(mkproblem_rdft_0_d)(X(mktensor_2d)
+							       (bt, b, nx,
+								b, 1, 1),
+							       I + *rest_Ioff,
+							       O + *rest_Ooff),
+                                        0, 0, NO_SLOW);
+               if (!*cld2rest) goto nada;
+	  }
+     }
+     else {
+	  *cld2 = X(mkplan_f_d)(plnr,
+				X(mkproblem_rdft_0_d)(
+				     X(mktensor_4d)
+				     (nxb, bt * b * vn, bt * b * vn,
+				      bt, b * vn, vn,
+				      b, vn, bt * vn,
+				      vn, 1, 1),
+				     I, O),
+				0, 0, NO_SLOW);
+	  if (!*cld2) goto nada;
+
+	  *rest_Ioff = *rest_Ooff = nxb * bt * b * vn;
+	  *cld2rest = X(mkplan_f_d)(plnr,
+				    X(mkproblem_rdft_0_d)(
+					 X(mktensor_3d)
+					 (bt, nxr * vn, vn,
+					  nxr, vn, bt * vn,
+					  vn, 1, 1),
+					 I + *rest_Ioff, O + *rest_Ooff),
+				    0, 0, NO_SLOW);
+	  if (!*cld2rest) goto nada;
+
+	  if (!(p->flags & TRANSPOSED_OUT)) {
+	       *cld3 = X(mkplan_f_d)(plnr,
+				     X(mkproblem_rdft_0_d)(
+					  X(mktensor_3d)
+					  (p->nx, bt * vn, vn,
+					   bt, vn, p->nx * vn,
+					   vn, 1, 1),
+					  O, O),
+				     0, 0, NO_SLOW);
+	       if (!*cld3) goto nada;
+	  }
+     }
+
+     return 1;
+
+nada:
+     X(plan_destroy_internal)(*cld3);
+     X(plan_destroy_internal)(*cld2rest);
+     X(plan_destroy_internal)(*cld2);
+     *cld2 = *cld2rest = *cld3 = NULL;
+     return 0;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
+     INT b, bt, vn, rest_Ioff, rest_Ooff;
+     INT *sbs, *sbo, *rbs, *rbo;
+     int pe, my_pe, n_pes, sort_pe = -1, ascending = 1;
+     R *I, *O;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+     vn = p->vn;
+     I = p->I; O = p->O;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     b = XM(block)(p->nx, p->block, my_pe);
+     
+     if (!(p->flags & TRANSPOSED_IN)) { /* b x ny x vn -> ny x b x vn */
+	  cld1 = X(mkplan_f_d)(plnr, 
+			       X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						     (b, p->ny * vn, vn,
+						      p->ny, vn, b * vn,
+						      vn, 1, 1),
+						     I, O),
+			       0, 0, NO_SLOW);
+	  if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
+
+     if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, O, my_pe,
+						 &cld2, &cld2rest, &cld3,
+						 &rest_Ioff, &rest_Ooff),
+		      p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld2rest = cld2rest;
+     pln->rest_Ioff = rest_Ioff;
+     pln->rest_Ooff = rest_Ooff;
+     pln->cld3 = cld3;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     MPI_Comm_dup(p->comm, &pln->comm);
+
+     n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
+			   XM(num_blocks)(p->ny, p->tblock));
+
+     /* Compute sizes/offsets of blocks to exchange between processors */
+     sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS);
+     sbo = sbs + n_pes;
+     rbs = sbo + n_pes;
+     rbo = rbs + n_pes;
+     b = XM(block)(p->nx, p->block, my_pe);
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+     for (pe = 0; pe < n_pes; ++pe) {
+	  INT db, dbt; /* destination block sizes */
+	  db = XM(block)(p->nx, p->block, pe);
+	  dbt = XM(block)(p->ny, p->tblock, pe);
+
+	  sbs[pe] = b * dbt * vn;
+	  sbo[pe] = pe * (b * p->tblock) * vn;
+	  rbs[pe] = db * bt * vn;
+	  rbo[pe] = pe * (p->block * bt) * vn;
+
+	  if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
+	       A(sort_pe == -1); /* only one process should need sorting */
+	       sort_pe = pe;
+	       ascending = db * p->tblock > p->block * dbt;
+	  }
+     }
+     pln->n_pes = n_pes;
+     pln->my_pe = my_pe;
+     pln->send_block_sizes = sbs;
+     pln->send_block_offsets = sbo;
+     pln->recv_block_sizes = rbs;
+     pln->recv_block_offsets = rbo;
+
+     if (my_pe >= n_pes) {
+	  pln->sched = 0; /* this process is not doing anything */
+     }
+     else {
+	  pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
+	  fill1_comm_sched(pln->sched, my_pe, n_pes);
+	  if (sort_pe >= 0)
+	       sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
+     }
+
+     X(ops_zero)(&pln->super.super.ops);
+     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
+     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
+     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
+     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
+     /* FIXME: should MPI operations be counted in "other" somehow? */
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld3);
+     X(plan_destroy_internal)(cld2rest);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(transpose_pairwise_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+	  REGISTER_SOLVER(p, mksolver(preserve_input));
+}
--- a/fftw-3.3.10/mpi/transpose-problem.c
+++ b/fftw-3.3.10/mpi/transpose-problem.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-transpose.h"
+
+static void destroy(problem *ego_)
+{
+     problem_mpi_transpose *ego = (problem_mpi_transpose *) ego_;
+     MPI_Comm_free(&ego->comm);
+     X(ifree)(ego_);
+}
+
+static void hash(const problem *p_, md5 *m)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     int i;
+     X(md5puts)(m, "mpi-transpose");
+     X(md5int)(m, p->I == p->O);
+     /* don't include alignment -- may differ between processes
+	X(md5int)(m, X(ialignment_of)(p->I));
+	X(md5int)(m, X(ialignment_of)(p->O));
+	... note that applicability of MPI plans does not depend
+	    on alignment (although optimality may, in principle). */
+     X(md5INT)(m, p->vn);
+     X(md5INT)(m, p->nx);
+     X(md5INT)(m, p->ny);
+     X(md5INT)(m, p->block);
+     X(md5INT)(m, p->tblock);
+     MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
+     A(XM(md5_equal)(*m, p->comm));
+}
+
+static void print(const problem *ego_, printer *p)
+{
+     const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_;
+     int i;
+     MPI_Comm_size(ego->comm, &i);
+     p->print(p, "(mpi-transpose %d %d %d %D %D %D %D %D %d)", 
+	      ego->I == ego->O,
+	      X(ialignment_of)(ego->I),
+	      X(ialignment_of)(ego->O),
+	      ego->vn,
+	      ego->nx, ego->ny,
+	      ego->block, ego->tblock,
+	      i);
+}
+
+static void zero(const problem *ego_)
+{
+     const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_;
+     R *I = ego->I;
+     INT i, N = ego->vn * ego->ny;
+     int my_pe;
+
+     MPI_Comm_rank(ego->comm, &my_pe);
+     N *= XM(block)(ego->nx, ego->block, my_pe);
+
+     for (i = 0; i < N; ++i) I[i] = K(0.0);
+}
+
+static const problem_adt padt =
+{
+     PROBLEM_MPI_TRANSPOSE,
+     hash,
+     zero,
+     print,
+     destroy
+};
+
+problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
+				 R *I, R *O,
+				 INT block, INT tblock,
+				 MPI_Comm comm,
+				 unsigned flags)
+{
+     problem_mpi_transpose *ego =
+          (problem_mpi_transpose *)X(mkproblem)(sizeof(problem_mpi_transpose), &padt);
+
+     A(nx > 0 && ny > 0 && vn > 0);
+     A(block > 0 && XM(num_blocks_ok)(nx, block, comm)
+       && tblock > 0 && XM(num_blocks_ok)(ny, tblock, comm));
+
+     /* enforce pointer equality if untainted pointers are equal */
+     if (UNTAINT(I) == UNTAINT(O))
+	  I = O = JOIN_TAINT(I, O);
+
+     ego->nx = nx;
+     ego->ny = ny;
+     ego->vn = vn;
+     ego->I = I;
+     ego->O = O;
+     ego->block = block > nx ? nx : block;
+     ego->tblock = tblock > ny ? ny : tblock;
+
+     /* canonicalize flags: we can freely assume that the data is
+	"transposed" if one of the dimensions is 1. */
+     if (ego->block == 1)
+	  flags |= TRANSPOSED_IN;
+     if (ego->tblock == 1)
+	  flags |= TRANSPOSED_OUT;
+     ego->flags = flags;
+
+     MPI_Comm_dup(comm, &ego->comm);
+
+     return &(ego->super);
+}
--- a/fftw-3.3.10/mpi/transpose-recurse.c
+++ b/fftw-3.3.10/mpi/transpose-recurse.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Recursive "radix-r" distributed transpose, which breaks a transpose
+   over p processes into p/r transposes over r processes plus r
+   transposes over p/r processes.  If performed recursively, this
+   produces a total of O(p log p) messages vs. O(p^2) messages for a
+   direct approach.
+
+   However, this is not necessarily an improvement.  The total size of
+   all the messages is actually increased from O(N) to O(N log p)
+   where N is the total data size.  Also, the amount of local data
+   rearrangement is increased.  So, it's not clear, a priori, what the
+   best algorithm will be, and we'll leave it to the planner.  (In
+   theory and practice, it looks like this becomes advantageous for
+   large p, in the limit where the message sizes are small and
+   latency-dominated.)
+*/
+
+#include "mpi-transpose.h"
+#include <string.h>
+
+typedef struct {
+     solver super;
+     int (*radix)(int np);
+     const char *nam;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cldtr, *cldtm;
+     int preserve_input;
+
+     int r; /* "radix" */
+     const char *nam;
+} P;
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1, *cldtr, *cldtm;
+
+     cld1 = (plan_rdft *) ego->cld1;
+     if (cld1) cld1->apply((plan *) cld1, I, O);
+
+     if (ego->preserve_input) I = O;
+
+     cldtr = (plan_rdft *) ego->cldtr;
+     if (cldtr) cldtr->apply((plan *) cldtr, O, I);
+
+     cldtm = (plan_rdft *) ego->cldtm;
+     if (cldtm) cldtm->apply((plan *) cldtm, I, O);
+}
+
+static int radix_sqrt(int np)
+{
+     int r;
+     for (r = (int) (X(isqrt)(np)); np % r != 0; ++r)
+	  ;
+     return r;
+}
+
+static int radix_first(int np)
+{
+     int r = (int) (X(first_divisor)(np));
+     return (r >= (int) (X(isqrt)(np)) ? 0 : r);
+}
+
+/* the local allocated space on process pe required for the given transpose
+   dimensions and block sizes */
+static INT transpose_space(INT nx, INT ny, INT block, INT tblock, int pe)
+{
+     return X(imax)(XM(block)(nx, block, pe) * ny,
+		    nx * XM(block)(ny, tblock, pe));
+}
+
+/* check whether the recursive transposes fit within the space
+   that must have been allocated on each process for this transpose;
+   this must be modified if the subdivision in mkplan is changed! */
+static int enough_space(INT nx, INT ny, INT block, INT tblock,
+			int r, int n_pes)
+{
+     int pe;
+     int m = n_pes / r;
+     for (pe = 0; pe < n_pes; ++pe) {
+	  INT space = transpose_space(nx, ny, block, tblock, pe);
+	  INT b1 = XM(block)(nx, r * block, pe / r);
+	  INT b2 = XM(block)(ny, m * tblock, pe % r);
+	  if (transpose_space(b1, ny, block, m*tblock, pe % r) > space
+	      || transpose_space(nx, b2, r*block, tblock, pe / r) > space)
+	       return 0;
+     }
+     return 1;
+}
+
+/* In theory, transpose-recurse becomes advantageous for message sizes
+   below some minimum, assuming that the time is dominated by
+   communications.  In practice, we want to constrain the minimum
+   message size for transpose-recurse to keep the planning time down.
+   I've set this conservatively according to some simple experiments
+   on a Cray XT3 where the crossover message size was 128, although on
+   a larger-latency machine the crossover will be larger. */
+#define SMALL_MESSAGE 2048
+
+static int applicable(const S *ego, const problem *p_,
+		      const planner *plnr, int *r)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     int n_pes;
+     MPI_Comm_size(p->comm, &n_pes);
+     return (1
+	     && p->tblock * n_pes == p->ny
+	     && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+                                          && p->I != p->O))
+	     && (*r = ego->radix(n_pes)) && *r < n_pes && *r > 1
+	     && enough_space(p->nx, p->ny, p->block, p->tblock, *r, n_pes)
+	     && (!CONSERVE_MEMORYP(plnr) || *r > 8
+		 || !X(toobig)((p->nx * (p->ny / n_pes) * p->vn) / *r))
+	     && (!NO_SLOWP(plnr) || 
+		 (p->nx * (p->ny / n_pes) * p->vn) / n_pes <= SMALL_MESSAGE)
+	     && ONLY_TRANSPOSEDP(p->flags)
+	  );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cldtr, wakefulness);
+     X(plan_awake)(ego->cldtm, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(plan_destroy_internal)(ego->cldtm);
+     X(plan_destroy_internal)(ego->cldtr);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-recurse/%s/%d%s%(%p%)%(%p%)%(%p%))",
+	      ego->nam, ego->r, ego->preserve_input==2 ?"/p":"",
+	      ego->cld1, ego->cldtr, ego->cldtm);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cldtr = 0, *cldtm = 0;
+     R *I, *O;
+     int me, np, r, m;
+     INT b;
+     MPI_Comm comm2;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr, &r))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+
+     MPI_Comm_size(p->comm, &np);
+     MPI_Comm_rank(p->comm, &me);
+     m = np / r;
+     A(r * m == np);
+
+     I = p->I; O = p->O;
+
+     b = XM(block)(p->nx, p->block, me);
+     A(p->tblock * np == p->ny); /* this is currently required for cld1 */
+     if (p->flags & TRANSPOSED_IN) { 
+          /* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */
+	  INT vn = p->vn * b * p->tblock;
+	  cld1 = X(mkplan_f_d)(plnr,
+                               X(mkproblem_rdft_0_d)(X(mktensor_3d)
+						     (m, r*vn, vn,
+						      r, vn, m*vn,
+						      vn, 1, 1),
+                                                     I, O),
+                               0, 0, NO_SLOW);
+     }
+     else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */
+          /* b x m x r x bt x vn -> r x m x bt x b x vn */
+	  INT vn = p->vn;
+	  INT bt = p->tblock;
+	  cld1 = X(mkplan_f_d)(plnr,
+                               X(mkproblem_rdft_0_d)(X(mktensor_5d)
+						     (b, m*r*bt*vn, vn,
+						      m, r*bt*vn, bt*b*vn,
+						      r, bt*vn, m*bt*b*vn,
+						      bt, vn, b*vn,
+						      vn, 1, 1),
+                                                     I, O),
+                               0, 0, NO_SLOW);
+     }
+     else { /* TRANSPOSED_IN permutation must be separate for in-place */
+	  /* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */
+	  INT vn = p->vn * p->tblock;
+	  cld1 = X(mkplan_f_d)(plnr,
+                               X(mkproblem_rdft_0_d)(X(mktensor_4d)
+						     (m, r*vn, vn,
+						      r, vn, m*vn,
+						      vn, 1, 1,
+						      b, np*vn, np*vn),
+                                                     I, O),
+                               0, 0, NO_SLOW);
+     }
+     if (XM(any_true)(!cld1, p->comm)) goto nada;
+
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
+
+     b = XM(block)(p->nx, r * p->block, me / r);
+     MPI_Comm_split(p->comm, me / r, me, &comm2);
+     if (b)
+	  cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose)
+			      (b, p->ny, p->vn,
+			       O, I, p->block, m * p->tblock, comm2, 
+			       p->I != p->O
+			       ? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN)));
+     MPI_Comm_free(&comm2);
+     if (XM(any_true)(b && !cldtr, p->comm)) goto nada;
+     
+     b = XM(block)(p->ny, m * p->tblock, me % r);
+     MPI_Comm_split(p->comm, me % r, me, &comm2);
+     if (b)
+	  cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose)
+			      (p->nx, b, p->vn,
+			       I, O, r * p->block, p->tblock, comm2, 
+			       TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT)));
+     MPI_Comm_free(&comm2);
+     if (XM(any_true)(b && !cldtm, p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cldtr = cldtr;
+     pln->cldtm = cldtm;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+     pln->r = r;
+     pln->nam = ego->nam;
+
+     pln->super.super.ops = cld1->ops;
+     if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops);
+     if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops);
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cldtm);
+     X(plan_destroy_internal)(cldtr);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input,
+			int (*radix)(int np), const char *nam)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     slv->radix = radix;
+     slv->nam = nam;
+     return &(slv->super);
+}
+
+void XM(transpose_recurse_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input) {
+	  REGISTER_SOLVER(p, mksolver(preserve_input, radix_sqrt, "sqrt"));
+	  REGISTER_SOLVER(p, mksolver(preserve_input, radix_first, "first"));
+     }
+}
--- a/fftw-3.3.10/mpi/transpose-solve.c
+++ b/fftw-3.3.10/mpi/transpose-solve.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "mpi-transpose.h"
+
+/* use the apply() operation for MPI_TRANSPOSE problems */
+void XM(transpose_solve)(const plan *ego_, const problem *p_)
+{
+     const plan_mpi_transpose *ego = (const plan_mpi_transpose *) ego_;
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
+}
--- a/fftw-3.3.10/mpi/wisdom-api.c
+++ b/fftw-3.3.10/mpi/wisdom-api.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include "fftw3-mpi.h"
+#include "ifftw-mpi.h"
+#include <string.h>
+
+#if SIZEOF_SIZE_T == SIZEOF_UNSIGNED_INT
+#  define FFTW_MPI_SIZE_T MPI_UNSIGNED
+#elif SIZEOF_SIZE_T == SIZEOF_UNSIGNED_LONG
+#  define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG
+#elif SIZEOF_SIZE_T == SIZEOF_UNSIGNED_LONG_LONG
+#  define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG_LONG
+#else
+#  error MPI type for size_t is unknown
+#  define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG
+#endif
+
+/* Import wisdom from all processes to process 0, as prelude to
+   exporting a single wisdom file (this is convenient when we are
+   running on identical processors, to avoid the annoyance of having
+   per-process wisdom files).  In order to make the time for this
+   operation logarithmic in the number of processors (rather than
+   linear), we employ a tree reduction algorithm.  This means that the
+   wisdom is modified on processes other than root, which shouldn't
+   matter in practice. */
+void XM(gather_wisdom)(MPI_Comm comm_)
+{
+     MPI_Comm comm, comm2;
+     int my_pe, n_pes;
+     char *wis;
+     size_t wislen;
+     MPI_Status status;
+
+     MPI_Comm_dup(comm_, &comm);
+     MPI_Comm_rank(comm, &my_pe);
+     MPI_Comm_size(comm, &n_pes);
+
+     if (n_pes > 2) { /* recursively split into even/odd processes */
+	  MPI_Comm_split(comm, my_pe % 2, my_pe, &comm2);
+	  XM(gather_wisdom)(comm2);
+	  MPI_Comm_free(&comm2);
+     }
+     if (n_pes > 1 && my_pe < 2) { /* import process 1 -> 0 */
+	  if (my_pe == 1) {
+	       wis = X(export_wisdom_to_string)();
+	       wislen = strlen(wis) + 1;
+	       MPI_Send(&wislen, 1, FFTW_MPI_SIZE_T, 0, 111, comm);
+	       MPI_Send(wis, wislen, MPI_CHAR, 0, 222, comm);
+	       free(wis);
+	  }
+	  else /* my_pe == 0 */ {
+	       MPI_Recv(&wislen, 1, FFTW_MPI_SIZE_T, 1, 111, comm, &status);
+	       wis = (char *) MALLOC(wislen * sizeof(char), OTHER);
+	       MPI_Recv(wis, wislen, MPI_CHAR, 1, 222, comm, &status);
+	       if (!X(import_wisdom_from_string)(wis))
+		    MPI_Abort(comm, 1);
+	       X(ifree)(wis);
+	  }
+     }
+     MPI_Comm_free(&comm);
+}
+
+/* broadcast wisdom from process 0 to all other processes; this
+   is useful so that we can import wisdom once and not worry
+   about parallel I/O or process-specific wisdom, although of
+   course it assumes that all the processes have identical
+   performance characteristics (i.e. identical hardware). */
+void XM(broadcast_wisdom)(MPI_Comm comm_)
+{
+     MPI_Comm comm;
+     int my_pe;
+     char *wis;
+     size_t wislen;
+
+     MPI_Comm_dup(comm_, &comm);
+     MPI_Comm_rank(comm, &my_pe);
+
+     if (my_pe != 0) {
+	  MPI_Bcast(&wislen, 1, FFTW_MPI_SIZE_T, 0, comm);
+	  wis = (char *) MALLOC(wislen * sizeof(char), OTHER);
+	  MPI_Bcast(wis, wislen, MPI_CHAR, 0, comm);
+	  if (!X(import_wisdom_from_string)(wis))
+	       MPI_Abort(comm, 1);
+	  X(ifree)(wis);
+     }
+     else /* my_pe == 0 */ {
+	  wis = X(export_wisdom_to_string)();
+	  wislen = strlen(wis) + 1;
+	  MPI_Bcast(&wislen, 1, FFTW_MPI_SIZE_T, 0, comm);
+	  MPI_Bcast(wis, wislen, MPI_CHAR, 0, comm);
+	  X(free)(wis);
+     }
+     MPI_Comm_free(&comm);
+}