This commit is contained in:
2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions

101
fftw-3.3.10/mpi/Makefile.am Normal file
View File

@@ -0,0 +1,101 @@
# -I $(top_srcdir)/api is necessary because fftw3-mpi.h includes
# "fftw3.h", and we cannot change the latter to "api/fftw3.h" because
# fftw3-mpi.h is installed in /usr/include.
AM_CPPFLAGS = -I $(top_srcdir) -I $(top_srcdir)/api
if MPI
lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@_mpi.la
include_HEADERS = fftw3-mpi.h
nodist_include_HEADERS = fftw3-mpi.f03 fftw3l-mpi.f03
noinst_PROGRAMS = mpi-bench
endif
CC=@MPICC@
EXTRA_DIST = testsched.c f03api.sh f03-wrap.sh genf03-wrap.pl fftw3-mpi.f03.in fftw3l-mpi.f03.in
BUILT_SOURCES = fftw3-mpi.f03.in fftw3-mpi.f03 fftw3l-mpi.f03.in fftw3l-mpi.f03 f03-wrap.c
CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-problem.c transpose-solve.c mpi-transpose.h
DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h
SRC = any-true.c api.c block.c choose-radix.c conf.c dtensor.c fftw3-mpi.h ifftw-mpi.h rearrange.c wisdom-api.c f03-wrap.c
libfftw3@PREC_SUFFIX@_mpi_la_SOURCES = $(SRC) $(TRANSPOSE_SRC) $(DFT_SRC) $(RDFT_SRC) $(RDFT2_SRC)
libfftw3@PREC_SUFFIX@_mpi_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
libfftw3@PREC_SUFFIX@_mpi_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la @MPILIBS@
if THREADS
mpi_bench_CFLAGS = $(PTHREAD_CFLAGS)
if !COMBINED_THREADS
LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
endif
else
if OPENMP
mpi_bench_CFLAGS = $(OPENMP_CFLAGS)
LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
endif
endif
mpi_bench_SOURCES = mpi-bench.c $(top_srcdir)/tests/fftw-bench.c $(top_srcdir)/tests/hook.c
mpi_bench_LDADD = libfftw3@PREC_SUFFIX@_mpi.la $(LIBFFTWTHREADS) $(top_builddir)/libfftw3@PREC_SUFFIX@.la $(top_builddir)/libbench2/libbench2.a $(MPILIBS) $(THREADLIBS)
CHECK = $(top_srcdir)/tests/check.pl
NUMCHECK=10
CHECKSIZE=10000
CHECKOPTS = --verbose --random --maxsize=$(CHECKSIZE) -c=$(NUMCHECK) $(CHECK_PL_OPTS)
if MPI
check-local: mpi-bench$(EXEEXT)
perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 1 `pwd`/mpi-bench"
@echo "--------------------------------------------------------------"
@echo " MPI FFTW transforms passed "$(NUMCHECK)" tests, 1 CPU"
@echo "--------------------------------------------------------------"
perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 2 `pwd`/mpi-bench"
@echo "--------------------------------------------------------------"
@echo " MPI FFTW transforms passed "$(NUMCHECK)" tests, 2 CPUs"
@echo "--------------------------------------------------------------"
perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 3 `pwd`/mpi-bench"
@echo "--------------------------------------------------------------"
@echo " MPI FFTW transforms passed "$(NUMCHECK)" tests, 3 CPUs"
@echo "--------------------------------------------------------------"
perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 4 `pwd`/mpi-bench"
@echo "--------------------------------------------------------------"
@echo " MPI FFTW transforms passed "$(NUMCHECK)" tests, 4 CPUs"
@echo "--------------------------------------------------------------"
if SMP
perl -w $(CHECK) $(CHECKOPTS) --mpi --nthreads=2 "$(MPIRUN) -np 3 `pwd`/mpi-bench"
@echo "--------------------------------------------------------------"
@echo " MPI FFTW threaded transforms passed "$(NUMCHECK)" tests!"
@echo "--------------------------------------------------------------"
endif
bigcheck: mpi-bench$(EXEEXT)
$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=100 CHECKSIZE=60000 check-local
smallcheck: mpi-bench$(EXEEXT)
$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=2 check-local
endif
fftw3-mpi.f03: fftw3-mpi.f03.in Makefile
sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3-mpi.f03.in > $@
fftw3l-mpi.f03: fftw3l-mpi.f03.in Makefile
sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3l-mpi.f03.in > $@
if MAINTAINER_MODE
fftw3-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
sh $(srcdir)/f03api.sh d f > $@
fftw3l-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
sh $(srcdir)/f03api.sh l | grep -v parameter | sed 's/fftw3.f03/fftw3l.f03/' > $@
f03-wrap.c: fftw3-mpi.h f03-wrap.sh genf03-wrap.pl
sh $(srcdir)/f03-wrap.sh > $@
endif

1037
fftw-3.3.10/mpi/Makefile.in Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,59 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
/* During planning, if any process fails to create a plan then
all of the processes must fail. This synchronization is implemented
by the following routine.
Instead of
if (failure) goto nada;
we instead do:
if (any_true(failure, comm)) goto nada;
*/
int XM(any_true)(int condition, MPI_Comm comm)
{
int result;
MPI_Allreduce(&condition, &result, 1, MPI_INT, MPI_LOR, comm);
return result;
}
/***********************************************************************/
#if defined(FFTW_DEBUG)
/* for debugging, we include an assertion to make sure that
MPI problems all produce equal hashes, as checked by this routine: */
int XM(md5_equal)(md5 m, MPI_Comm comm)
{
unsigned long s0[4];
int i, eq_me, eq_all;
X(md5end)(&m);
for (i = 0; i < 4; ++i) s0[i] = m.s[i];
MPI_Bcast(s0, 4, MPI_UNSIGNED_LONG, 0, comm);
for (i = 0; i < 4 && s0[i] == m.s[i]; ++i) ;
eq_me = i == 4;
MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
return eq_all;
}
#endif

907
fftw-3.3.10/mpi/api.c Normal file
View File

@@ -0,0 +1,907 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "api/api.h"
#include "fftw3-mpi.h"
#include "ifftw-mpi.h"
#include "mpi-transpose.h"
#include "mpi-dft.h"
#include "mpi-rdft.h"
#include "mpi-rdft2.h"
/* Convert API flags to internal MPI flags. */
#define MPI_FLAGS(f) ((f) >> 27)
/*************************************************************************/
static int mpi_inited = 0;
static MPI_Comm problem_comm(const problem *p) {
switch (p->adt->problem_kind) {
case PROBLEM_MPI_DFT:
return ((const problem_mpi_dft *) p)->comm;
case PROBLEM_MPI_RDFT:
return ((const problem_mpi_rdft *) p)->comm;
case PROBLEM_MPI_RDFT2:
return ((const problem_mpi_rdft2 *) p)->comm;
case PROBLEM_MPI_TRANSPOSE:
return ((const problem_mpi_transpose *) p)->comm;
default:
return MPI_COMM_NULL;
}
}
/* used to synchronize cost measurements (timing or estimation)
across all processes for an MPI problem, which is critical to
ensure that all processes decide to use the same MPI plans
(whereas serial plans need not be syncronized). */
static double cost_hook(const problem *p, double t, cost_kind k)
{
MPI_Comm comm = problem_comm(p);
double tsum;
if (comm == MPI_COMM_NULL) return t;
MPI_Allreduce(&t, &tsum, 1, MPI_DOUBLE,
k == COST_SUM ? MPI_SUM : MPI_MAX, comm);
return tsum;
}
/* Used to reject wisdom that is not in sync across all processes
for an MPI problem, which is critical to ensure that all processes
decide to use the same MPI plans. (Even though costs are synchronized,
above, out-of-sync wisdom may result from plans being produced
by communicators that do not span all processes, either from a
user-specified communicator or e.g. from transpose-recurse. */
static int wisdom_ok_hook(const problem *p, flags_t flags)
{
MPI_Comm comm = problem_comm(p);
int eq_me, eq_all;
/* unpack flags bitfield, since MPI communications may involve
byte-order changes and MPI cannot do this for bit fields */
#if SIZEOF_UNSIGNED_INT >= 4 /* must be big enough to hold 20-bit fields */
unsigned int f[5];
#else
unsigned long f[5]; /* at least 32 bits as per C standard */
#endif
if (comm == MPI_COMM_NULL) return 1; /* non-MPI wisdom is always ok */
if (XM(any_true)(0, comm)) return 0; /* some process had nowisdom_hook */
/* otherwise, check that the flags and solver index are identical
on all processes in this problem's communicator.
TO DO: possibly we can relax strict equality, but it is
critical to ensure that any flags which affect what plan is
created (and whether the solver is applicable) are the same,
e.g. DESTROY_INPUT, NO_UGLY, etcetera. (If the MPI algorithm
differs between processes, deadlocks/crashes generally result.) */
f[0] = flags.l;
f[1] = flags.hash_info;
f[2] = flags.timelimit_impatience;
f[3] = flags.u;
f[4] = flags.slvndx;
MPI_Bcast(f, 5,
SIZEOF_UNSIGNED_INT >= 4 ? MPI_UNSIGNED : MPI_UNSIGNED_LONG,
0, comm);
eq_me = f[0] == flags.l && f[1] == flags.hash_info
&& f[2] == flags.timelimit_impatience
&& f[3] == flags.u && f[4] == flags.slvndx;
MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
return eq_all;
}
/* This hook is called when wisdom is not found. The any_true here
matches up with the any_true in wisdom_ok_hook, in order to handle
the case where some processes had wisdom (and called wisdom_ok_hook)
and some processes didn't have wisdom (and called nowisdom_hook). */
static void nowisdom_hook(const problem *p)
{
MPI_Comm comm = problem_comm(p);
if (comm == MPI_COMM_NULL) return; /* nothing to do for non-MPI p */
XM(any_true)(1, comm); /* signal nowisdom to any wisdom_ok_hook */
}
/* needed to synchronize planner bogosity flag, in case non-MPI problems
on a subset of processes encountered bogus wisdom */
static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p)
{
MPI_Comm comm = problem_comm(p);
if (comm != MPI_COMM_NULL /* an MPI problem */
&& XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */
return WISDOM_IS_BOGUS;
return state;
}
void XM(init)(void)
{
if (!mpi_inited) {
planner *plnr = X(the_planner)();
plnr->cost_hook = cost_hook;
plnr->wisdom_ok_hook = wisdom_ok_hook;
plnr->nowisdom_hook = nowisdom_hook;
plnr->bogosity_hook = bogosity_hook;
XM(conf_standard)(plnr);
mpi_inited = 1;
}
}
void XM(cleanup)(void)
{
X(cleanup)();
mpi_inited = 0;
}
/*************************************************************************/
static dtensor *mkdtensor_api(int rnk, const XM(ddim) *dims0)
{
dtensor *x = XM(mkdtensor)(rnk);
int i;
for (i = 0; i < rnk; ++i) {
x->dims[i].n = dims0[i].n;
x->dims[i].b[IB] = dims0[i].ib;
x->dims[i].b[OB] = dims0[i].ob;
}
return x;
}
static dtensor *default_sz(int rnk, const XM(ddim) *dims0, int n_pes,
int rdft2)
{
dtensor *sz = XM(mkdtensor)(rnk);
dtensor *sz0 = mkdtensor_api(rnk, dims0);
block_kind k;
int i;
for (i = 0; i < rnk; ++i)
sz->dims[i].n = dims0[i].n;
if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
for (i = 0; i < rnk; ++i) {
sz->dims[i].b[IB] = dims0[i].ib ? dims0[i].ib : sz->dims[i].n;
sz->dims[i].b[OB] = dims0[i].ob ? dims0[i].ob : sz->dims[i].n;
}
/* If we haven't used all of the processes yet, and some of the
block sizes weren't specified (i.e. 0), then set the
unspecified blocks so as to use as many processes as
possible with as few distributed dimensions as possible. */
FORALL_BLOCK_KIND(k) {
INT nb = XM(num_blocks_total)(sz, k);
INT np = n_pes / nb;
for (i = 0; i < rnk && np > 1; ++i)
if (!sz0->dims[i].b[k]) {
sz->dims[i].b[k] = XM(default_block)(sz->dims[i].n, np);
nb *= XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]);
np = n_pes / nb;
}
}
if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n;
/* punt for 1d prime */
if (rnk == 1 && X(is_prime)(sz->dims[0].n))
sz->dims[0].b[IB] = sz->dims[0].b[OB] = sz->dims[0].n;
XM(dtensor_destroy)(sz0);
sz0 = XM(dtensor_canonical)(sz, 0);
XM(dtensor_destroy)(sz);
return sz0;
}
/* allocate simple local (serial) dims array corresponding to n[rnk] */
static XM(ddim) *simple_dims(int rnk, const ptrdiff_t *n)
{
XM(ddim) *dims = (XM(ddim) *) MALLOC(sizeof(XM(ddim)) * rnk,
TENSORS);
int i;
for (i = 0; i < rnk; ++i)
dims[i].n = dims[i].ib = dims[i].ob = n[i];
return dims;
}
/*************************************************************************/
static void local_size(int my_pe, const dtensor *sz, block_kind k,
ptrdiff_t *local_n, ptrdiff_t *local_start)
{
int i;
if (my_pe >= XM(num_blocks_total)(sz, k))
for (i = 0; i < sz->rnk; ++i)
local_n[i] = local_start[i] = 0;
else {
XM(block_coords)(sz, k, my_pe, local_start);
for (i = 0; i < sz->rnk; ++i) {
local_n[i] = XM(block)(sz->dims[i].n, sz->dims[i].b[k],
local_start[i]);
local_start[i] *= sz->dims[i].b[k];
}
}
}
static INT prod(int rnk, const ptrdiff_t *local_n)
{
int i;
INT N = 1;
for (i = 0; i < rnk; ++i) N *= local_n[i];
return N;
}
ptrdiff_t XM(local_size_guru)(int rnk, const XM(ddim) *dims0,
ptrdiff_t howmany, MPI_Comm comm,
ptrdiff_t *local_n_in,
ptrdiff_t *local_start_in,
ptrdiff_t *local_n_out,
ptrdiff_t *local_start_out,
int sign, unsigned flags)
{
INT N;
int my_pe, n_pes, i;
dtensor *sz;
if (rnk == 0)
return howmany;
MPI_Comm_rank(comm, &my_pe);
MPI_Comm_size(comm, &n_pes);
sz = default_sz(rnk, dims0, n_pes, 0);
/* Now, we must figure out how much local space the user should
allocate (or at least an upper bound). This depends strongly
on the exact algorithms we employ...ugh! FIXME: get this info
from the solvers somehow? */
N = 1; /* never return zero allocation size */
if (rnk > 1 && XM(is_block1d)(sz, IB) && XM(is_block1d)(sz, OB)) {
INT Nafter;
ddim odims[2];
/* dft-rank-geq2-transposed */
odims[0] = sz->dims[0]; odims[1] = sz->dims[1]; /* save */
/* we may need extra space for transposed intermediate data */
for (i = 0; i < 2; ++i)
if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[IB]) == 1 &&
XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[OB]) == 1) {
sz->dims[i].b[IB]
= XM(default_block)(sz->dims[i].n, n_pes);
sz->dims[1-i].b[IB] = sz->dims[1-i].n;
local_size(my_pe, sz, IB, local_n_in, local_start_in);
N = X(imax)(N, prod(rnk, local_n_in));
sz->dims[i] = odims[i];
sz->dims[1-i] = odims[1-i];
break;
}
/* dft-rank-geq2 */
Nafter = howmany;
for (i = 1; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
N = X(imax)(N, (sz->dims[0].n
* XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
my_pe) + howmany - 1) / howmany);
/* dft-rank-geq2 with dimensions swapped */
Nafter = howmany * sz->dims[0].n;
for (i = 2; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
N = X(imax)(N, (sz->dims[1].n
* XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
my_pe) + howmany - 1) / howmany);
}
else if (rnk == 1) {
if (howmany >= n_pes && !MPI_FLAGS(flags)) { /* dft-rank1-bigvec */
ptrdiff_t n[2], start[2];
dtensor *sz2 = XM(mkdtensor)(2);
sz2->dims[0] = sz->dims[0];
sz2->dims[0].b[IB] = sz->dims[0].n;
sz2->dims[1].n = sz2->dims[1].b[OB] = howmany;
sz2->dims[1].b[IB] = XM(default_block)(howmany, n_pes);
local_size(my_pe, sz2, IB, n, start);
XM(dtensor_destroy)(sz2);
N = X(imax)(N, (prod(2, n) + howmany - 1) / howmany);
}
else { /* dft-rank1 */
INT r, m, rblock[2], mblock[2];
/* Since the 1d transforms are so different, we require
the user to call local_size_1d for this case. Ugh. */
CK(sign == FFTW_FORWARD || sign == FFTW_BACKWARD);
if ((r = XM(choose_radix)(sz->dims[0], n_pes, flags, sign,
rblock, mblock))) {
m = sz->dims[0].n / r;
if (flags & FFTW_MPI_SCRAMBLED_IN)
sz->dims[0].b[IB] = rblock[IB] * m;
else { /* !SCRAMBLED_IN */
sz->dims[0].b[IB] = r * mblock[IB];
N = X(imax)(N, rblock[IB] * m);
}
if (flags & FFTW_MPI_SCRAMBLED_OUT)
sz->dims[0].b[OB] = r * mblock[OB];
else { /* !SCRAMBLED_OUT */
N = X(imax)(N, r * mblock[OB]);
sz->dims[0].b[OB] = rblock[OB] * m;
}
}
}
}
local_size(my_pe, sz, IB, local_n_in, local_start_in);
local_size(my_pe, sz, OB, local_n_out, local_start_out);
/* at least, make sure we have enough space to store input & output */
N = X(imax)(N, X(imax)(prod(rnk, local_n_in), prod(rnk, local_n_out)));
XM(dtensor_destroy)(sz);
return N * howmany;
}
ptrdiff_t XM(local_size_many_transposed)(int rnk, const ptrdiff_t *n,
ptrdiff_t howmany,
ptrdiff_t xblock, ptrdiff_t yblock,
MPI_Comm comm,
ptrdiff_t *local_nx,
ptrdiff_t *local_x_start,
ptrdiff_t *local_ny,
ptrdiff_t *local_y_start)
{
ptrdiff_t N;
XM(ddim) *dims;
ptrdiff_t *local;
if (rnk == 0) {
*local_nx = *local_ny = 1;
*local_x_start = *local_y_start = 0;
return howmany;
}
dims = simple_dims(rnk, n);
local = (ptrdiff_t *) MALLOC(sizeof(ptrdiff_t) * rnk * 4, TENSORS);
/* default 1d block distribution, with transposed output
if yblock < n[1] */
dims[0].ib = xblock;
if (rnk > 1) {
if (yblock < n[1])
dims[1].ob = yblock;
else
dims[0].ob = xblock;
}
else
dims[0].ob = xblock; /* FIXME: 1d not really supported here
since we don't have flags/sign */
N = XM(local_size_guru)(rnk, dims, howmany, comm,
local, local + rnk,
local + 2*rnk, local + 3*rnk,
0, 0);
*local_nx = local[0];
*local_x_start = local[rnk];
if (rnk > 1) {
*local_ny = local[2*rnk + 1];
*local_y_start = local[3*rnk + 1];
}
else {
*local_ny = *local_nx;
*local_y_start = *local_x_start;
}
X(ifree)(local);
X(ifree)(dims);
return N;
}
ptrdiff_t XM(local_size_many)(int rnk, const ptrdiff_t *n,
ptrdiff_t howmany,
ptrdiff_t xblock,
MPI_Comm comm,
ptrdiff_t *local_nx,
ptrdiff_t *local_x_start)
{
ptrdiff_t local_ny, local_y_start;
return XM(local_size_many_transposed)(rnk, n, howmany,
xblock, rnk > 1
? n[1] : FFTW_MPI_DEFAULT_BLOCK,
comm,
local_nx, local_x_start,
&local_ny, &local_y_start);
}
ptrdiff_t XM(local_size_transposed)(int rnk, const ptrdiff_t *n,
MPI_Comm comm,
ptrdiff_t *local_nx,
ptrdiff_t *local_x_start,
ptrdiff_t *local_ny,
ptrdiff_t *local_y_start)
{
return XM(local_size_many_transposed)(rnk, n, 1,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
comm,
local_nx, local_x_start,
local_ny, local_y_start);
}
ptrdiff_t XM(local_size)(int rnk, const ptrdiff_t *n,
MPI_Comm comm,
ptrdiff_t *local_nx,
ptrdiff_t *local_x_start)
{
return XM(local_size_many)(rnk, n, 1, FFTW_MPI_DEFAULT_BLOCK, comm,
local_nx, local_x_start);
}
ptrdiff_t XM(local_size_many_1d)(ptrdiff_t nx, ptrdiff_t howmany,
MPI_Comm comm, int sign, unsigned flags,
ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
{
XM(ddim) d;
d.n = nx;
d.ib = d.ob = FFTW_MPI_DEFAULT_BLOCK;
return XM(local_size_guru)(1, &d, howmany, comm,
local_nx, local_x_start,
local_ny, local_y_start, sign, flags);
}
ptrdiff_t XM(local_size_1d)(ptrdiff_t nx,
MPI_Comm comm, int sign, unsigned flags,
ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
{
return XM(local_size_many_1d)(nx, 1, comm, sign, flags,
local_nx, local_x_start,
local_ny, local_y_start);
}
ptrdiff_t XM(local_size_2d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
MPI_Comm comm,
ptrdiff_t *local_nx,
ptrdiff_t *local_x_start,
ptrdiff_t *local_ny,
ptrdiff_t *local_y_start)
{
ptrdiff_t n[2];
n[0] = nx; n[1] = ny;
return XM(local_size_transposed)(2, n, comm,
local_nx, local_x_start,
local_ny, local_y_start);
}
ptrdiff_t XM(local_size_2d)(ptrdiff_t nx, ptrdiff_t ny, MPI_Comm comm,
ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
{
ptrdiff_t n[2];
n[0] = nx; n[1] = ny;
return XM(local_size)(2, n, comm, local_nx, local_x_start);
}
ptrdiff_t XM(local_size_3d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
ptrdiff_t nz,
MPI_Comm comm,
ptrdiff_t *local_nx,
ptrdiff_t *local_x_start,
ptrdiff_t *local_ny,
ptrdiff_t *local_y_start)
{
ptrdiff_t n[3];
n[0] = nx; n[1] = ny; n[2] = nz;
return XM(local_size_transposed)(3, n, comm,
local_nx, local_x_start,
local_ny, local_y_start);
}
ptrdiff_t XM(local_size_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
MPI_Comm comm,
ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
{
ptrdiff_t n[3];
n[0] = nx; n[1] = ny; n[2] = nz;
return XM(local_size)(3, n, comm, local_nx, local_x_start);
}
/*************************************************************************/
/* Transpose API */
X(plan) XM(plan_many_transpose)(ptrdiff_t nx, ptrdiff_t ny,
ptrdiff_t howmany,
ptrdiff_t xblock, ptrdiff_t yblock,
R *in, R *out,
MPI_Comm comm, unsigned flags)
{
int n_pes;
XM(init)();
if (howmany < 0 || xblock < 0 || yblock < 0 ||
nx <= 0 || ny <= 0) return 0;
MPI_Comm_size(comm, &n_pes);
if (!xblock) xblock = XM(default_block)(nx, n_pes);
if (!yblock) yblock = XM(default_block)(ny, n_pes);
if (n_pes < XM(num_blocks)(nx, xblock)
|| n_pes < XM(num_blocks)(ny, yblock))
return 0;
return
X(mkapiplan)(FFTW_FORWARD, flags,
XM(mkproblem_transpose)(nx, ny, howmany,
in, out, xblock, yblock,
comm, MPI_FLAGS(flags)));
}
X(plan) XM(plan_transpose)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
MPI_Comm comm, unsigned flags)
{
return XM(plan_many_transpose)(nx, ny, 1,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
in, out, comm, flags);
}
/*************************************************************************/
/* Complex DFT API */
X(plan) XM(plan_guru_dft)(int rnk, const XM(ddim) *dims0,
ptrdiff_t howmany,
C *in, C *out,
MPI_Comm comm, int sign, unsigned flags)
{
int n_pes, i;
dtensor *sz;
XM(init)();
if (howmany < 0 || rnk < 1) return 0;
for (i = 0; i < rnk; ++i)
if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
return 0;
MPI_Comm_size(comm, &n_pes);
sz = default_sz(rnk, dims0, n_pes, 0);
if (XM(num_blocks_total)(sz, IB) > n_pes
|| XM(num_blocks_total)(sz, OB) > n_pes) {
XM(dtensor_destroy)(sz);
return 0;
}
return
X(mkapiplan)(sign, flags,
XM(mkproblem_dft_d)(sz, howmany,
(R *) in, (R *) out,
comm, sign,
MPI_FLAGS(flags)));
}
X(plan) XM(plan_many_dft)(int rnk, const ptrdiff_t *n,
ptrdiff_t howmany,
ptrdiff_t iblock, ptrdiff_t oblock,
C *in, C *out,
MPI_Comm comm, int sign, unsigned flags)
{
XM(ddim) *dims = simple_dims(rnk, n);
X(plan) pln;
if (rnk == 1) {
dims[0].ib = iblock;
dims[0].ob = oblock;
}
else if (rnk > 1) {
dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
}
pln = XM(plan_guru_dft)(rnk,dims,howmany, in,out, comm, sign, flags);
X(ifree)(dims);
return pln;
}
X(plan) XM(plan_dft)(int rnk, const ptrdiff_t *n, C *in, C *out,
MPI_Comm comm, int sign, unsigned flags)
{
return XM(plan_many_dft)(rnk, n, 1,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
in, out, comm, sign, flags);
}
X(plan) XM(plan_dft_1d)(ptrdiff_t nx, C *in, C *out,
MPI_Comm comm, int sign, unsigned flags)
{
return XM(plan_dft)(1, &nx, in, out, comm, sign, flags);
}
X(plan) XM(plan_dft_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, C *out,
MPI_Comm comm, int sign, unsigned flags)
{
ptrdiff_t n[2];
n[0] = nx; n[1] = ny;
return XM(plan_dft)(2, n, in, out, comm, sign, flags);
}
X(plan) XM(plan_dft_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
C *in, C *out,
MPI_Comm comm, int sign, unsigned flags)
{
ptrdiff_t n[3];
n[0] = nx; n[1] = ny; n[2] = nz;
return XM(plan_dft)(3, n, in, out, comm, sign, flags);
}
/*************************************************************************/
/* R2R API */
X(plan) XM(plan_guru_r2r)(int rnk, const XM(ddim) *dims0,
ptrdiff_t howmany,
R *in, R *out,
MPI_Comm comm, const X(r2r_kind) *kind,
unsigned flags)
{
int n_pes, i;
dtensor *sz;
rdft_kind *k;
X(plan) pln;
XM(init)();
if (howmany < 0 || rnk < 1) return 0;
for (i = 0; i < rnk; ++i)
if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
return 0;
k = X(map_r2r_kind)(rnk, kind);
MPI_Comm_size(comm, &n_pes);
sz = default_sz(rnk, dims0, n_pes, 0);
if (XM(num_blocks_total)(sz, IB) > n_pes
|| XM(num_blocks_total)(sz, OB) > n_pes) {
XM(dtensor_destroy)(sz);
return 0;
}
pln = X(mkapiplan)(0, flags,
XM(mkproblem_rdft_d)(sz, howmany,
in, out,
comm, k, MPI_FLAGS(flags)));
X(ifree0)(k);
return pln;
}
X(plan) XM(plan_many_r2r)(int rnk, const ptrdiff_t *n,
ptrdiff_t howmany,
ptrdiff_t iblock, ptrdiff_t oblock,
R *in, R *out,
MPI_Comm comm, const X(r2r_kind) *kind,
unsigned flags)
{
XM(ddim) *dims = simple_dims(rnk, n);
X(plan) pln;
if (rnk == 1) {
dims[0].ib = iblock;
dims[0].ob = oblock;
}
else if (rnk > 1) {
dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
}
pln = XM(plan_guru_r2r)(rnk,dims,howmany, in,out, comm, kind, flags);
X(ifree)(dims);
return pln;
}
X(plan) XM(plan_r2r)(int rnk, const ptrdiff_t *n, R *in, R *out,
MPI_Comm comm,
const X(r2r_kind) *kind,
unsigned flags)
{
return XM(plan_many_r2r)(rnk, n, 1,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
in, out, comm, kind, flags);
}
X(plan) XM(plan_r2r_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
MPI_Comm comm,
X(r2r_kind) kindx, X(r2r_kind) kindy,
unsigned flags)
{
ptrdiff_t n[2];
X(r2r_kind) kind[2];
n[0] = nx; n[1] = ny;
kind[0] = kindx; kind[1] = kindy;
return XM(plan_r2r)(2, n, in, out, comm, kind, flags);
}
X(plan) XM(plan_r2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
R *in, R *out,
MPI_Comm comm,
X(r2r_kind) kindx, X(r2r_kind) kindy,
X(r2r_kind) kindz,
unsigned flags)
{
ptrdiff_t n[3];
X(r2r_kind) kind[3];
n[0] = nx; n[1] = ny; n[2] = nz;
kind[0] = kindx; kind[1] = kindy; kind[2] = kindz;
return XM(plan_r2r)(3, n, in, out, comm, kind, flags);
}
/*************************************************************************/
/* R2C/C2R API */
static X(plan) plan_guru_rdft2(int rnk, const XM(ddim) *dims0,
ptrdiff_t howmany,
R *r, C *c,
MPI_Comm comm, rdft_kind kind, unsigned flags)
{
int n_pes, i;
dtensor *sz;
R *cr = (R *) c;
XM(init)();
if (howmany < 0 || rnk < 2) return 0;
for (i = 0; i < rnk; ++i)
if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
return 0;
MPI_Comm_size(comm, &n_pes);
sz = default_sz(rnk, dims0, n_pes, 1);
sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
if (XM(num_blocks_total)(sz, IB) > n_pes
|| XM(num_blocks_total)(sz, OB) > n_pes) {
XM(dtensor_destroy)(sz);
return 0;
}
sz->dims[rnk-1].n = dims0[rnk-1].n;
if (kind == R2HC)
return X(mkapiplan)(0, flags,
XM(mkproblem_rdft2_d)(sz, howmany,
r, cr, comm, R2HC,
MPI_FLAGS(flags)));
else
return X(mkapiplan)(0, flags,
XM(mkproblem_rdft2_d)(sz, howmany,
cr, r, comm, HC2R,
MPI_FLAGS(flags)));
}
X(plan) XM(plan_many_dft_r2c)(int rnk, const ptrdiff_t *n,
ptrdiff_t howmany,
ptrdiff_t iblock, ptrdiff_t oblock,
R *in, C *out,
MPI_Comm comm, unsigned flags)
{
XM(ddim) *dims = simple_dims(rnk, n);
X(plan) pln;
if (rnk == 1) {
dims[0].ib = iblock;
dims[0].ob = oblock;
}
else if (rnk > 1) {
dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
}
pln = plan_guru_rdft2(rnk,dims,howmany, in,out, comm, R2HC, flags);
X(ifree)(dims);
return pln;
}
X(plan) XM(plan_many_dft_c2r)(int rnk, const ptrdiff_t *n,
ptrdiff_t howmany,
ptrdiff_t iblock, ptrdiff_t oblock,
C *in, R *out,
MPI_Comm comm, unsigned flags)
{
XM(ddim) *dims = simple_dims(rnk, n);
X(plan) pln;
if (rnk == 1) {
dims[0].ib = iblock;
dims[0].ob = oblock;
}
else if (rnk > 1) {
dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
}
pln = plan_guru_rdft2(rnk,dims,howmany, out,in, comm, HC2R, flags);
X(ifree)(dims);
return pln;
}
X(plan) XM(plan_dft_r2c)(int rnk, const ptrdiff_t *n, R *in, C *out,
MPI_Comm comm, unsigned flags)
{
return XM(plan_many_dft_r2c)(rnk, n, 1,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
in, out, comm, flags);
}
X(plan) XM(plan_dft_r2c_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, C *out,
MPI_Comm comm, unsigned flags)
{
ptrdiff_t n[2];
n[0] = nx; n[1] = ny;
return XM(plan_dft_r2c)(2, n, in, out, comm, flags);
}
X(plan) XM(plan_dft_r2c_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
R *in, C *out, MPI_Comm comm, unsigned flags)
{
ptrdiff_t n[3];
n[0] = nx; n[1] = ny; n[2] = nz;
return XM(plan_dft_r2c)(3, n, in, out, comm, flags);
}
X(plan) XM(plan_dft_c2r)(int rnk, const ptrdiff_t *n, C *in, R *out,
MPI_Comm comm, unsigned flags)
{
return XM(plan_many_dft_c2r)(rnk, n, 1,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
in, out, comm, flags);
}
X(plan) XM(plan_dft_c2r_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, R *out,
MPI_Comm comm, unsigned flags)
{
ptrdiff_t n[2];
n[0] = nx; n[1] = ny;
return XM(plan_dft_c2r)(2, n, in, out, comm, flags);
}
X(plan) XM(plan_dft_c2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
C *in, R *out, MPI_Comm comm, unsigned flags)
{
ptrdiff_t n[3];
n[0] = nx; n[1] = ny; n[2] = nz;
return XM(plan_dft_c2r)(3, n, in, out, comm, flags);
}
/*************************************************************************/
/* New-array execute functions */
void XM(execute_dft)(const X(plan) p, C *in, C *out) {
/* internally, MPI plans are just rdft plans */
X(execute_r2r)(p, (R*) in, (R*) out);
}
void XM(execute_dft_r2c)(const X(plan) p, R *in, C *out) {
/* internally, MPI plans are just rdft plans */
X(execute_r2r)(p, in, (R*) out);
}
void XM(execute_dft_c2r)(const X(plan) p, C *in, R *out) {
/* internally, MPI plans are just rdft plans */
X(execute_r2r)(p, (R*) in, out);
}
void XM(execute_r2r)(const X(plan) p, R *in, R *out) {
/* internally, MPI plans are just rdft plans */
X(execute_r2r)(p, in, out);
}

131
fftw-3.3.10/mpi/block.c Normal file
View File

@@ -0,0 +1,131 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
INT XM(num_blocks)(INT n, INT block)
{
return (n + block - 1) / block;
}
int XM(num_blocks_ok)(INT n, INT block, MPI_Comm comm)
{
int n_pes;
MPI_Comm_size(comm, &n_pes);
return n_pes >= XM(num_blocks)(n, block);
}
/* Pick a default block size for dividing a problem of size n among
n_pes processes. Divide as equally as possible, while minimizing
the maximum block size among the processes as well as the number of
processes with nonzero blocks. */
INT XM(default_block)(INT n, int n_pes)
{
return ((n + n_pes - 1) / n_pes);
}
/* For a given block size and dimension n, compute the block size
on the given process. */
INT XM(block)(INT n, INT block, int which_block)
{
INT d = n - which_block * block;
return d <= 0 ? 0 : (d > block ? block : d);
}
static INT num_blocks_kind(const ddim *dim, block_kind k)
{
return XM(num_blocks)(dim->n, dim->b[k]);
}
INT XM(num_blocks_total)(const dtensor *sz, block_kind k)
{
if (FINITE_RNK(sz->rnk)) {
int i;
INT ntot = 1;
for (i = 0; i < sz->rnk; ++i)
ntot *= num_blocks_kind(sz->dims + i, k);
return ntot;
}
else
return 0;
}
int XM(idle_process)(const dtensor *sz, block_kind k, int which_pe)
{
return (which_pe >= XM(num_blocks_total)(sz, k));
}
/* Given a non-idle process which_pe, computes the coordinate
vector coords[rnk] giving the coordinates of a block in the
matrix of blocks. k specifies whether we are talking about
the input or output data distribution. */
void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe,
INT *coords)
{
int i;
A(!XM(idle_process)(sz, k, which_pe) && FINITE_RNK(sz->rnk));
for (i = sz->rnk - 1; i >= 0; --i) {
INT nb = num_blocks_kind(sz->dims + i, k);
coords[i] = which_pe % nb;
which_pe /= nb;
}
}
INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe)
{
if (XM(idle_process)(sz, k, which_pe))
return 0;
else {
int i;
INT N = 1, *coords;
STACK_MALLOC(INT*, coords, sizeof(INT) * sz->rnk);
XM(block_coords)(sz, k, which_pe, coords);
for (i = 0; i < sz->rnk; ++i)
N *= XM(block)(sz->dims[i].n, sz->dims[i].b[k], coords[i]);
STACK_FREE(coords);
return N;
}
}
/* returns whether sz is local for dims >= dim */
int XM(is_local_after)(int dim, const dtensor *sz, block_kind k)
{
if (FINITE_RNK(sz->rnk))
for (; dim < sz->rnk; ++dim)
if (XM(num_blocks)(sz->dims[dim].n, sz->dims[dim].b[k]) > 1)
return 0;
return 1;
}
int XM(is_local)(const dtensor *sz, block_kind k)
{
return XM(is_local_after)(0, sz, k);
}
/* Return whether sz is distributed for k according to a simple
1d block distribution in the first or second dimensions */
int XM(is_block1d)(const dtensor *sz, block_kind k)
{
int i;
if (!FINITE_RNK(sz->rnk)) return 0;
for (i = 0; i < sz->rnk && num_blocks_kind(sz->dims + i, k) == 1; ++i) ;
return(i < sz->rnk && i < 2 && XM(is_local_after)(i + 1, sz, k));
}

View File

@@ -0,0 +1,83 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
/* Return the radix r for a 1d MPI transform of a distributed dimension d,
with the given flags and transform size. That is, decomposes d.n
as r * m, Cooley-Tukey style. Also computes the block sizes rblock
and mblock. Returns 0 if such a decomposition is not feasible.
This is unfortunately somewhat complicated.
A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c):
d.n is initially distributed as an m x r array with block size mblock[IB].
Then it is internally transposed to an r x m array with block size
rblock[IB]. Then it is internally transposed to m x r again with block
size mblock[OB]. Finally, it is transposed to r x m with block size
rblock[IB].
If flags & SCRAMBLED_IN, then the first transpose is skipped (the array
starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose
is skipped (the array ends up as m x r). To make sure the forward
and backward transforms use the same "scrambling" format, we swap r
and m when sign != FFT_SIGN.
There are some downsides to this, especially in the case where
either m or r is not divisible by n_pes. For one thing, it means
that in general we can't use the same block size for the input and
output. For another thing, it means that we can't in general honor
a user's "requested" block sizes in d.b[]. Therefore, for simplicity,
we simply ignore d.b[] for now.
*/
INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
INT rblock[2], INT mblock[2])
{
INT r, m;
UNUSED(flags); /* we would need this if we paid attention to d.b[*] */
/* If n_pes is a factor of d.n, then choose r to be d.n / n_pes.
This not only ensures that the input (the m dimension) is
equally distributed if possible, and at the r dimension is
maximally equally distributed (if d.n/n_pes >= n_pes), it also
makes one of the local transpositions in the algorithm
trivial. */
if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/
&& d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */)
r = d.n / n_pes;
else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */
for (r = X(isqrt)(d.n); d.n % r != 0; ++r)
;
}
if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */
if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */
m = r;
r = d.n / m;
}
else
m = d.n / r;
rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes);
mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes);
return r;
}

50
fftw-3.3.10/mpi/conf.c Normal file
View File

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-transpose.h"
#include "mpi-dft.h"
#include "mpi-rdft.h"
#include "mpi-rdft2.h"
static const solvtab s =
{
SOLVTAB(XM(transpose_pairwise_register)),
SOLVTAB(XM(transpose_alltoall_register)),
SOLVTAB(XM(transpose_recurse_register)),
SOLVTAB(XM(dft_rank_geq2_register)),
SOLVTAB(XM(dft_rank_geq2_transposed_register)),
SOLVTAB(XM(dft_serial_register)),
SOLVTAB(XM(dft_rank1_bigvec_register)),
SOLVTAB(XM(dft_rank1_register)),
SOLVTAB(XM(rdft_rank_geq2_register)),
SOLVTAB(XM(rdft_rank_geq2_transposed_register)),
SOLVTAB(XM(rdft_serial_register)),
SOLVTAB(XM(rdft_rank1_bigvec_register)),
SOLVTAB(XM(rdft2_rank_geq2_register)),
SOLVTAB(XM(rdft2_rank_geq2_transposed_register)),
SOLVTAB(XM(rdft2_serial_register)),
SOLVTAB_END
};
void XM(conf_standard)(planner *p)
{
X(solvtab_exec)(s, p);
}

View File

@@ -0,0 +1,136 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-dft.h"
static void destroy(problem *ego_)
{
problem_mpi_dft *ego = (problem_mpi_dft *) ego_;
XM(dtensor_destroy)(ego->sz);
MPI_Comm_free(&ego->comm);
X(ifree)(ego_);
}
static void hash(const problem *p_, md5 *m)
{
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
int i;
X(md5puts)(m, "mpi-dft");
X(md5int)(m, p->I == p->O);
/* don't include alignment -- may differ between processes
X(md5int)(m, X(ialignment_of)(p->I));
X(md5int)(m, X(ialignment_of)(p->O));
... note that applicability of MPI plans does not depend
on alignment (although optimality may, in principle). */
XM(dtensor_md5)(m, p->sz);
X(md5INT)(m, p->vn);
X(md5int)(m, p->sign);
X(md5int)(m, p->flags);
MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
A(XM(md5_equal)(*m, p->comm));
}
static void print(const problem *ego_, printer *p)
{
const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
int i;
p->print(p, "(mpi-dft %d %d %d ",
ego->I == ego->O,
X(ialignment_of)(ego->I),
X(ialignment_of)(ego->O));
XM(dtensor_print)(ego->sz, p);
p->print(p, " %D %d %d", ego->vn, ego->sign, ego->flags);
MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
}
static void zero(const problem *ego_)
{
const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
R *I = ego->I;
INT i, N;
int my_pe;
MPI_Comm_rank(ego->comm, &my_pe);
N = 2 * ego->vn * XM(total_block)(ego->sz, IB, my_pe);
for (i = 0; i < N; ++i) I[i] = K(0.0);
}
static const problem_adt padt =
{
PROBLEM_MPI_DFT,
hash,
zero,
print,
destroy
};
problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
R *I, R *O,
MPI_Comm comm,
int sign,
unsigned flags)
{
problem_mpi_dft *ego =
(problem_mpi_dft *)X(mkproblem)(sizeof(problem_mpi_dft), &padt);
int n_pes;
A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
MPI_Comm_size(comm, &n_pes);
A(n_pes >= XM(num_blocks_total)(sz, IB)
&& n_pes >= XM(num_blocks_total)(sz, OB));
A(vn >= 0);
A(sign == -1 || sign == 1);
/* enforce pointer equality if untainted pointers are equal */
if (UNTAINT(I) == UNTAINT(O))
I = O = JOIN_TAINT(I, O);
ego->sz = XM(dtensor_canonical)(sz, 1);
ego->vn = vn;
ego->I = I;
ego->O = O;
ego->sign = sign;
/* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
swapping the first two dimensions (for rnk > 1) */
if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
ddim dim0 = ego->sz->dims[0];
ego->sz->dims[0] = ego->sz->dims[1];
ego->sz->dims[1] = dim0;
flags &= ~TRANSPOSED_IN;
flags ^= TRANSPOSED_OUT;
}
ego->flags = flags;
MPI_Comm_dup(comm, &ego->comm);
return &(ego->super);
}
problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
R *I, R *O,
MPI_Comm comm,
int sign,
unsigned flags)
{
problem *p = XM(mkproblem_dft)(sz, vn, I, O, comm, sign, flags);
XM(dtensor_destroy)(sz);
return p;
}

View File

@@ -0,0 +1,221 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Complex DFTs of rank >= 2, for the case where we are distributed
across the first dimension only, and the output is transposed both
in data distribution and in ordering (for the first 2 dimensions).
(Note that we don't have to handle the case where the input is
transposed, since this is equivalent to transposed output with the
first two dimensions swapped, and is automatically canonicalized as
such by dft-problem.c. */
#include "mpi-dft.h"
#include "mpi-transpose.h"
#include "dft/dft.h"
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_dft super;
plan *cld1, *cldt, *cld2;
INT roff, ioff;
int preserve_input;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_dft *cld1, *cld2;
plan_rdft *cldt;
INT roff = ego->roff, ioff = ego->ioff;
/* DFT local dimensions */
cld1 = (plan_dft *) ego->cld1;
if (ego->preserve_input) {
cld1->apply(ego->cld1, I+roff, I+ioff, O+roff, O+ioff);
I = O;
}
else
cld1->apply(ego->cld1, I+roff, I+ioff, I+roff, I+ioff);
/* global transpose */
cldt = (plan_rdft *) ego->cldt;
cldt->apply(ego->cldt, I, O);
/* DFT final local dimension */
cld2 = (plan_dft *) ego->cld2;
cld2->apply(ego->cld2, O+roff, O+ioff, O+roff, O+ioff);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
return (1
&& p->sz->rnk > 1
&& p->flags == TRANSPOSED_OUT
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& XM(is_local_after)(1, p->sz, IB)
&& XM(is_local_after)(2, p->sz, OB)
&& XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1
&& (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
|| !XM(dft_serial_applicable)(p))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cldt, wakefulness);
X(plan_awake)(ego->cld2, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld2);
X(plan_destroy_internal)(ego->cldt);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-dft-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))",
ego->preserve_input==2 ?"/p":"",
ego->cld1, ego->cldt, ego->cld2);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_dft *p;
P *pln;
plan *cld1 = 0, *cldt = 0, *cld2 = 0;
R *ri, *ii, *ro, *io, *I, *O;
tensor *sz;
int i, my_pe, n_pes;
INT nrest;
static const plan_adt padt = {
XM(dft_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_dft *) p_;
X(extract_reim)(p->sign, I = p->I, &ri, &ii);
X(extract_reim)(p->sign, O = p->O, &ro, &io);
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
I = O;
else {
ro = ri;
io = ii;
}
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
i = p->sz->rnk - 2; A(i >= 0);
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
for (--i; i >= 0; --i) {
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
}
nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
{
INT is = sz->dims[0].n * sz->dims[0].is;
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
cld1 = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(sz,
X(mktensor_2d)(b, is, is,
p->vn, 2, 2),
ri, ii, ro, io));
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
nrest *= p->vn;
cldt = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
p->sz->dims[0].n, p->sz->dims[1].n, nrest * 2,
I, O,
p->sz->dims[0].b[IB], p->sz->dims[1].b[OB],
p->comm, 0));
if (XM(any_true)(!cldt, p->comm)) goto nada;
X(extract_reim)(p->sign, O, &ro, &io);
{
INT is = p->sz->dims[0].n * nrest * 2;
INT b = XM(block)(p->sz->dims[1].n, p->sz->dims[1].b[OB], my_pe);
cld2 = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(X(mktensor_1d)(
p->sz->dims[0].n,
nrest * 2, nrest * 2),
X(mktensor_2d)(b, is, is,
nrest, 2, 2),
ro, io, ro, io));
if (XM(any_true)(!cld2, p->comm)) goto nada;
}
pln = MKPLAN_MPI_DFT(P, &padt, apply);
pln->cld1 = cld1;
pln->cldt = cldt;
pln->cld2 = cld2;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
pln->roff = ri - p->I;
pln->ioff = ii - p->I;
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
X(ops_add2)(&cldt->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cldt);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(dft_rank_geq2_transposed_register)(planner *p)
{
int preserve_input;
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(preserve_input));
}

View File

@@ -0,0 +1,188 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Complex DFTs of rank >= 2, for the case where we are distributed
across the first dimension only, and the output is not transposed. */
#include "mpi-dft.h"
#include "dft/dft.h"
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_dft super;
plan *cld1, *cld2;
INT roff, ioff;
int preserve_input;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_dft *cld1;
plan_rdft *cld2;
INT roff = ego->roff, ioff = ego->ioff;
/* DFT local dimensions */
cld1 = (plan_dft *) ego->cld1;
if (ego->preserve_input) {
cld1->apply(ego->cld1, I+roff, I+ioff, O+roff, O+ioff);
I = O;
}
else
cld1->apply(ego->cld1, I+roff, I+ioff, I+roff, I+ioff);
/* DFT non-local dimension (via dft-rank1-bigvec, usually): */
cld2 = (plan_rdft *) ego->cld2;
cld2->apply(ego->cld2, I, O);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
return (1
&& p->sz->rnk > 1
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& XM(is_local_after)(1, p->sz, IB)
&& XM(is_local_after)(1, p->sz, OB)
&& (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
|| !XM(dft_serial_applicable)(p))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cld2, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld2);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-dft-rank-geq2%s%(%p%)%(%p%))",
ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_dft *p;
P *pln;
plan *cld1 = 0, *cld2 = 0;
R *ri, *ii, *ro, *io, *I, *O;
tensor *sz;
dtensor *sz2;
int i, my_pe, n_pes;
INT nrest;
static const plan_adt padt = {
XM(dft_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_dft *) p_;
X(extract_reim)(p->sign, I = p->I, &ri, &ii);
X(extract_reim)(p->sign, O = p->O, &ro, &io);
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
I = O;
else {
ro = ri;
io = ii;
}
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
i = p->sz->rnk - 2; A(i >= 0);
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
for (--i; i >= 0; --i) {
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
}
nrest = X(tensor_sz)(sz);
{
INT is = sz->dims[0].n * sz->dims[0].is;
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
cld1 = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(sz,
X(mktensor_2d)(b, is, is,
p->vn, 2, 2),
ri, ii, ro, io));
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
sz2->dims[0] = p->sz->dims[0];
cld2 = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz2, nrest * p->vn,
I, O, p->comm, p->sign,
RANK1_BIGVEC_ONLY));
if (XM(any_true)(!cld2, p->comm)) goto nada;
pln = MKPLAN_MPI_DFT(P, &padt, apply);
pln->cld1 = cld1;
pln->cld2 = cld2;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
pln->roff = ri - p->I;
pln->ioff = ii - p->I;
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(dft_rank_geq2_register)(planner *p)
{
int preserve_input;
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(preserve_input));
}

View File

@@ -0,0 +1,211 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Complex DFTs of rank == 1 when the vector length vn is >= # processes.
In this case, we don't need to use a six-step type algorithm, and can
instead transpose the DFT dimension with the vector dimension to
make the DFT local. */
#include "mpi-dft.h"
#include "mpi-transpose.h"
#include "dft/dft.h"
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
rearrangement rearrange;
} S;
typedef struct {
plan_mpi_dft super;
plan *cldt_before, *cld, *cldt_after;
INT roff, ioff;
int preserve_input;
rearrangement rearrange;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_dft *cld;
plan_rdft *cldt_before, *cldt_after;
INT roff = ego->roff, ioff = ego->ioff;
/* global transpose */
cldt_before = (plan_rdft *) ego->cldt_before;
cldt_before->apply(ego->cldt_before, I, O);
if (ego->preserve_input) I = O;
/* 1d DFT(s) */
cld = (plan_dft *) ego->cld;
cld->apply(ego->cld, O+roff, O+ioff, I+roff, I+ioff);
/* global transpose */
cldt_after = (plan_rdft *) ego->cldt_after;
cldt_after->apply(ego->cldt_after, I, O);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
int n_pes;
MPI_Comm_size(p->comm, &n_pes);
return (1
&& p->sz->rnk == 1
&& !(p->flags & ~RANK1_BIGVEC_ONLY)
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& (p->vn >= n_pes /* TODO: relax this, using more memory? */
|| (p->flags & RANK1_BIGVEC_ONLY))
&& XM(rearrange_applicable)(ego->rearrange,
p->sz->dims[0], p->vn, n_pes)
&& (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
|| !XM(dft_serial_applicable)(p))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cldt_before, wakefulness);
X(plan_awake)(ego->cld, wakefulness);
X(plan_awake)(ego->cldt_after, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cldt_after);
X(plan_destroy_internal)(ego->cld);
X(plan_destroy_internal)(ego->cldt_before);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
const char descrip[][16] = { "contig", "discontig", "square-after",
"square-middle", "square-before" };
p->print(p, "(mpi-dft-rank1-bigvec/%s%s %(%p%) %(%p%) %(%p%))",
descrip[ego->rearrange], ego->preserve_input==2 ?"/p":"",
ego->cldt_before, ego->cld, ego->cldt_after);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_dft *p;
P *pln;
plan *cld = 0, *cldt_before = 0, *cldt_after = 0;
R *ri, *ii, *ro, *io, *I, *O;
INT yblock, yb, nx, ny, vn;
int my_pe, n_pes;
static const plan_adt padt = {
XM(dft_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_dft *) p_;
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
nx = p->sz->dims[0].n;
if (!(ny = XM(rearrange_ny)(ego->rearrange, p->sz->dims[0],p->vn,n_pes)))
return (plan *) 0;
vn = p->vn / ny;
A(ny * vn == p->vn);
yblock = XM(default_block)(ny, n_pes);
cldt_before = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
nx, ny, vn*2,
I = p->I, O = p->O,
p->sz->dims[0].b[IB], yblock,
p->comm, 0));
if (XM(any_true)(!cldt_before, p->comm)) goto nada;
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) { I = O; }
X(extract_reim)(p->sign, I, &ri, &ii);
X(extract_reim)(p->sign, O, &ro, &io);
yb = XM(block)(ny, yblock, my_pe);
cld = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(X(mktensor_1d)(nx, vn*2, vn*2),
X(mktensor_2d)(yb, vn*2*nx, vn*2*nx,
vn, 2, 2),
ro, io, ri, ii));
if (XM(any_true)(!cld, p->comm)) goto nada;
cldt_after = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
ny, nx, vn*2,
I, O,
yblock, p->sz->dims[0].b[OB],
p->comm, 0));
if (XM(any_true)(!cldt_after, p->comm)) goto nada;
pln = MKPLAN_MPI_DFT(P, &padt, apply);
pln->cldt_before = cldt_before;
pln->cld = cld;
pln->cldt_after = cldt_after;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
pln->roff = ro - p->O;
pln->ioff = io - p->O;
pln->rearrange = ego->rearrange;
X(ops_add)(&cldt_before->ops, &cld->ops, &pln->super.super.ops);
X(ops_add2)(&cldt_after->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cldt_after);
X(plan_destroy_internal)(cld);
X(plan_destroy_internal)(cldt_before);
return (plan *) 0;
}
static solver *mksolver(rearrangement rearrange, int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->rearrange = rearrange;
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(dft_rank1_bigvec_register)(planner *p)
{
rearrangement rearrange;
int preserve_input;
FORALL_REARRANGE(rearrange)
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(rearrange, preserve_input));
}

352
fftw-3.3.10/mpi/dft-rank1.c Normal file
View File

@@ -0,0 +1,352 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Complex DFTs of rank == 1 via six-step algorithm. */
#include "mpi-dft.h"
#include "mpi-transpose.h"
#include "dft/dft.h"
typedef struct {
solver super;
rdftapply apply; /* apply_ddft_first or apply_ddft_last */
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_dft super;
triggen *t;
plan *cldt, *cld_ddft, *cld_dft;
INT roff, ioff;
int preserve_input;
INT vn, xmin, xmax, xs, m, r;
} P;
static void do_twiddle(triggen *t, INT ir, INT m, INT vn, R *xr, R *xi)
{
void (*rotate)(triggen *, INT, R, R, R *) = t->rotate;
INT im, iv;
for (im = 0; im < m; ++im)
for (iv = 0; iv < vn; ++iv) {
/* TODO: modify/inline rotate function
so that it can do whole vn vector at once? */
R c[2];
rotate(t, ir * im, *xr, *xi, c);
*xr = c[0]; *xi = c[1];
xr += 2; xi += 2;
}
}
/* radix-r DFT of size r*m. This is equivalent to an m x r 2d DFT,
plus twiddle factors between the size-m and size-r 1d DFTs, where
the m dimension is initially distributed. The output is transposed
to r x m where the r dimension is distributed.
This algorithm follows the general sequence:
global transpose (m x r -> r x m)
DFTs of size m
multiply by twiddles + global transpose (r x m -> m x r)
DFTs of size r
global transpose (m x r -> r x m)
where the multiplication by twiddles can come before or after
the middle transpose. The first/last transposes are omitted
for SCRAMBLED_IN/OUT formats, respectively.
However, we wish to exploit our dft-rank1-bigvec solver, which
solves a vector of distributed DFTs via transpose+dft+transpose.
Therefore, we can group *either* the DFTs of size m *or* the
DFTs of size r with their surrounding transposes as a single
distributed-DFT (ddft) plan. These two variations correspond to
apply_ddft_first or apply_ddft_last, respectively.
*/
static void apply_ddft_first(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_dft *cld_dft;
plan_rdft *cldt, *cld_ddft;
INT roff, ioff, im, mmax, ms, r, vn;
triggen *t;
R *dI, *dO;
/* distributed size-m DFTs, with output in m x r format */
cld_ddft = (plan_rdft *) ego->cld_ddft;
cld_ddft->apply(ego->cld_ddft, I, O);
cldt = (plan_rdft *) ego->cldt;
if (ego->preserve_input || !cldt) I = O;
/* twiddle multiplications, followed by 1d DFTs of size-r */
cld_dft = (plan_dft *) ego->cld_dft;
roff = ego->roff; ioff = ego->ioff;
mmax = ego->xmax; ms = ego->xs;
t = ego->t; r = ego->r; vn = ego->vn;
dI = O; dO = I;
for (im = ego->xmin; im <= mmax; ++im) {
do_twiddle(t, im, r, vn, dI+roff, dI+ioff);
cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
dI += ms; dO += ms;
}
/* final global transpose (m x r -> r x m), if not SCRAMBLED_OUT */
if (cldt)
cldt->apply((plan *) cldt, I, O);
}
static void apply_ddft_last(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_dft *cld_dft;
plan_rdft *cldt, *cld_ddft;
INT roff, ioff, ir, rmax, rs, m, vn;
triggen *t;
R *dI, *dO0, *dO;
/* initial global transpose (m x r -> r x m), if not SCRAMBLED_IN */
cldt = (plan_rdft *) ego->cldt;
if (cldt) {
cldt->apply((plan *) cldt, I, O);
dI = O;
}
else
dI = I;
if (ego->preserve_input) dO = O; else dO = I;
dO0 = dO;
/* 1d DFTs of size m, followed by twiddle multiplications */
cld_dft = (plan_dft *) ego->cld_dft;
roff = ego->roff; ioff = ego->ioff;
rmax = ego->xmax; rs = ego->xs;
t = ego->t; m = ego->m; vn = ego->vn;
for (ir = ego->xmin; ir <= rmax; ++ir) {
cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
do_twiddle(t, ir, m, vn, dO+roff, dO+ioff);
dI += rs; dO += rs;
}
/* distributed size-r DFTs, with output in r x m format */
cld_ddft = (plan_rdft *) ego->cld_ddft;
cld_ddft->apply(ego->cld_ddft, dO0, O);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr,
INT *r, INT rblock[2], INT mblock[2])
{
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
int n_pes;
MPI_Comm_size(p->comm, &n_pes);
return (1
&& p->sz->rnk == 1
&& ONLY_SCRAMBLEDP(p->flags)
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& (!(p->flags & SCRAMBLED_IN) || ego->apply == apply_ddft_last)
&& (!(p->flags & SCRAMBLED_OUT) || ego->apply == apply_ddft_first)
&& (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
|| !XM(dft_serial_applicable)(p))
/* disallow if dft-rank1-bigvec is applicable since the
data distribution may be slightly different (ugh!) */
&& (p->vn < n_pes || p->flags)
&& (*r = XM(choose_radix)(p->sz->dims[0], n_pes,
p->flags, p->sign,
rblock, mblock))
/* ddft_first or last has substantial advantages in the
bigvec transpositions for the common case where
n_pes == n/r or r, respectively */
&& (!NO_UGLYP(plnr)
|| !(*r == n_pes && ego->apply == apply_ddft_first)
|| !(p->sz->dims[0].n / *r == n_pes
&& ego->apply == apply_ddft_last))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cldt, wakefulness);
X(plan_awake)(ego->cld_dft, wakefulness);
X(plan_awake)(ego->cld_ddft, wakefulness);
switch (wakefulness) {
case SLEEPY:
X(triggen_destroy)(ego->t); ego->t = 0;
break;
default:
ego->t = X(mktriggen)(AWAKE_SQRTN_TABLE, ego->r * ego->m);
break;
}
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cldt);
X(plan_destroy_internal)(ego->cld_dft);
X(plan_destroy_internal)(ego->cld_ddft);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-dft-rank1/%D%s%s%(%p%)%(%p%)%(%p%))",
ego->r,
ego->super.apply == apply_ddft_first ? "/first" : "/last",
ego->preserve_input==2 ?"/p":"",
ego->cld_ddft, ego->cld_dft, ego->cldt);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_dft *p;
P *pln;
plan *cld_dft = 0, *cld_ddft = 0, *cldt = 0;
R *ri, *ii, *ro, *io, *I, *O;
INT r, rblock[2], m, mblock[2], rp, mp, mpblock[2], mpb;
int my_pe, n_pes, preserve_input, ddft_first;
dtensor *sz;
static const plan_adt padt = {
XM(dft_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr, &r, rblock, mblock))
return (plan *) 0;
p = (const problem_mpi_dft *) p_;
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
m = p->sz->dims[0].n / r;
/* some hackery so that we can plan both ddft_first and ddft_last
as if they were ddft_first */
if ((ddft_first = (ego->apply == apply_ddft_first))) {
rp = r; mp = m;
mpblock[IB] = mblock[IB]; mpblock[OB] = mblock[OB];
mpb = XM(block)(mp, mpblock[OB], my_pe);
}
else {
rp = m; mp = r;
mpblock[IB] = rblock[IB]; mpblock[OB] = rblock[OB];
mpb = XM(block)(mp, mpblock[IB], my_pe);
}
preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
sz = XM(mkdtensor)(1);
sz->dims[0].n = mp;
sz->dims[0].b[IB] = mpblock[IB];
sz->dims[0].b[OB] = mpblock[OB];
I = (ddft_first || !preserve_input) ? p->I : p->O;
O = p->O;
cld_ddft = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz, rp * p->vn,
I, O, p->comm, p->sign,
RANK1_BIGVEC_ONLY));
if (XM(any_true)(!cld_ddft, p->comm)) goto nada;
I = TAINT((ddft_first || !p->flags) ? p->O : p->I, rp * p->vn * 2);
O = TAINT((preserve_input || (ddft_first && p->flags)) ? p->O : p->I,
rp * p->vn * 2);
X(extract_reim)(p->sign, I, &ri, &ii);
X(extract_reim)(p->sign, O, &ro, &io);
cld_dft = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(X(mktensor_1d)(rp, p->vn*2,p->vn*2),
X(mktensor_1d)(p->vn, 2, 2),
ri, ii, ro, io));
if (XM(any_true)(!cld_dft, p->comm)) goto nada;
if (!p->flags) { /* !(SCRAMBLED_IN or SCRAMBLED_OUT) */
I = (ddft_first && preserve_input) ? p->O : p->I;
O = p->O;
cldt = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
m, r, p->vn * 2,
I, O,
ddft_first ? mblock[OB] : mblock[IB],
ddft_first ? rblock[OB] : rblock[IB],
p->comm, 0));
if (XM(any_true)(!cldt, p->comm)) goto nada;
}
pln = MKPLAN_MPI_DFT(P, &padt, ego->apply);
pln->cld_ddft = cld_ddft;
pln->cld_dft = cld_dft;
pln->cldt = cldt;
pln->preserve_input = preserve_input;
X(extract_reim)(p->sign, p->O, &ro, &io);
pln->roff = ro - p->O;
pln->ioff = io - p->O;
pln->vn = p->vn;
pln->m = m;
pln->r = r;
pln->xmin = (ddft_first ? mblock[OB] : rblock[IB]) * my_pe;
pln->xmax = pln->xmin + mpb - 1;
pln->xs = rp * p->vn * 2;
pln->t = 0;
X(ops_add)(&cld_ddft->ops, &cld_dft->ops, &pln->super.super.ops);
if (cldt) X(ops_add2)(&cldt->ops, &pln->super.super.ops);
{
double n0 = (1 + pln->xmax - pln->xmin) * (mp - 1) * pln->vn;
pln->super.super.ops.mul += 8 * n0;
pln->super.super.ops.add += 4 * n0;
pln->super.super.ops.other += 8 * n0;
}
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cldt);
X(plan_destroy_internal)(cld_dft);
X(plan_destroy_internal)(cld_ddft);
return (plan *) 0;
}
static solver *mksolver(rdftapply apply, int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->apply = apply;
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(dft_rank1_register)(planner *p)
{
rdftapply apply[] = { apply_ddft_first, apply_ddft_last };
unsigned int iapply;
int preserve_input;
for (iapply = 0; iapply < sizeof(apply) / sizeof(apply[0]); ++iapply)
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(apply[iapply], preserve_input));
}

View File

@@ -0,0 +1,130 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* "MPI" DFTs where all of the data is on one processor...just
call through to serial API. */
#include "mpi-dft.h"
#include "dft/dft.h"
typedef struct {
plan_mpi_dft super;
plan *cld;
INT roff, ioff;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_dft *cld;
INT roff = ego->roff, ioff = ego->ioff;
cld = (plan_dft *) ego->cld;
cld->apply(ego->cld, I+roff, I+ioff, O+roff, O+ioff);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-dft-serial %(%p%))", ego->cld);
}
int XM(dft_serial_applicable)(const problem_mpi_dft *p)
{
return (1
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
&& ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
|| p->vn == 0));
}
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
P *pln;
plan *cld;
int my_pe;
R *ri, *ii, *ro, *io;
static const plan_adt padt = {
XM(dft_solve), awake, print, destroy
};
UNUSED(ego);
/* check whether applicable: */
if (!XM(dft_serial_applicable)(p))
return (plan *) 0;
X(extract_reim)(p->sign, p->I, &ri, &ii);
X(extract_reim)(p->sign, p->O, &ro, &io);
MPI_Comm_rank(p->comm, &my_pe);
if (my_pe == 0 && p->vn > 0) {
int i, rnk = p->sz->rnk;
tensor *sz = X(mktensor)(p->sz->rnk);
sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
for (i = rnk - 1; i > 0; --i) {
sz->dims[i - 1].is = sz->dims[i - 1].os =
sz->dims[i].is * sz->dims[i].n;
sz->dims[i - 1].n = p->sz->dims[i - 1].n;
}
cld = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(sz,
X(mktensor_1d)(p->vn, 2, 2),
ri, ii, ro, io));
}
else { /* idle process: make nop plan */
cld = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(X(mktensor_0d)(),
X(mktensor_1d)(0,0,0),
ri, ii, ro, io));
}
if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
pln = MKPLAN_MPI_DFT(P, &padt, apply);
pln->cld = cld;
pln->roff = ro - p->O;
pln->ioff = io - p->O;
X(ops_cpy)(&cld->ops, &pln->super.super.ops);
return &(pln->super.super);
}
static solver *mksolver(void)
{
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
return MKSOLVER(solver, &sadt);
}
void XM(dft_serial_register)(planner *p)
{
REGISTER_SOLVER(p, mksolver());
}

View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-dft.h"
/* use the apply() operation for MPI_DFT problems */
void XM(dft_solve)(const plan *ego_, const problem *p_)
{
const plan_mpi_dft *ego = (const plan_mpi_dft *) ego_;
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
}

146
fftw-3.3.10/mpi/dtensor.c Normal file
View File

@@ -0,0 +1,146 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
dtensor *XM(mkdtensor)(int rnk)
{
dtensor *x;
A(rnk >= 0);
#if defined(STRUCT_HACK_KR)
if (FINITE_RNK(rnk) && rnk > 1)
x = (dtensor *)MALLOC(sizeof(dtensor) + (rnk - 1) * sizeof(ddim),
TENSORS);
else
x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
#elif defined(STRUCT_HACK_C99)
if (FINITE_RNK(rnk))
x = (dtensor *)MALLOC(sizeof(dtensor) + rnk * sizeof(ddim),
TENSORS);
else
x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
#else
x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
if (FINITE_RNK(rnk) && rnk > 0)
x->dims = (ddim *)MALLOC(sizeof(ddim) * rnk, TENSORS);
else
x->dims = 0;
#endif
x->rnk = rnk;
return x;
}
void XM(dtensor_destroy)(dtensor *sz)
{
#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
X(ifree0)(sz->dims);
#endif
X(ifree)(sz);
}
void XM(dtensor_md5)(md5 *p, const dtensor *t)
{
int i;
X(md5int)(p, t->rnk);
if (FINITE_RNK(t->rnk)) {
for (i = 0; i < t->rnk; ++i) {
const ddim *q = t->dims + i;
X(md5INT)(p, q->n);
X(md5INT)(p, q->b[IB]);
X(md5INT)(p, q->b[OB]);
}
}
}
dtensor *XM(dtensor_copy)(const dtensor *sz)
{
dtensor *x = XM(mkdtensor)(sz->rnk);
int i;
if (FINITE_RNK(sz->rnk))
for (i = 0; i < sz->rnk; ++i)
x->dims[i] = sz->dims[i];
return x;
}
dtensor *XM(dtensor_canonical)(const dtensor *sz, int compress)
{
int i, rnk;
dtensor *x;
block_kind k;
if (!FINITE_RNK(sz->rnk))
return XM(mkdtensor)(sz->rnk);
for (i = rnk = 0; i < sz->rnk; ++i) {
if (sz->dims[i].n <= 0)
return XM(mkdtensor)(RNK_MINFTY);
else if (!compress || sz->dims[i].n > 1)
++rnk;
}
x = XM(mkdtensor)(rnk);
for (i = rnk = 0; i < sz->rnk; ++i) {
if (!compress || sz->dims[i].n > 1) {
x->dims[rnk].n = sz->dims[i].n;
FORALL_BLOCK_KIND(k) {
if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]) == 1)
x->dims[rnk].b[k] = sz->dims[i].n;
else
x->dims[rnk].b[k] = sz->dims[i].b[k];
}
++rnk;
}
}
return x;
}
int XM(dtensor_validp)(const dtensor *sz)
{
int i;
if (sz->rnk < 0) return 0;
if (FINITE_RNK(sz->rnk))
for (i = 0; i < sz->rnk; ++i)
if (sz->dims[i].n < 0
|| sz->dims[i].b[IB] <= 0
|| sz->dims[i].b[OB] <= 0)
return 0;
return 1;
}
void XM(dtensor_print)(const dtensor *t, printer *p)
{
if (FINITE_RNK(t->rnk)) {
int i;
int first = 1;
p->print(p, "(");
for (i = 0; i < t->rnk; ++i) {
const ddim *d = t->dims + i;
p->print(p, "%s(%D %D %D)",
first ? "" : " ",
d->n, d->b[IB], d->b[OB]);
first = 0;
}
p->print(p, ")");
} else {
p->print(p, "rank-minfty");
}
}

284
fftw-3.3.10/mpi/f03-wrap.c Normal file
View File

@@ -0,0 +1,284 @@
/* Generated automatically. DO NOT EDIT! */
#include "fftw3-mpi.h"
#include "ifftw-mpi.h"
FFTW_EXTERN ptrdiff_t XM(local_size_many_transposed_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
FFTW_EXTERN ptrdiff_t XM(local_size_many_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
FFTW_EXTERN ptrdiff_t XM(local_size_transposed_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
FFTW_EXTERN ptrdiff_t XM(local_size_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
FFTW_EXTERN ptrdiff_t XM(local_size_many_1d_f03)(ptrdiff_t n0, ptrdiff_t howmany, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start);
FFTW_EXTERN ptrdiff_t XM(local_size_1d_f03)(ptrdiff_t n0, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start);
FFTW_EXTERN ptrdiff_t XM(local_size_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
FFTW_EXTERN ptrdiff_t XM(local_size_2d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
FFTW_EXTERN ptrdiff_t XM(local_size_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
FFTW_EXTERN ptrdiff_t XM(local_size_3d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
FFTW_EXTERN X(plan) XM(plan_many_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, R * in, R * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_many_dft_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_1d_f03)(ptrdiff_t n0, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_many_r2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_r2r_f03)(int rnk, const ptrdiff_t * n, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_r2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_r2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_many_dft_r2c_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_r2c_f03)(int rnk, const ptrdiff_t * n, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_r2c_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_r2c_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_many_dft_c2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_c2r_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_c2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN X(plan) XM(plan_dft_c2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
FFTW_EXTERN void XM(gather_wisdom_f03)(MPI_Fint f_comm_);
FFTW_EXTERN void XM(broadcast_wisdom_f03)(MPI_Fint f_comm_);
ptrdiff_t XM(local_size_many_transposed_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_many_transposed)(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start,local_n1,local_1_start);
}
ptrdiff_t XM(local_size_many_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_many)(rnk,n,howmany,block0,comm,local_n0,local_0_start);
}
ptrdiff_t XM(local_size_transposed_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_transposed)(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start);
}
ptrdiff_t XM(local_size_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size)(rnk,n,comm,local_n0,local_0_start);
}
ptrdiff_t XM(local_size_many_1d_f03)(ptrdiff_t n0, ptrdiff_t howmany, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_many_1d)(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start);
}
ptrdiff_t XM(local_size_1d_f03)(ptrdiff_t n0, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_1d)(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start);
}
ptrdiff_t XM(local_size_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_2d)(n0,n1,comm,local_n0,local_0_start);
}
ptrdiff_t XM(local_size_2d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_2d_transposed)(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start);
}
ptrdiff_t XM(local_size_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_3d)(n0,n1,n2,comm,local_n0,local_0_start);
}
ptrdiff_t XM(local_size_3d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(local_size_3d_transposed)(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start);
}
X(plan) XM(plan_many_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, R * in, R * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_many_transpose)(n0,n1,howmany,block0,block1,in,out,comm,flags);
}
X(plan) XM(plan_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_transpose)(n0,n1,in,out,comm,flags);
}
X(plan) XM(plan_many_dft_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_many_dft)(rnk,n,howmany,block,tblock,in,out,comm,sign,flags);
}
X(plan) XM(plan_dft_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft)(rnk,n,in,out,comm,sign,flags);
}
X(plan) XM(plan_dft_1d_f03)(ptrdiff_t n0, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_1d)(n0,in,out,comm,sign,flags);
}
X(plan) XM(plan_dft_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_2d)(n0,n1,in,out,comm,sign,flags);
}
X(plan) XM(plan_dft_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_3d)(n0,n1,n2,in,out,comm,sign,flags);
}
X(plan) XM(plan_many_r2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_many_r2r)(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags);
}
X(plan) XM(plan_r2r_f03)(int rnk, const ptrdiff_t * n, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_r2r)(rnk,n,in,out,comm,kind,flags);
}
X(plan) XM(plan_r2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_r2r_2d)(n0,n1,in,out,comm,kind0,kind1,flags);
}
X(plan) XM(plan_r2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_r2r_3d)(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags);
}
X(plan) XM(plan_many_dft_r2c_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_many_dft_r2c)(rnk,n,howmany,iblock,oblock,in,out,comm,flags);
}
X(plan) XM(plan_dft_r2c_f03)(int rnk, const ptrdiff_t * n, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_r2c)(rnk,n,in,out,comm,flags);
}
X(plan) XM(plan_dft_r2c_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_r2c_2d)(n0,n1,in,out,comm,flags);
}
X(plan) XM(plan_dft_r2c_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_r2c_3d)(n0,n1,n2,in,out,comm,flags);
}
X(plan) XM(plan_many_dft_c2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_many_dft_c2r)(rnk,n,howmany,iblock,oblock,in,out,comm,flags);
}
X(plan) XM(plan_dft_c2r_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_c2r)(rnk,n,in,out,comm,flags);
}
X(plan) XM(plan_dft_c2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_c2r_2d)(n0,n1,in,out,comm,flags);
}
X(plan) XM(plan_dft_c2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
{
MPI_Comm comm;
comm = MPI_Comm_f2c(f_comm);
return XM(plan_dft_c2r_3d)(n0,n1,n2,in,out,comm,flags);
}
void XM(gather_wisdom_f03)(MPI_Fint f_comm_)
{
MPI_Comm comm_;
comm_ = MPI_Comm_f2c(f_comm_);
XM(gather_wisdom)(comm_);
}
void XM(broadcast_wisdom_f03)(MPI_Fint f_comm_)
{
MPI_Comm comm_;
comm_ = MPI_Comm_f2c(f_comm_);
XM(broadcast_wisdom)(comm_);
}

22
fftw-3.3.10/mpi/f03-wrap.sh Executable file
View File

@@ -0,0 +1,22 @@
#! /bin/sh
# Script to generate Fortran 2003 wrappers for FFTW's MPI functions. This
# is necessary because MPI provides no way to deal with C MPI_Comm handles
# from Fortran (where MPI_Comm == integer), but does provide a way to
# deal with Fortran MPI_Comm handles from C (via MPI_Comm_f2c). So,
# every FFTW function that takes an MPI_Comm argument needs a wrapper
# function that takes a Fortran integer and converts it to MPI_Comm.
echo "/* Generated automatically. DO NOT EDIT! */"
echo
echo "#include \"fftw3-mpi.h\""
echo "#include \"ifftw-mpi.h\""
echo
# Declare prototypes using FFTW_EXTERN, important for Windows DLLs
grep -v 'mpi.h' fftw3-mpi.h | gcc -E -I../api - |grep "fftw_mpi_init" |tr ';' '\n' | grep "MPI_Comm" | perl genf03-wrap.pl | grep "MPI_Fint" | sed 's/^/FFTW_EXTERN /;s/$/;/'
grep -v 'mpi.h' fftw3-mpi.h | gcc -E -I../api - |grep "fftw_mpi_init" |tr ';' '\n' | grep "MPI_Comm" | perl genf03-wrap.pl

43
fftw-3.3.10/mpi/f03api.sh Executable file
View File

@@ -0,0 +1,43 @@
#! /bin/sh
# Script to generate Fortran 2003 interface declarations for FFTW's MPI
# interface from the fftw3-mpi.h header file.
# This is designed so that the Fortran caller can do:
# use, intrinsic :: iso_c_binding
# implicit none
# include 'fftw3-mpi.f03'
# and then call the C FFTW MPI functions directly, with type checking.
#
# One caveat: because there is no standard way to conver MPI_Comm objects
# from Fortran (= integer) to C (= opaque type), the Fortran interface
# technically calls C wrapper functions (also auto-generated) which
# call MPI_Comm_f2c to convert the communicators as needed.
echo "! Generated automatically. DO NOT EDIT!"
echo
echo " include 'fftw3.f03'"
echo
# Extract constants
perl -pe 's/#define +([A-Z0-9_]+) +\(([+-]?[0-9]+)U?\)/\n integer\(C_INTPTR_T\), parameter :: \1 = \2\n/g' < fftw3-mpi.h | grep 'integer(C_INTPTR_T)'
perl -pe 'if (/#define +([A-Z0-9_]+) +\(([0-9]+)U? *<< *([0-9]+)\)/) { print "\n integer\(C_INT\), parameter :: $1 = ",$2 << $3,"\n"; }' < fftw3-mpi.h | grep 'integer(C_INT)'
# Extract function declarations
for p in $*; do
if test "$p" = "d"; then p=""; fi
echo
cat <<EOF
type, bind(C) :: fftw${p}_mpi_ddim
integer(C_INTPTR_T) n, ib, ob
end type fftw${p}_mpi_ddim
EOF
echo
echo " interface"
grep -v 'mpi.h' fftw3-mpi.h | gcc -I../api -D__GNUC__=5 -D__i386__ -E - |grep "fftw${p}_mpi_init" |tr ';' '\n' | perl ../api/genf03.pl
echo " end interface"
done

View File

@@ -0,0 +1,810 @@
! Generated automatically. DO NOT EDIT!
include 'fftw3.f03'
integer(C_INTPTR_T), parameter :: FFTW_MPI_DEFAULT_BLOCK = 0
integer(C_INT), parameter :: FFTW_MPI_SCRAMBLED_IN = 134217728
integer(C_INT), parameter :: FFTW_MPI_SCRAMBLED_OUT = 268435456
integer(C_INT), parameter :: FFTW_MPI_TRANSPOSED_IN = 536870912
integer(C_INT), parameter :: FFTW_MPI_TRANSPOSED_OUT = 1073741824
type, bind(C) :: fftw_mpi_ddim
integer(C_INTPTR_T) n, ib, ob
end type fftw_mpi_ddim
interface
subroutine fftw_mpi_init() bind(C, name='fftw_mpi_init')
import
end subroutine fftw_mpi_init
subroutine fftw_mpi_cleanup() bind(C, name='fftw_mpi_cleanup')
import
end subroutine fftw_mpi_cleanup
integer(C_INTPTR_T) function fftw_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
local_n1,local_1_start) &
bind(C, name='fftw_mpi_local_size_many_transposed_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_INTPTR_T), value :: block1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftw_mpi_local_size_many_transposed
integer(C_INTPTR_T) function fftw_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
bind(C, name='fftw_mpi_local_size_many_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftw_mpi_local_size_many
integer(C_INTPTR_T) function fftw_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftw_mpi_local_size_transposed_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftw_mpi_local_size_transposed
integer(C_INTPTR_T) function fftw_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftw_mpi_local_size_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftw_mpi_local_size
integer(C_INTPTR_T) function fftw_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
local_o_start) bind(C, name='fftw_mpi_local_size_many_1d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: howmany
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
integer(C_INTPTR_T), intent(out) :: local_ni
integer(C_INTPTR_T), intent(out) :: local_i_start
integer(C_INTPTR_T), intent(out) :: local_no
integer(C_INTPTR_T), intent(out) :: local_o_start
end function fftw_mpi_local_size_many_1d
integer(C_INTPTR_T) function fftw_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
bind(C, name='fftw_mpi_local_size_1d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
integer(C_INTPTR_T), intent(out) :: local_ni
integer(C_INTPTR_T), intent(out) :: local_i_start
integer(C_INTPTR_T), intent(out) :: local_no
integer(C_INTPTR_T), intent(out) :: local_o_start
end function fftw_mpi_local_size_1d
integer(C_INTPTR_T) function fftw_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
bind(C, name='fftw_mpi_local_size_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftw_mpi_local_size_2d
integer(C_INTPTR_T) function fftw_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftw_mpi_local_size_2d_transposed_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftw_mpi_local_size_2d_transposed
integer(C_INTPTR_T) function fftw_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
bind(C, name='fftw_mpi_local_size_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftw_mpi_local_size_3d
integer(C_INTPTR_T) function fftw_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftw_mpi_local_size_3d_transposed_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftw_mpi_local_size_3d_transposed
type(C_PTR) function fftw_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
bind(C, name='fftw_mpi_plan_many_transpose_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_INTPTR_T), value :: block1
real(C_DOUBLE), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_many_transpose
type(C_PTR) function fftw_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_transpose_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_DOUBLE), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_transpose
type(C_PTR) function fftw_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
bind(C, name='fftw_mpi_plan_many_dft_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block
integer(C_INTPTR_T), value :: tblock
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftw_mpi_plan_many_dft
type(C_PTR) function fftw_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft
type(C_PTR) function fftw_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_1d_f03')
import
integer(C_INTPTR_T), value :: n0
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_1d
type(C_PTR) function fftw_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_2d
type(C_PTR) function fftw_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_3d
type(C_PTR) function fftw_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
bind(C, name='fftw_mpi_plan_many_r2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
real(C_DOUBLE), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
integer(C_INT), value :: flags
end function fftw_mpi_plan_many_r2r
type(C_PTR) function fftw_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftw_mpi_plan_r2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
real(C_DOUBLE), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
integer(C_INT), value :: flags
end function fftw_mpi_plan_r2r
type(C_PTR) function fftw_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftw_mpi_plan_r2r_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_DOUBLE), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), value :: kind0
integer(C_FFTW_R2R_KIND), value :: kind1
integer(C_INT), value :: flags
end function fftw_mpi_plan_r2r_2d
type(C_PTR) function fftw_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) bind(C, name='fftw_mpi_plan_r2r_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
real(C_DOUBLE), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), value :: kind0
integer(C_FFTW_R2R_KIND), value :: kind1
integer(C_FFTW_R2R_KIND), value :: kind2
integer(C_INT), value :: flags
end function fftw_mpi_plan_r2r_3d
type(C_PTR) function fftw_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
bind(C, name='fftw_mpi_plan_many_dft_r2c_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
real(C_DOUBLE), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_many_dft_r2c
type(C_PTR) function fftw_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
real(C_DOUBLE), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_r2c
type(C_PTR) function fftw_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_DOUBLE), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_r2c_2d
type(C_PTR) function fftw_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
real(C_DOUBLE), dimension(*), intent(out) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_r2c_3d
type(C_PTR) function fftw_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
bind(C, name='fftw_mpi_plan_many_dft_c2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_many_dft_c2r
type(C_PTR) function fftw_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_c2r
type(C_PTR) function fftw_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_c2r_2d
type(C_PTR) function fftw_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftw_mpi_plan_dft_c2r_3d
subroutine fftw_mpi_gather_wisdom(comm_) bind(C, name='fftw_mpi_gather_wisdom_f03')
import
integer(C_MPI_FINT), value :: comm_
end subroutine fftw_mpi_gather_wisdom
subroutine fftw_mpi_broadcast_wisdom(comm_) bind(C, name='fftw_mpi_broadcast_wisdom_f03')
import
integer(C_MPI_FINT), value :: comm_
end subroutine fftw_mpi_broadcast_wisdom
subroutine fftw_mpi_execute_dft(p,in,out) bind(C, name='fftw_mpi_execute_dft')
import
type(C_PTR), value :: p
complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
end subroutine fftw_mpi_execute_dft
subroutine fftw_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftw_mpi_execute_dft_r2c')
import
type(C_PTR), value :: p
real(C_DOUBLE), dimension(*), intent(inout) :: in
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
end subroutine fftw_mpi_execute_dft_r2c
subroutine fftw_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftw_mpi_execute_dft_c2r')
import
type(C_PTR), value :: p
complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
end subroutine fftw_mpi_execute_dft_c2r
subroutine fftw_mpi_execute_r2r(p,in,out) bind(C, name='fftw_mpi_execute_r2r')
import
type(C_PTR), value :: p
real(C_DOUBLE), dimension(*), intent(inout) :: in
real(C_DOUBLE), dimension(*), intent(out) :: out
end subroutine fftw_mpi_execute_r2r
end interface
type, bind(C) :: fftwf_mpi_ddim
integer(C_INTPTR_T) n, ib, ob
end type fftwf_mpi_ddim
interface
subroutine fftwf_mpi_init() bind(C, name='fftwf_mpi_init')
import
end subroutine fftwf_mpi_init
subroutine fftwf_mpi_cleanup() bind(C, name='fftwf_mpi_cleanup')
import
end subroutine fftwf_mpi_cleanup
integer(C_INTPTR_T) function fftwf_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
local_n1,local_1_start) &
bind(C, name='fftwf_mpi_local_size_many_transposed_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_INTPTR_T), value :: block1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftwf_mpi_local_size_many_transposed
integer(C_INTPTR_T) function fftwf_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
bind(C, name='fftwf_mpi_local_size_many_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftwf_mpi_local_size_many
integer(C_INTPTR_T) function fftwf_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftwf_mpi_local_size_transposed_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftwf_mpi_local_size_transposed
integer(C_INTPTR_T) function fftwf_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftwf_mpi_local_size_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftwf_mpi_local_size
integer(C_INTPTR_T) function fftwf_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
local_o_start) bind(C, name='fftwf_mpi_local_size_many_1d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: howmany
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
integer(C_INTPTR_T), intent(out) :: local_ni
integer(C_INTPTR_T), intent(out) :: local_i_start
integer(C_INTPTR_T), intent(out) :: local_no
integer(C_INTPTR_T), intent(out) :: local_o_start
end function fftwf_mpi_local_size_many_1d
integer(C_INTPTR_T) function fftwf_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
bind(C, name='fftwf_mpi_local_size_1d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
integer(C_INTPTR_T), intent(out) :: local_ni
integer(C_INTPTR_T), intent(out) :: local_i_start
integer(C_INTPTR_T), intent(out) :: local_no
integer(C_INTPTR_T), intent(out) :: local_o_start
end function fftwf_mpi_local_size_1d
integer(C_INTPTR_T) function fftwf_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
bind(C, name='fftwf_mpi_local_size_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftwf_mpi_local_size_2d
integer(C_INTPTR_T) function fftwf_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftwf_mpi_local_size_2d_transposed_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftwf_mpi_local_size_2d_transposed
integer(C_INTPTR_T) function fftwf_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
bind(C, name='fftwf_mpi_local_size_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftwf_mpi_local_size_3d
integer(C_INTPTR_T) function fftwf_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftwf_mpi_local_size_3d_transposed_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftwf_mpi_local_size_3d_transposed
type(C_PTR) function fftwf_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
bind(C, name='fftwf_mpi_plan_many_transpose_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_INTPTR_T), value :: block1
real(C_FLOAT), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_many_transpose
type(C_PTR) function fftwf_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_transpose_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_FLOAT), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_transpose
type(C_PTR) function fftwf_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
bind(C, name='fftwf_mpi_plan_many_dft_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block
integer(C_INTPTR_T), value :: tblock
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwf_mpi_plan_many_dft
type(C_PTR) function fftwf_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft
type(C_PTR) function fftwf_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_1d_f03')
import
integer(C_INTPTR_T), value :: n0
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_1d
type(C_PTR) function fftwf_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_2d
type(C_PTR) function fftwf_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_3d
type(C_PTR) function fftwf_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
bind(C, name='fftwf_mpi_plan_many_r2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
real(C_FLOAT), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
integer(C_INT), value :: flags
end function fftwf_mpi_plan_many_r2r
type(C_PTR) function fftwf_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftwf_mpi_plan_r2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
real(C_FLOAT), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
integer(C_INT), value :: flags
end function fftwf_mpi_plan_r2r
type(C_PTR) function fftwf_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftwf_mpi_plan_r2r_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_FLOAT), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), value :: kind0
integer(C_FFTW_R2R_KIND), value :: kind1
integer(C_INT), value :: flags
end function fftwf_mpi_plan_r2r_2d
type(C_PTR) function fftwf_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) &
bind(C, name='fftwf_mpi_plan_r2r_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
real(C_FLOAT), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), value :: kind0
integer(C_FFTW_R2R_KIND), value :: kind1
integer(C_FFTW_R2R_KIND), value :: kind2
integer(C_INT), value :: flags
end function fftwf_mpi_plan_r2r_3d
type(C_PTR) function fftwf_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
bind(C, name='fftwf_mpi_plan_many_dft_r2c_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
real(C_FLOAT), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_many_dft_r2c
type(C_PTR) function fftwf_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
real(C_FLOAT), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_r2c
type(C_PTR) function fftwf_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_FLOAT), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_r2c_2d
type(C_PTR) function fftwf_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
real(C_FLOAT), dimension(*), intent(out) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_r2c_3d
type(C_PTR) function fftwf_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
bind(C, name='fftwf_mpi_plan_many_dft_c2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_many_dft_c2r
type(C_PTR) function fftwf_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_c2r
type(C_PTR) function fftwf_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_c2r_2d
type(C_PTR) function fftwf_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwf_mpi_plan_dft_c2r_3d
subroutine fftwf_mpi_gather_wisdom(comm_) bind(C, name='fftwf_mpi_gather_wisdom_f03')
import
integer(C_MPI_FINT), value :: comm_
end subroutine fftwf_mpi_gather_wisdom
subroutine fftwf_mpi_broadcast_wisdom(comm_) bind(C, name='fftwf_mpi_broadcast_wisdom_f03')
import
integer(C_MPI_FINT), value :: comm_
end subroutine fftwf_mpi_broadcast_wisdom
subroutine fftwf_mpi_execute_dft(p,in,out) bind(C, name='fftwf_mpi_execute_dft')
import
type(C_PTR), value :: p
complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
end subroutine fftwf_mpi_execute_dft
subroutine fftwf_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftwf_mpi_execute_dft_r2c')
import
type(C_PTR), value :: p
real(C_FLOAT), dimension(*), intent(inout) :: in
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
end subroutine fftwf_mpi_execute_dft_r2c
subroutine fftwf_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftwf_mpi_execute_dft_c2r')
import
type(C_PTR), value :: p
complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
end subroutine fftwf_mpi_execute_dft_c2r
subroutine fftwf_mpi_execute_r2r(p,in,out) bind(C, name='fftwf_mpi_execute_r2r')
import
type(C_PTR), value :: p
real(C_FLOAT), dimension(*), intent(inout) :: in
real(C_FLOAT), dimension(*), intent(out) :: out
end subroutine fftwf_mpi_execute_r2r
end interface

221
fftw-3.3.10/mpi/fftw3-mpi.h Normal file
View File

@@ -0,0 +1,221 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* The following statement of license applies *only* to this header file,
* and *not* to the other files distributed with FFTW or derived therefrom:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/***************************** NOTE TO USERS *********************************
*
* THIS IS A HEADER FILE, NOT A MANUAL
*
* If you want to know how to use FFTW, please read the manual,
* online at http://www.fftw.org/doc/ and also included with FFTW.
* For a quick start, see the manual's tutorial section.
*
* (Reading header files to learn how to use a library is a habit
* stemming from code lacking a proper manual. Arguably, it's a
* *bad* habit in most cases, because header files can contain
* interfaces that are not part of the public, stable API.)
*
****************************************************************************/
#ifndef FFTW3_MPI_H
#define FFTW3_MPI_H
#include <fftw3.h>
#include <mpi.h>
#ifdef __cplusplus
extern "C"
{
#endif /* __cplusplus */
struct fftw_mpi_ddim_do_not_use_me {
ptrdiff_t n; /* dimension size */
ptrdiff_t ib; /* input block */
ptrdiff_t ob; /* output block */
};
/*
huge second-order macro that defines prototypes for all API
functions. We expand this macro for each supported precision
XM: name-mangling macro (MPI)
X: name-mangling macro (serial)
R: real data type
C: complex data type
*/
#define FFTW_MPI_DEFINE_API(XM, X, R, C) \
\
typedef struct fftw_mpi_ddim_do_not_use_me XM(ddim); \
\
FFTW_EXTERN void XM(init)(void); \
FFTW_EXTERN void XM(cleanup)(void); \
\
FFTW_EXTERN ptrdiff_t XM(local_size_many_transposed) \
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
ptrdiff_t block0, ptrdiff_t block1, MPI_Comm comm, \
ptrdiff_t *local_n0, ptrdiff_t *local_0_start, \
ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
FFTW_EXTERN ptrdiff_t XM(local_size_many) \
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
ptrdiff_t block0, MPI_Comm comm, \
ptrdiff_t *local_n0, ptrdiff_t *local_0_start); \
FFTW_EXTERN ptrdiff_t XM(local_size_transposed) \
(int rnk, const ptrdiff_t *n, MPI_Comm comm, \
ptrdiff_t *local_n0, ptrdiff_t *local_0_start, \
ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
FFTW_EXTERN ptrdiff_t XM(local_size) \
(int rnk, const ptrdiff_t *n, MPI_Comm comm, \
ptrdiff_t *local_n0, ptrdiff_t *local_0_start); \
FFTW_EXTERN ptrdiff_t XM(local_size_many_1d)( \
ptrdiff_t n0, ptrdiff_t howmany, \
MPI_Comm comm, int sign, unsigned flags, \
ptrdiff_t *local_ni, ptrdiff_t *local_i_start, \
ptrdiff_t *local_no, ptrdiff_t *local_o_start); \
FFTW_EXTERN ptrdiff_t XM(local_size_1d)( \
ptrdiff_t n0, MPI_Comm comm, int sign, unsigned flags, \
ptrdiff_t *local_ni, ptrdiff_t *local_i_start, \
ptrdiff_t *local_no, ptrdiff_t *local_o_start); \
FFTW_EXTERN ptrdiff_t XM(local_size_2d)( \
ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm, \
ptrdiff_t *local_n0, ptrdiff_t *local_0_start); \
FFTW_EXTERN ptrdiff_t XM(local_size_2d_transposed)( \
ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm, \
ptrdiff_t *local_n0, ptrdiff_t *local_0_start, \
ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
FFTW_EXTERN ptrdiff_t XM(local_size_3d)( \
ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm, \
ptrdiff_t *local_n0, ptrdiff_t *local_0_start); \
FFTW_EXTERN ptrdiff_t XM(local_size_3d_transposed)( \
ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm, \
ptrdiff_t *local_n0, ptrdiff_t *local_0_start, \
ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
\
FFTW_EXTERN X(plan) XM(plan_many_transpose) \
(ptrdiff_t n0, ptrdiff_t n1, \
ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, \
R *in, R *out, MPI_Comm comm, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_transpose) \
(ptrdiff_t n0, ptrdiff_t n1, \
R *in, R *out, MPI_Comm comm, unsigned flags); \
\
FFTW_EXTERN X(plan) XM(plan_many_dft) \
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
ptrdiff_t block, ptrdiff_t tblock, C *in, C *out, \
MPI_Comm comm, int sign, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft) \
(int rnk, const ptrdiff_t *n, C *in, C *out, \
MPI_Comm comm, int sign, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_1d) \
(ptrdiff_t n0, C *in, C *out, \
MPI_Comm comm, int sign, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_2d) \
(ptrdiff_t n0, ptrdiff_t n1, C *in, C *out, \
MPI_Comm comm, int sign, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_3d) \
(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, C *in, C *out, \
MPI_Comm comm, int sign, unsigned flags); \
\
FFTW_EXTERN X(plan) XM(plan_many_r2r) \
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
ptrdiff_t iblock, ptrdiff_t oblock, R *in, R *out, \
MPI_Comm comm, const X(r2r_kind) *kind, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_r2r) \
(int rnk, const ptrdiff_t *n, R *in, R *out, \
MPI_Comm comm, const X(r2r_kind) *kind, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_r2r_2d) \
(ptrdiff_t n0, ptrdiff_t n1, R *in, R *out, MPI_Comm comm, \
X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_r2r_3d) \
(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, \
R *in, R *out, MPI_Comm comm, X(r2r_kind) kind0, \
X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags); \
\
FFTW_EXTERN X(plan) XM(plan_many_dft_r2c) \
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
ptrdiff_t iblock, ptrdiff_t oblock, R *in, C *out, \
MPI_Comm comm, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_r2c) \
(int rnk, const ptrdiff_t *n, R *in, C *out, \
MPI_Comm comm, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_r2c_2d) \
(ptrdiff_t n0, ptrdiff_t n1, R *in, C *out, \
MPI_Comm comm, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_r2c_3d) \
(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R *in, C *out, \
MPI_Comm comm, unsigned flags); \
\
FFTW_EXTERN X(plan) XM(plan_many_dft_c2r) \
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
ptrdiff_t iblock, ptrdiff_t oblock, C *in, R *out, \
MPI_Comm comm, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_c2r) \
(int rnk, const ptrdiff_t *n, C *in, R *out, \
MPI_Comm comm, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_c2r_2d) \
(ptrdiff_t n0, ptrdiff_t n1, C *in, R *out, \
MPI_Comm comm, unsigned flags); \
FFTW_EXTERN X(plan) XM(plan_dft_c2r_3d) \
(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, C *in, R *out, \
MPI_Comm comm, unsigned flags); \
\
FFTW_EXTERN void XM(gather_wisdom)(MPI_Comm comm_); \
FFTW_EXTERN void XM(broadcast_wisdom)(MPI_Comm comm_); \
\
FFTW_EXTERN void XM(execute_dft)(X(plan) p, C *in, C *out); \
FFTW_EXTERN void XM(execute_dft_r2c)(X(plan) p, R *in, C *out); \
FFTW_EXTERN void XM(execute_dft_c2r)(X(plan) p, C *in, R *out); \
FFTW_EXTERN void XM(execute_r2r)(X(plan) p, R *in, R *out);
/* end of FFTW_MPI_DEFINE_API macro */
#define FFTW_MPI_MANGLE_DOUBLE(name) FFTW_MANGLE_DOUBLE(FFTW_CONCAT(mpi_,name))
#define FFTW_MPI_MANGLE_FLOAT(name) FFTW_MANGLE_FLOAT(FFTW_CONCAT(mpi_,name))
#define FFTW_MPI_MANGLE_LONG_DOUBLE(name) FFTW_MANGLE_LONG_DOUBLE(FFTW_CONCAT(mpi_,name))
FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_DOUBLE, FFTW_MANGLE_DOUBLE, double, fftw_complex)
FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_FLOAT, FFTW_MANGLE_FLOAT, float, fftwf_complex)
FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_LONG_DOUBLE, FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
#define FFTW_MPI_DEFAULT_BLOCK (0)
/* MPI-specific flags */
#define FFTW_MPI_SCRAMBLED_IN (1U << 27)
#define FFTW_MPI_SCRAMBLED_OUT (1U << 28)
#define FFTW_MPI_TRANSPOSED_IN (1U << 29)
#define FFTW_MPI_TRANSPOSED_OUT (1U << 30)
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* FFTW3_MPI_H */

View File

@@ -0,0 +1,405 @@
! Generated automatically. DO NOT EDIT!
include 'fftw3l.f03'
type, bind(C) :: fftwl_mpi_ddim
integer(C_INTPTR_T) n, ib, ob
end type fftwl_mpi_ddim
interface
subroutine fftwl_mpi_init() bind(C, name='fftwl_mpi_init')
import
end subroutine fftwl_mpi_init
subroutine fftwl_mpi_cleanup() bind(C, name='fftwl_mpi_cleanup')
import
end subroutine fftwl_mpi_cleanup
integer(C_INTPTR_T) function fftwl_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
local_n1,local_1_start) &
bind(C, name='fftwl_mpi_local_size_many_transposed_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_INTPTR_T), value :: block1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftwl_mpi_local_size_many_transposed
integer(C_INTPTR_T) function fftwl_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
bind(C, name='fftwl_mpi_local_size_many_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftwl_mpi_local_size_many
integer(C_INTPTR_T) function fftwl_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftwl_mpi_local_size_transposed_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftwl_mpi_local_size_transposed
integer(C_INTPTR_T) function fftwl_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftwl_mpi_local_size_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftwl_mpi_local_size
integer(C_INTPTR_T) function fftwl_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
local_o_start) bind(C, name='fftwl_mpi_local_size_many_1d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: howmany
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
integer(C_INTPTR_T), intent(out) :: local_ni
integer(C_INTPTR_T), intent(out) :: local_i_start
integer(C_INTPTR_T), intent(out) :: local_no
integer(C_INTPTR_T), intent(out) :: local_o_start
end function fftwl_mpi_local_size_many_1d
integer(C_INTPTR_T) function fftwl_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
bind(C, name='fftwl_mpi_local_size_1d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
integer(C_INTPTR_T), intent(out) :: local_ni
integer(C_INTPTR_T), intent(out) :: local_i_start
integer(C_INTPTR_T), intent(out) :: local_no
integer(C_INTPTR_T), intent(out) :: local_o_start
end function fftwl_mpi_local_size_1d
integer(C_INTPTR_T) function fftwl_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
bind(C, name='fftwl_mpi_local_size_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftwl_mpi_local_size_2d
integer(C_INTPTR_T) function fftwl_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftwl_mpi_local_size_2d_transposed_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftwl_mpi_local_size_2d_transposed
integer(C_INTPTR_T) function fftwl_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
bind(C, name='fftwl_mpi_local_size_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
end function fftwl_mpi_local_size_3d
integer(C_INTPTR_T) function fftwl_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
bind(C, name='fftwl_mpi_local_size_3d_transposed_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
integer(C_MPI_FINT), value :: comm
integer(C_INTPTR_T), intent(out) :: local_n0
integer(C_INTPTR_T), intent(out) :: local_0_start
integer(C_INTPTR_T), intent(out) :: local_n1
integer(C_INTPTR_T), intent(out) :: local_1_start
end function fftwl_mpi_local_size_3d_transposed
type(C_PTR) function fftwl_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
bind(C, name='fftwl_mpi_plan_many_transpose_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block0
integer(C_INTPTR_T), value :: block1
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_many_transpose
type(C_PTR) function fftwl_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_transpose_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_transpose
type(C_PTR) function fftwl_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
bind(C, name='fftwl_mpi_plan_many_dft_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: block
integer(C_INTPTR_T), value :: tblock
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwl_mpi_plan_many_dft
type(C_PTR) function fftwl_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft
type(C_PTR) function fftwl_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_1d_f03')
import
integer(C_INTPTR_T), value :: n0
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_1d
type(C_PTR) function fftwl_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_2d
type(C_PTR) function fftwl_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: sign
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_3d
type(C_PTR) function fftwl_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
bind(C, name='fftwl_mpi_plan_many_r2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
integer(C_INT), value :: flags
end function fftwl_mpi_plan_many_r2r
type(C_PTR) function fftwl_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftwl_mpi_plan_r2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
integer(C_INT), value :: flags
end function fftwl_mpi_plan_r2r
type(C_PTR) function fftwl_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftwl_mpi_plan_r2r_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), value :: kind0
integer(C_FFTW_R2R_KIND), value :: kind1
integer(C_INT), value :: flags
end function fftwl_mpi_plan_r2r_2d
type(C_PTR) function fftwl_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) &
bind(C, name='fftwl_mpi_plan_r2r_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_FFTW_R2R_KIND), value :: kind0
integer(C_FFTW_R2R_KIND), value :: kind1
integer(C_FFTW_R2R_KIND), value :: kind2
integer(C_INT), value :: flags
end function fftwl_mpi_plan_r2r_3d
type(C_PTR) function fftwl_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
bind(C, name='fftwl_mpi_plan_many_dft_r2c_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_many_dft_r2c
type(C_PTR) function fftwl_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_r2c
type(C_PTR) function fftwl_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_r2c_2d
type(C_PTR) function fftwl_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_r2c_3d
type(C_PTR) function fftwl_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
bind(C, name='fftwl_mpi_plan_many_dft_c2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
integer(C_INTPTR_T), value :: howmany
integer(C_INTPTR_T), value :: iblock
integer(C_INTPTR_T), value :: oblock
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_many_dft_c2r
type(C_PTR) function fftwl_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_f03')
import
integer(C_INT), value :: rnk
integer(C_INTPTR_T), dimension(*), intent(in) :: n
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_c2r
type(C_PTR) function fftwl_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_2d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_c2r_2d
type(C_PTR) function fftwl_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_3d_f03')
import
integer(C_INTPTR_T), value :: n0
integer(C_INTPTR_T), value :: n1
integer(C_INTPTR_T), value :: n2
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
integer(C_MPI_FINT), value :: comm
integer(C_INT), value :: flags
end function fftwl_mpi_plan_dft_c2r_3d
subroutine fftwl_mpi_gather_wisdom(comm_) bind(C, name='fftwl_mpi_gather_wisdom_f03')
import
integer(C_MPI_FINT), value :: comm_
end subroutine fftwl_mpi_gather_wisdom
subroutine fftwl_mpi_broadcast_wisdom(comm_) bind(C, name='fftwl_mpi_broadcast_wisdom_f03')
import
integer(C_MPI_FINT), value :: comm_
end subroutine fftwl_mpi_broadcast_wisdom
subroutine fftwl_mpi_execute_dft(p,in,out) bind(C, name='fftwl_mpi_execute_dft')
import
type(C_PTR), value :: p
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
end subroutine fftwl_mpi_execute_dft
subroutine fftwl_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftwl_mpi_execute_dft_r2c')
import
type(C_PTR), value :: p
real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
end subroutine fftwl_mpi_execute_dft_r2c
subroutine fftwl_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftwl_mpi_execute_dft_c2r')
import
type(C_PTR), value :: p
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
end subroutine fftwl_mpi_execute_dft_c2r
subroutine fftwl_mpi_execute_r2r(p,in,out) bind(C, name='fftwl_mpi_execute_r2r')
import
type(C_PTR), value :: p
real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
end subroutine fftwl_mpi_execute_r2r
end interface

78
fftw-3.3.10/mpi/genf03-wrap.pl Executable file
View File

@@ -0,0 +1,78 @@
#!/usr/bin/perl -w
# Generate Fortran 2003 wrappers (which translate MPI_Comm from f2c) from
# function declarations of the form (one per line):
# extern <type> fftw_mpi_<name>(...args...)
# extern <type> fftw_mpi_<name>(...args...)
# ...
# with no line breaks within a given function. (It's too much work to
# write a general parser, since we just have to handle FFTW's header files.)
# Each declaration has at least one MPI_Comm argument.
sub canonicalize_type {
my($type);
($type) = @_;
$type =~ s/ +/ /g;
$type =~ s/^ //;
$type =~ s/ $//;
$type =~ s/([^\* ])\*/$1 \*/g;
$type =~ s/double/R/;
$type =~ s/fftw_([A-Za-z0-9_]+)/X(\1)/;
return $type;
}
while (<>) {
next if /^ *$/;
if (/^ *extern +([a-zA-Z_0-9 ]+[ \*]) *fftw_mpi_([a-zA-Z_0-9]+) *\((.*)\) *$/) {
$ret = &canonicalize_type($1);
$name = $2;
$args = $3;
print "\n$ret XM(${name}_f03)(";
$comma = "";
foreach $arg (split(/ *, */, $args)) {
$arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
$argtype = &canonicalize_type($1);
$argname = $2;
print $comma;
if ($argtype eq "MPI_Comm") {
print "MPI_Fint f_$argname";
}
else {
print "$argtype $argname";
}
$comma = ", ";
}
print ")\n{\n";
print " MPI_Comm ";
$comma = "";
foreach $arg (split(/ *, */, $args)) {
$arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
$argtype = &canonicalize_type($1);
$argname = $2;
if ($argtype eq "MPI_Comm") {
print "$comma$argname";
$comma = ", ";
}
}
print ";\n\n";
foreach $arg (split(/ *, */, $args)) {
$arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
$argtype = &canonicalize_type($1);
$argname = $2;
if ($argtype eq "MPI_Comm") {
print " $argname = MPI_Comm_f2c(f_$argname);\n";
}
}
$argnames = $args;
$argnames =~ s/([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) */$2/g;
print " ";
print "return " if ($ret ne "void");
print "XM($name)($argnames);\n}\n";
}
}

151
fftw-3.3.10/mpi/ifftw-mpi.h Normal file
View File

@@ -0,0 +1,151 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* FFTW-MPI internal header file */
#ifndef __IFFTW_MPI_H__
#define __IFFTW_MPI_H__
#include "kernel/ifftw.h"
#include "rdft/rdft.h"
#include <mpi.h>
/* mpi problem flags: problem-dependent meaning, but in general
SCRAMBLED means some reordering *within* the dimensions, while
TRANSPOSED means some reordering *of* the dimensions */
#define SCRAMBLED_IN (1 << 0)
#define SCRAMBLED_OUT (1 << 1)
#define TRANSPOSED_IN (1 << 2)
#define TRANSPOSED_OUT (1 << 3)
#define RANK1_BIGVEC_ONLY (1 << 4) /* for rank=1, allow only bigvec solver */
#define ONLY_SCRAMBLEDP(flags) (!((flags) & ~(SCRAMBLED_IN|SCRAMBLED_OUT)))
#define ONLY_TRANSPOSEDP(flags) (!((flags) & ~(TRANSPOSED_IN|TRANSPOSED_OUT)))
#if defined(FFTW_SINGLE)
# define FFTW_MPI_TYPE MPI_FLOAT
#elif defined(FFTW_LDOUBLE)
# define FFTW_MPI_TYPE MPI_LONG_DOUBLE
#elif defined(FFTW_QUAD)
# error MPI quad-precision type is unknown
#else
# define FFTW_MPI_TYPE MPI_DOUBLE
#endif
/* all fftw-mpi identifiers start with fftw_mpi (or fftwf_mpi etc.) */
#define XM(name) X(CONCAT(mpi_, name))
/***********************************************************************/
/* block distributions */
/* a distributed dimension of length n with input and output block
sizes ib and ob, respectively. */
typedef enum { IB = 0, OB } block_kind;
typedef struct {
INT n;
INT b[2]; /* b[IB], b[OB] */
} ddim;
/* Loop over k in {IB, OB}. Note: need explicit casts for C++. */
#define FORALL_BLOCK_KIND(k) for (k = IB; k <= OB; k = (block_kind) (((int) k) + 1))
/* unlike tensors in the serial FFTW, the ordering of the dtensor
dimensions matters - both the array and the block layout are
row-major order. */
typedef struct {
int rnk;
#if defined(STRUCT_HACK_KR)
ddim dims[1];
#elif defined(STRUCT_HACK_C99)
ddim dims[];
#else
ddim *dims;
#endif
} dtensor;
/* dtensor.c: */
dtensor *XM(mkdtensor)(int rnk);
void XM(dtensor_destroy)(dtensor *sz);
dtensor *XM(dtensor_copy)(const dtensor *sz);
dtensor *XM(dtensor_canonical)(const dtensor *sz, int compress);
int XM(dtensor_validp)(const dtensor *sz);
void XM(dtensor_md5)(md5 *p, const dtensor *t);
void XM(dtensor_print)(const dtensor *t, printer *p);
/* block.c: */
/* for a single distributed dimension: */
INT XM(num_blocks)(INT n, INT block);
int XM(num_blocks_ok)(INT n, INT block, MPI_Comm comm);
INT XM(default_block)(INT n, int n_pes);
INT XM(block)(INT n, INT block, int which_block);
/* for multiple distributed dimensions: */
INT XM(num_blocks_total)(const dtensor *sz, block_kind k);
int XM(idle_process)(const dtensor *sz, block_kind k, int which_pe);
void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe,
INT *coords);
INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe);
int XM(is_local_after)(int dim, const dtensor *sz, block_kind k);
int XM(is_local)(const dtensor *sz, block_kind k);
int XM(is_block1d)(const dtensor *sz, block_kind k);
/* choose-radix.c */
INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
INT rblock[2], INT mblock[2]);
/***********************************************************************/
/* any_true.c */
int XM(any_true)(int condition, MPI_Comm comm);
int XM(md5_equal)(md5 m, MPI_Comm comm);
/* conf.c */
void XM(conf_standard)(planner *p);
/***********************************************************************/
/* rearrange.c */
/* Different ways to rearrange the vector dimension vn during transposition,
reflecting different tradeoffs between ease of transposition and
contiguity during the subsequent DFTs.
TODO: can we pare this down to CONTIG and DISCONTIG, at least
in MEASURE mode? SQUARE_MIDDLE is also used for 1d destroy-input DFTs. */
typedef enum {
CONTIG = 0, /* vn x 1: make subsequent DFTs contiguous */
DISCONTIG, /* P x (vn/P) for P processes */
SQUARE_BEFORE, /* try to get square transpose at beginning */
SQUARE_MIDDLE, /* try to get square transpose in the middle */
SQUARE_AFTER /* try to get square transpose at end */
} rearrangement;
/* skipping SQUARE_AFTER since it doesn't seem to offer any advantage
over SQUARE_BEFORE */
#define FORALL_REARRANGE(rearrange) for (rearrange = CONTIG; rearrange <= SQUARE_MIDDLE; rearrange = (rearrangement) (((int) rearrange) + 1))
int XM(rearrange_applicable)(rearrangement rearrange,
ddim dim0, INT vn, int n_pes);
INT XM(rearrange_ny)(rearrangement rearrange, ddim dim0, INT vn, int n_pes);
/***********************************************************************/
#endif /* __IFFTW_MPI_H__ */

844
fftw-3.3.10/mpi/mpi-bench.c Normal file
View File

@@ -0,0 +1,844 @@
/**************************************************************************/
/* NOTE to users: this is the FFTW-MPI self-test and benchmark program.
It is probably NOT a good place to learn FFTW usage, since it has a
lot of added complexity in order to exercise and test the full API,
etcetera. We suggest reading the manual. */
/**************************************************************************/
#include <math.h>
#include <stdio.h>
#include <string.h>
#include "fftw3-mpi.h"
#include "tests/fftw-bench.h"
#if defined(BENCHFFT_SINGLE)
# define BENCH_MPI_TYPE MPI_FLOAT
#elif defined(BENCHFFT_LDOUBLE)
# define BENCH_MPI_TYPE MPI_LONG_DOUBLE
#elif defined(BENCHFFT_QUAD)
# error MPI quad-precision type is unknown
#else
# define BENCH_MPI_TYPE MPI_DOUBLE
#endif
#if SIZEOF_PTRDIFF_T == SIZEOF_INT
# define FFTW_MPI_PTRDIFF_T MPI_INT
#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG
# define FFTW_MPI_PTRDIFF_T MPI_LONG
#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG
# define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG
#else
# error MPI type for ptrdiff_t is unknown
# define FFTW_MPI_PTRDIFF_T MPI_LONG
#endif
static const char *mkversion(void) { return FFTW(version); }
static const char *mkcc(void) { return FFTW(cc); }
static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
static const char *mknproc(void) {
static char buf[32];
int ncpus;
MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
#ifdef HAVE_SNPRINTF
snprintf(buf, 32, "%d", ncpus);
#else
sprintf(buf, "%d", ncpus);
#endif
return buf;
}
BEGIN_BENCH_DOC
BENCH_DOC("name", "fftw3_mpi")
BENCH_DOCF("version", mkversion)
BENCH_DOCF("cc", mkcc)
BENCH_DOCF("codelet-optim", mkcodelet_optim)
BENCH_DOCF("nproc", mknproc)
END_BENCH_DOC
static int n_pes = 1, my_pe = 0;
/* global variables describing the shape of the data and its distribution */
static int rnk;
static ptrdiff_t vn, iNtot, oNtot;
static ptrdiff_t *local_ni=0, *local_starti=0;
static ptrdiff_t *local_no=0, *local_starto=0;
static ptrdiff_t *all_local_ni=0, *all_local_starti=0; /* n_pes x rnk arrays */
static ptrdiff_t *all_local_no=0, *all_local_starto=0; /* n_pes x rnk arrays */
static ptrdiff_t *istrides = 0, *ostrides = 0;
static ptrdiff_t *total_ni=0, *total_no=0;
static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */
static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */
static bench_real *local_in = 0, *local_out = 0;
static bench_real *all_local_in = 0, *all_local_out = 0;
static int all_local_in_alloc = 0, all_local_out_alloc = 0;
static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0;
static void alloc_rnk(int rnk_) {
rnk = rnk_;
bench_free(local_ni);
if (rnk == 0)
local_ni = 0;
else
local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk
* (8 + n_pes * 4));
local_starti = local_ni + rnk;
local_no = local_ni + 2 * rnk;
local_starto = local_ni + 3 * rnk;
istrides = local_ni + 4 * rnk;
ostrides = local_ni + 5 * rnk;
total_ni = local_ni + 6 * rnk;
total_no = local_ni + 7 * rnk;
all_local_ni = local_ni + 8 * rnk;
all_local_starti = local_ni + (8 + n_pes) * rnk;
all_local_no = local_ni + (8 + 2 * n_pes) * rnk;
all_local_starto = local_ni + (8 + 3 * n_pes) * rnk;
}
static void setup_gather_scatter(void)
{
int i, j;
ptrdiff_t off;
MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T,
all_local_ni, rnk, FFTW_MPI_PTRDIFF_T,
0, MPI_COMM_WORLD);
MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T,
all_local_starti, rnk, FFTW_MPI_PTRDIFF_T,
0, MPI_COMM_WORLD);
MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T,
all_local_no, rnk, FFTW_MPI_PTRDIFF_T,
0, MPI_COMM_WORLD);
MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T,
all_local_starto, rnk, FFTW_MPI_PTRDIFF_T,
0, MPI_COMM_WORLD);
MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
off = 0;
for (i = 0; i < n_pes; ++i) {
ptrdiff_t N = vn;
for (j = 0; j < rnk; ++j)
N *= all_local_ni[i * rnk + j];
isend_cnt[i] = N;
isend_off[i] = off;
off += N;
}
iNtot = off;
all_local_in_alloc = 1;
istrides[rnk - 1] = vn;
for (j = rnk - 2; j >= 0; --j)
istrides[j] = total_ni[j + 1] * istrides[j + 1];
off = 0;
for (i = 0; i < n_pes; ++i) {
ptrdiff_t N = vn;
for (j = 0; j < rnk; ++j)
N *= all_local_no[i * rnk + j];
orecv_cnt[i] = N;
orecv_off[i] = off;
off += N;
}
oNtot = off;
all_local_out_alloc = 1;
ostrides[rnk - 1] = vn;
for (j = rnk - 2; j >= 0; --j)
ostrides[j] = total_no[j + 1] * ostrides[j + 1];
}
static void copy_block_out(const bench_real *in,
int rnk, ptrdiff_t *n, ptrdiff_t *start,
ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
bench_real *out)
{
ptrdiff_t i;
if (rnk == 0) {
for (i = 0; i < vn; ++i)
out[i] = in[i];
}
else if (rnk == 1) { /* this case is just an optimization */
ptrdiff_t j;
out += start[0] * os[0];
for (j = 0; j < n[0]; ++j) {
for (i = 0; i < vn; ++i)
out[i] = in[i];
in += is;
out += os[0];
}
}
else {
/* we should do n[0] for locality, but this way is simpler to code */
for (i = 0; i < n[rnk - 1]; ++i)
copy_block_out(in + i * is,
rnk - 1, n, start, is * n[rnk - 1], os, vn,
out + (start[rnk - 1] + i) * os[rnk - 1]);
}
}
static void copy_block_in(bench_real *in,
int rnk, ptrdiff_t *n, ptrdiff_t *start,
ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
const bench_real *out)
{
ptrdiff_t i;
if (rnk == 0) {
for (i = 0; i < vn; ++i)
in[i] = out[i];
}
else if (rnk == 1) { /* this case is just an optimization */
ptrdiff_t j;
out += start[0] * os[0];
for (j = 0; j < n[0]; ++j) {
for (i = 0; i < vn; ++i)
in[i] = out[i];
in += is;
out += os[0];
}
}
else {
/* we should do n[0] for locality, but this way is simpler to code */
for (i = 0; i < n[rnk - 1]; ++i)
copy_block_in(in + i * is,
rnk - 1, n, start, is * n[rnk - 1], os, vn,
out + (start[rnk - 1] + i) * os[rnk - 1]);
}
}
static void do_scatter_in(bench_real *in)
{
bench_real *ali;
int i;
if (all_local_in_alloc) {
bench_free(all_local_in);
all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real));
all_local_in_alloc = 0;
}
ali = all_local_in;
for (i = 0; i < n_pes; ++i) {
copy_block_in(ali,
rnk, all_local_ni + i * rnk,
all_local_starti + i * rnk,
vn, istrides, vn,
in);
ali += isend_cnt[i];
}
MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
0, MPI_COMM_WORLD);
}
static void do_gather_out(bench_real *out)
{
bench_real *alo;
int i;
if (all_local_out_alloc) {
bench_free(all_local_out);
all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real));
all_local_out_alloc = 0;
}
MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
0, MPI_COMM_WORLD);
MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);
alo = all_local_out;
for (i = 0; i < n_pes; ++i) {
copy_block_out(alo,
rnk, all_local_no + i * rnk,
all_local_starto + i * rnk,
vn, ostrides, vn,
out);
alo += orecv_cnt[i];
}
}
static void alloc_local(ptrdiff_t nreal, int inplace)
{
bench_free(local_in);
if (local_out != local_in) bench_free(local_out);
local_in = local_out = 0;
if (nreal > 0) {
ptrdiff_t i;
local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
if (inplace)
local_out = local_in;
else
local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0;
}
}
void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
{
UNUSED(p);
do_scatter_in(ri);
if (plan_scramble_in) FFTW(execute)(plan_scramble_in);
}
void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
{
UNUSED(p);
if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out);
do_gather_out(ro);
}
void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
{
UNUSED(ii);
after_problem_rcopy_from(p, ri);
}
void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
{
UNUSED(io);
after_problem_rcopy_to(p, ro);
}
void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
{
UNUSED(ii);
after_problem_rcopy_from(p, ri);
}
void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
{
UNUSED(io);
after_problem_rcopy_to(p, ro);
}
static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny,
ptrdiff_t vn,
bench_real *in, bench_real *out)
{
FFTW(iodim64) hdims[3];
FFTW(r2r_kind) k[3];
FFTW(plan) pln;
hdims[0].n = nx;
hdims[0].is = ny * vn;
hdims[0].os = vn;
hdims[1].n = ny;
hdims[1].is = vn;
hdims[1].os = nx * vn;
hdims[2].n = vn;
hdims[2].is = 1;
hdims[2].os = 1;
k[0] = k[1] = k[2] = FFTW_R2HC;
pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE);
BENCH_ASSERT(pln != 0);
return pln;
}
static int tensor_rowmajor_transposedp(bench_tensor *t)
{
bench_iodim *d;
int i;
BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
if (t->rnk < 2)
return 0;
d = t->dims;
if (d[0].is != d[1].is * d[1].n
|| d[0].os != d[1].is
|| d[1].os != d[0].os * d[0].n)
return 0;
if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
return 0;
for (i = 2; i + 1 < t->rnk; ++i) {
d = t->dims + i;
if (d[0].is != d[1].is * d[1].n
|| d[0].os != d[1].os * d[1].n)
return 0;
}
if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
return 0;
return 1;
}
static int tensor_contiguousp(bench_tensor *t, int s)
{
return (t->dims[t->rnk-1].is == s
&& ((tensor_rowmajorp(t) &&
t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)
|| tensor_rowmajor_transposedp(t)));
}
static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
{
FFTW(plan) pln = 0;
int i;
ptrdiff_t ntot;
vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
if (p->sz->rnk < 1
|| p->split
|| !tensor_contiguousp(p->sz, vn)
|| tensor_rowmajor_transposedp(p->sz)
|| p->vecsz->rnk > 1
|| (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|| p->vecsz->dims[0].os != 1)))
return 0;
alloc_rnk(p->sz->rnk);
for (i = 0; i < rnk; ++i) {
total_ni[i] = total_no[i] = p->sz->dims[i].n;
local_ni[i] = local_no[i] = total_ni[i];
local_starti[i] = local_starto[i] = 0;
}
if (rnk > 1) {
ptrdiff_t n, start, nT, startT;
ntot = FFTW(mpi_local_size_many_transposed)
(p->sz->rnk, total_ni, vn,
FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
MPI_COMM_WORLD,
&n, &start, &nT, &startT);
if (flags & FFTW_MPI_TRANSPOSED_IN) {
local_ni[1] = nT;
local_starti[1] = startT;
}
else {
local_ni[0] = n;
local_starti[0] = start;
}
if (flags & FFTW_MPI_TRANSPOSED_OUT) {
local_no[1] = nT;
local_starto[1] = startT;
}
else {
local_no[0] = n;
local_starto[0] = start;
}
}
else if (rnk == 1) {
ntot = FFTW(mpi_local_size_many_1d)
(total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
local_ni, local_starti, local_no, local_starto);
}
alloc_local(ntot * 2, p->in == p->out);
pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
(FFTW(complex) *) local_in,
(FFTW(complex) *) local_out,
MPI_COMM_WORLD, p->sign, flags);
vn *= 2;
if (rnk > 1) {
ptrdiff_t nrest = 1;
for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
if (flags & FFTW_MPI_TRANSPOSED_IN)
plan_scramble_in = mkplan_transpose_local(
p->sz->dims[0].n, local_ni[1], vn * nrest,
local_in, local_in);
if (flags & FFTW_MPI_TRANSPOSED_OUT)
plan_unscramble_out = mkplan_transpose_local(
local_no[1], p->sz->dims[0].n, vn * nrest,
local_out, local_out);
}
return pln;
}
static int tensor_real_contiguousp(bench_tensor *t, int sign, int s)
{
return (t->dims[t->rnk-1].is == s
&& ((tensor_real_rowmajorp(t, sign, 1) &&
t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)));
}
static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
{
FFTW(plan) pln = 0;
int i;
ptrdiff_t ntot;
vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
if (p->sz->rnk < 2
|| p->split
|| !tensor_real_contiguousp(p->sz, p->sign, vn)
|| tensor_rowmajor_transposedp(p->sz)
|| p->vecsz->rnk > 1
|| (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|| p->vecsz->dims[0].os != 1)))
return 0;
alloc_rnk(p->sz->rnk);
for (i = 0; i < rnk; ++i) {
total_ni[i] = total_no[i] = p->sz->dims[i].n;
local_ni[i] = local_no[i] = total_ni[i];
local_starti[i] = local_starto[i] = 0;
}
local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1]
= p->sz->dims[rnk-1].n / 2 + 1;
{
ptrdiff_t n, start, nT, startT;
ntot = FFTW(mpi_local_size_many_transposed)
(p->sz->rnk, total_ni, vn,
FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
MPI_COMM_WORLD,
&n, &start, &nT, &startT);
if (flags & FFTW_MPI_TRANSPOSED_IN) {
local_ni[1] = nT;
local_starti[1] = startT;
}
else {
local_ni[0] = n;
local_starti[0] = start;
}
if (flags & FFTW_MPI_TRANSPOSED_OUT) {
local_no[1] = nT;
local_starto[1] = startT;
}
else {
local_no[0] = n;
local_starto[0] = start;
}
}
alloc_local(ntot * 2, p->in == p->out);
total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
if (p->sign < 0)
pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
local_in,
(FFTW(complex) *) local_out,
MPI_COMM_WORLD, flags);
else
pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
(FFTW(complex) *) local_in,
local_out,
MPI_COMM_WORLD, flags);
total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1;
vn *= 2;
{
ptrdiff_t nrest = 1;
for (i = 2; i < rnk; ++i) nrest *= total_ni[i];
if (flags & FFTW_MPI_TRANSPOSED_IN)
plan_scramble_in = mkplan_transpose_local(
total_ni[0], local_ni[1], vn * nrest,
local_in, local_in);
if (flags & FFTW_MPI_TRANSPOSED_OUT)
plan_unscramble_out = mkplan_transpose_local(
local_no[1], total_ni[0], vn * nrest,
local_out, local_out);
}
return pln;
}
static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags)
{
ptrdiff_t ntot, nx, ny;
int ix=0, iy=1, i;
const bench_iodim *d = p->vecsz->dims;
FFTW(plan) pln;
if (p->vecsz->rnk == 3) {
for (i = 0; i < 3; ++i)
if (d[i].is == 1 && d[i].os == 1) {
vn = d[i].n;
ix = (i + 1) % 3;
iy = (i + 2) % 3;
break;
}
if (i == 3) return 0;
}
else {
vn = 1;
ix = 0;
iy = 1;
}
if (d[ix].is == d[iy].n * vn && d[ix].os == vn
&& d[iy].os == d[ix].n * vn && d[iy].is == vn) {
nx = d[ix].n;
ny = d[iy].n;
}
else if (d[iy].is == d[ix].n * vn && d[iy].os == vn
&& d[ix].os == d[iy].n * vn && d[ix].is == vn) {
nx = d[iy].n;
ny = d[ix].n;
}
else
return 0;
alloc_rnk(2);
ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD,
&local_ni[0],
&local_starti[0],
&local_no[0],
&local_starto[0]);
local_ni[1] = ny;
local_starti[1] = 0;
local_no[1] = nx;
local_starto[1] = 0;
total_ni[0] = nx; total_ni[1] = ny;
total_no[1] = nx; total_no[0] = ny;
alloc_local(ntot, p->in == p->out);
pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
local_in, local_out,
MPI_COMM_WORLD, flags);
if (flags & FFTW_MPI_TRANSPOSED_IN)
plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn,
local_in, local_in);
if (flags & FFTW_MPI_TRANSPOSED_OUT)
plan_unscramble_out = mkplan_transpose_local
(nx, local_no[0], vn, local_out, local_out);
#if 0
if (pln && vn == 1) {
int i, j;
bench_real *ri = (bench_real *) p->in;
bench_real *ro = (bench_real *) p->out;
if (!ri || !ro) return pln;
setup_gather_scatter();
for (i = 0; i < nx * ny; ++i)
ri[i] = i;
after_problem_rcopy_from(p, ri);
FFTW(execute)(pln);
after_problem_rcopy_to(p, ro);
if (my_pe == 0) {
for (i = 0; i < nx; ++i) {
for (j = 0; j < ny; ++j)
printf(" %3g", ro[j * nx + i]);
printf("\n");
}
}
}
#endif
return pln;
}
static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
{
FFTW(plan) pln = 0;
int i;
ptrdiff_t ntot;
FFTW(r2r_kind) *k;
if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1))
&& p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3)
return mkplan_transpose(p, flags);
vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
if (p->sz->rnk < 1
|| p->split
|| !tensor_contiguousp(p->sz, vn)
|| tensor_rowmajor_transposedp(p->sz)
|| p->vecsz->rnk > 1
|| (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|| p->vecsz->dims[0].os != 1)))
return 0;
alloc_rnk(p->sz->rnk);
for (i = 0; i < rnk; ++i) {
total_ni[i] = total_no[i] = p->sz->dims[i].n;
local_ni[i] = local_no[i] = total_ni[i];
local_starti[i] = local_starto[i] = 0;
}
if (rnk > 1) {
ptrdiff_t n, start, nT, startT;
ntot = FFTW(mpi_local_size_many_transposed)
(p->sz->rnk, total_ni, vn,
FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
MPI_COMM_WORLD,
&n, &start, &nT, &startT);
if (flags & FFTW_MPI_TRANSPOSED_IN) {
local_ni[1] = nT;
local_starti[1] = startT;
}
else {
local_ni[0] = n;
local_starti[0] = start;
}
if (flags & FFTW_MPI_TRANSPOSED_OUT) {
local_no[1] = nT;
local_starto[1] = startT;
}
else {
local_no[0] = n;
local_starto[0] = start;
}
}
else if (rnk == 1) {
ntot = FFTW(mpi_local_size_many_1d)
(total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
local_ni, local_starti, local_no, local_starto);
}
alloc_local(ntot, p->in == p->out);
k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk);
for (i = 0; i < p->sz->rnk; ++i)
switch (p->k[i]) {
case R2R_R2HC: k[i] = FFTW_R2HC; break;
case R2R_HC2R: k[i] = FFTW_HC2R; break;
case R2R_DHT: k[i] = FFTW_DHT; break;
case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
default: BENCH_ASSERT(0);
}
pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn,
FFTW_MPI_DEFAULT_BLOCK,
FFTW_MPI_DEFAULT_BLOCK,
local_in, local_out,
MPI_COMM_WORLD, k, flags);
bench_free(k);
if (rnk > 1) {
ptrdiff_t nrest = 1;
for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
if (flags & FFTW_MPI_TRANSPOSED_IN)
plan_scramble_in = mkplan_transpose_local(
p->sz->dims[0].n, local_ni[1], vn * nrest,
local_in, local_in);
if (flags & FFTW_MPI_TRANSPOSED_OUT)
plan_unscramble_out = mkplan_transpose_local(
local_no[1], p->sz->dims[0].n, vn * nrest,
local_out, local_out);
}
return pln;
}
FFTW(plan) mkplan(bench_problem *p, unsigned flags)
{
FFTW(plan) pln = 0;
FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
if (p->scrambled_in) {
if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
flags |= FFTW_MPI_SCRAMBLED_IN;
else
flags |= FFTW_MPI_TRANSPOSED_IN;
}
if (p->scrambled_out) {
if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
flags |= FFTW_MPI_SCRAMBLED_OUT;
else
flags |= FFTW_MPI_TRANSPOSED_OUT;
}
switch (p->kind) {
case PROBLEM_COMPLEX:
pln =mkplan_complex(p, flags);
break;
case PROBLEM_REAL:
pln = mkplan_real(p, flags);
break;
case PROBLEM_R2R:
pln = mkplan_r2r(p, flags);
break;
default: BENCH_ASSERT(0);
}
if (pln) setup_gather_scatter();
return pln;
}
void main_init(int *argc, char ***argv)
{
#ifdef HAVE_SMP
# if MPI_VERSION >= 2 /* for MPI_Init_thread */
int provided;
MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided);
threads_ok = provided >= MPI_THREAD_FUNNELED;
# else
MPI_Init(argc, argv);
threads_ok = 0;
# endif
#else
MPI_Init(argc, argv);
#endif
MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
MPI_Comm_size(MPI_COMM_WORLD, &n_pes);
if (my_pe != 0) verbose = -999;
no_speed_allocation = 1; /* so we can benchmark transforms > memory */
always_pad_real = 1; /* out-of-place real transforms are padded */
isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
isend_off = (int *) bench_malloc(sizeof(int) * n_pes);
orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
orecv_off = (int *) bench_malloc(sizeof(int) * n_pes);
/* init_threads must be called before any other FFTW function,
including mpi_init, because it has to register the threads hooks
before the planner is initalized */
#ifdef HAVE_SMP
if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); }
#endif
FFTW(mpi_init)();
}
void initial_cleanup(void)
{
alloc_rnk(0);
alloc_local(0, 0);
bench_free(all_local_in); all_local_in = 0;
bench_free(all_local_out); all_local_out = 0;
bench_free(isend_off); isend_off = 0;
bench_free(isend_cnt); isend_cnt = 0;
bench_free(orecv_off); orecv_off = 0;
bench_free(orecv_cnt); orecv_cnt = 0;
FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
}
void final_cleanup(void)
{
MPI_Finalize();
}
void bench_exit(int status)
{
MPI_Abort(MPI_COMM_WORLD, status);
}
double bench_cost_postprocess(double cost)
{
double cost_max;
MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
return cost_max;
}
int import_wisdom(FILE *f)
{
int success = 1, sall;
if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f);
FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD);
MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
return sall;
}
void export_wisdom(FILE *f)
{
FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD);
if (my_pe == 0) FFTW(export_wisdom_to_file)(f);
}

59
fftw-3.3.10/mpi/mpi-dft.h Normal file
View File

@@ -0,0 +1,59 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
/* problem.c: */
typedef struct {
problem super;
dtensor *sz;
INT vn; /* vector length (vector stride 1) */
R *I, *O; /* contiguous interleaved arrays */
int sign; /* FFTW_FORWARD / FFTW_BACKWARD */
unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
SCRAMBLED_IN/OUT meaningful for 1d transforms only */
MPI_Comm comm;
} problem_mpi_dft;
problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
R *I, R *O, MPI_Comm comm,
int sign, unsigned flags);
problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
R *I, R *O, MPI_Comm comm,
int sign, unsigned flags);
/* solve.c: */
void XM(dft_solve)(const plan *ego_, const problem *p_);
/* plans have same operands as rdft plans, so just re-use */
typedef plan_rdft plan_mpi_dft;
#define MKPLAN_MPI_DFT(type, adt, apply) \
(type *)X(mkplan_rdft)(sizeof(type), adt, apply)
int XM(dft_serial_applicable)(const problem_mpi_dft *p);
/* various solvers */
void XM(dft_rank_geq2_register)(planner *p);
void XM(dft_rank_geq2_transposed_register)(planner *p);
void XM(dft_serial_register)(planner *p);
void XM(dft_rank1_bigvec_register)(planner *p);
void XM(dft_rank1_register)(planner *p);

View File

@@ -0,0 +1,66 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
/* problem.c: */
typedef struct {
problem super;
dtensor *sz;
INT vn; /* vector length (vector stride 1) */
R *I, *O; /* contiguous interleaved arrays */
unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
SCRAMBLED_IN/OUT meaningful for 1d transforms only */
MPI_Comm comm;
#if defined(STRUCT_HACK_KR)
rdft_kind kind[1];
#elif defined(STRUCT_HACK_C99)
rdft_kind kind[];
#else
rdft_kind *kind;
#endif
} problem_mpi_rdft;
problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
R *I, R *O, MPI_Comm comm,
const rdft_kind *kind, unsigned flags);
problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
R *I, R *O, MPI_Comm comm,
const rdft_kind *kind, unsigned flags);
/* solve.c: */
void XM(rdft_solve)(const plan *ego_, const problem *p_);
/* plans have same operands as rdft plans, so just re-use */
typedef plan_rdft plan_mpi_rdft;
#define MKPLAN_MPI_RDFT(type, adt, apply) \
(type *)X(mkplan_rdft)(sizeof(type), adt, apply)
int XM(rdft_serial_applicable)(const problem_mpi_rdft *p);
/* various solvers */
void XM(rdft_rank_geq2_register)(planner *p);
void XM(rdft_rank_geq2_transposed_register)(planner *p);
void XM(rdft_serial_register)(planner *p);
void XM(rdft_rank1_bigvec_register)(planner *p);

View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
/* r2c and c2r transforms. The sz dtensor, as usual, gives the size
of the "logical" complex array. For the last dimension N, however,
only N/2+1 complex numbers are stored for the complex data. Moreover,
for the real data, the last dimension is *always* padded to a size
2*(N/2+1). (Contrast this with the serial API, where there is only
padding for in-place plans.) */
/* problem.c: */
typedef struct {
problem super;
dtensor *sz;
INT vn; /* vector length (vector stride 1) */
R *I, *O; /* contiguous interleaved arrays */
rdft_kind kind; /* assert(kind < DHT) */
unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
SCRAMBLED_IN/OUT meaningful for 1d transforms only */
MPI_Comm comm;
} problem_mpi_rdft2;
problem *XM(mkproblem_rdft2)(const dtensor *sz, INT vn,
R *I, R *O, MPI_Comm comm,
rdft_kind kind, unsigned flags);
problem *XM(mkproblem_rdft2_d)(dtensor *sz, INT vn,
R *I, R *O, MPI_Comm comm,
rdft_kind kind, unsigned flags);
/* solve.c: */
void XM(rdft2_solve)(const plan *ego_, const problem *p_);
/* plans have same operands as rdft plans, so just re-use */
typedef plan_rdft plan_mpi_rdft2;
#define MKPLAN_MPI_RDFT2(type, adt, apply) \
(type *)X(mkplan_rdft)(sizeof(type), adt, apply)
int XM(rdft2_serial_applicable)(const problem_mpi_rdft2 *p);
/* various solvers */
void XM(rdft2_rank_geq2_register)(planner *p);
void XM(rdft2_rank_geq2_transposed_register)(planner *p);
void XM(rdft2_serial_register)(planner *p);

View File

@@ -0,0 +1,61 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
/* tproblem.c: */
typedef struct {
problem super;
INT vn; /* vector length (vector stride 1) */
INT nx, ny; /* nx x ny transposed to ny x nx */
R *I, *O; /* contiguous real arrays (both same size!) */
unsigned flags; /* TRANSPOSED_IN: input is *locally* transposed
TRANSPOSED_OUT: output is *locally* transposed */
INT block, tblock; /* block size, slab decomposition;
tblock is for transposed blocks on output */
MPI_Comm comm;
} problem_mpi_transpose;
problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
R *I, R *O,
INT block, INT tblock,
MPI_Comm comm,
unsigned flags);
/* tsolve.c: */
void XM(transpose_solve)(const plan *ego_, const problem *p_);
/* plans have same operands as rdft plans, so just re-use */
typedef plan_rdft plan_mpi_transpose;
#define MKPLAN_MPI_TRANSPOSE(type, adt, apply) \
(type *)X(mkplan_rdft)(sizeof(type), adt, apply)
/* transpose-pairwise.c: */
int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
R *I, R *O, int my_pe,
plan **cld2, plan **cld2rest, plan **cld3,
INT *rest_Ioff, INT *rest_Ooff);
/* various solvers */
void XM(transpose_pairwise_register)(planner *p);
void XM(transpose_alltoall_register)(planner *p);
void XM(transpose_recurse_register)(planner *p);

View File

@@ -0,0 +1,155 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-rdft.h"
static void destroy(problem *ego_)
{
problem_mpi_rdft *ego = (problem_mpi_rdft *) ego_;
XM(dtensor_destroy)(ego->sz);
MPI_Comm_free(&ego->comm);
#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
X(ifree0)(ego->kind);
#endif
X(ifree)(ego_);
}
static void hash(const problem *p_, md5 *m)
{
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
int i;
X(md5puts)(m, "mpi-dft");
X(md5int)(m, p->I == p->O);
/* don't include alignment -- may differ between processes
X(md5int)(m, X(ialignment_of)(p->I));
X(md5int)(m, X(ialignment_of)(p->O));
... note that applicability of MPI plans does not depend
on alignment (although optimality may, in principle). */
XM(dtensor_md5)(m, p->sz);
X(md5INT)(m, p->vn);
for (i = 0; i < p->sz->rnk; ++i)
X(md5int)(m, p->kind[i]);
X(md5int)(m, p->flags);
MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
A(XM(md5_equal)(*m, p->comm));
}
static void print(const problem *ego_, printer *p)
{
const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
int i;
p->print(p, "(mpi-rdft %d %d %d ",
ego->I == ego->O,
X(ialignment_of)(ego->I),
X(ialignment_of)(ego->O));
XM(dtensor_print)(ego->sz, p);
for (i = 0; i < ego->sz->rnk; ++i)
p->print(p, " %d", (int)ego->kind[i]);
p->print(p, " %D %d", ego->vn, ego->flags);
MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
}
static void zero(const problem *ego_)
{
const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
R *I = ego->I;
INT i, N;
int my_pe;
MPI_Comm_rank(ego->comm, &my_pe);
N = ego->vn * XM(total_block)(ego->sz, IB, my_pe);
for (i = 0; i < N; ++i) I[i] = K(0.0);
}
static const problem_adt padt =
{
PROBLEM_MPI_RDFT,
hash,
zero,
print,
destroy
};
problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
R *I, R *O,
MPI_Comm comm,
const rdft_kind *kind, unsigned flags)
{
problem_mpi_rdft *ego;
int i, rnk = sz->rnk;
int n_pes;
A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
MPI_Comm_size(comm, &n_pes);
A(n_pes >= XM(num_blocks_total)(sz, IB)
&& n_pes >= XM(num_blocks_total)(sz, OB));
A(vn >= 0);
#if defined(STRUCT_HACK_KR)
ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
+ sizeof(rdft_kind)
* (rnk > 0 ? rnk - 1 : 0), &padt);
#elif defined(STRUCT_HACK_C99)
ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
+ sizeof(rdft_kind) * rnk, &padt);
#else
ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft), &padt);
ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
#endif
/* enforce pointer equality if untainted pointers are equal */
if (UNTAINT(I) == UNTAINT(O))
I = O = JOIN_TAINT(I, O);
ego->sz = XM(dtensor_canonical)(sz, 0);
ego->vn = vn;
ego->I = I;
ego->O = O;
for (i = 0; i< ego->sz->rnk; ++i)
ego->kind[i] = kind[i];
/* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
swapping the first two dimensions (for rnk > 1) */
if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
rdft_kind k = ego->kind[0];
ddim dim0 = ego->sz->dims[0];
ego->sz->dims[0] = ego->sz->dims[1];
ego->sz->dims[1] = dim0;
ego->kind[0] = ego->kind[1];
ego->kind[1] = k;
flags &= ~TRANSPOSED_IN;
flags ^= TRANSPOSED_OUT;
}
ego->flags = flags;
MPI_Comm_dup(comm, &ego->comm);
return &(ego->super);
}
problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
R *I, R *O,
MPI_Comm comm,
const rdft_kind *kind, unsigned flags)
{
problem *p = XM(mkproblem_rdft)(sz, vn, I, O, comm, kind, flags);
XM(dtensor_destroy)(sz);
return p;
}

View File

@@ -0,0 +1,211 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Complex RDFTs of rank >= 2, for the case where we are distributed
across the first dimension only, and the output is transposed both
in data distribution and in ordering (for the first 2 dimensions).
(Note that we don't have to handle the case where the input is
transposed, since this is equivalent to transposed output with the
first two dimensions swapped, and is automatically canonicalized as
such by rdft-problem.c. */
#include "mpi-rdft.h"
#include "mpi-transpose.h"
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_rdft super;
plan *cld1, *cldt, *cld2;
INT roff, ioff;
int preserve_input;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft *cld1, *cld2, *cldt;
/* RDFT local dimensions */
cld1 = (plan_rdft *) ego->cld1;
if (ego->preserve_input) {
cld1->apply(ego->cld1, I, O);
I = O;
}
else
cld1->apply(ego->cld1, I, I);
/* global transpose */
cldt = (plan_rdft *) ego->cldt;
cldt->apply(ego->cldt, I, O);
/* RDFT final local dimension */
cld2 = (plan_rdft *) ego->cld2;
cld2->apply(ego->cld2, O, O);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
return (1
&& p->sz->rnk > 1
&& p->flags == TRANSPOSED_OUT
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& XM(is_local_after)(1, p->sz, IB)
&& XM(is_local_after)(2, p->sz, OB)
&& XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1
&& (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
|| !XM(rdft_serial_applicable)(p))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cldt, wakefulness);
X(plan_awake)(ego->cld2, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld2);
X(plan_destroy_internal)(ego->cldt);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-rdft-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))",
ego->preserve_input==2 ?"/p":"",
ego->cld1, ego->cldt, ego->cld2);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_rdft *p;
P *pln;
plan *cld1 = 0, *cldt = 0, *cld2 = 0;
R *I, *O, *I2;
tensor *sz;
int i, my_pe, n_pes;
INT nrest;
static const plan_adt padt = {
XM(rdft_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_rdft *) p_;
I2 = I = p->I;
O = p->O;
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
I = O;
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
i = p->sz->rnk - 2; A(i >= 0);
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = p->vn;
for (--i; i >= 0; --i) {
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
}
nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
{
INT is = sz->dims[0].n * sz->dims[0].is;
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
cld1 = X(mkplan_d)(plnr,
X(mkproblem_rdft_d)(sz,
X(mktensor_2d)(b, is, is,
p->vn, 1, 1),
I2, I, p->kind + 1));
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
nrest *= p->vn;
cldt = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
p->sz->dims[0].n, p->sz->dims[1].n, nrest,
I, O,
p->sz->dims[0].b[IB], p->sz->dims[1].b[OB],
p->comm, 0));
if (XM(any_true)(!cldt, p->comm)) goto nada;
{
INT is = p->sz->dims[0].n * nrest;
INT b = XM(block)(p->sz->dims[1].n, p->sz->dims[1].b[OB], my_pe);
cld2 = X(mkplan_d)(plnr,
X(mkproblem_rdft_1_d)(X(mktensor_1d)(
p->sz->dims[0].n,
nrest, nrest),
X(mktensor_2d)(b, is, is,
nrest, 1, 1),
O, O, p->kind[0]));
if (XM(any_true)(!cld2, p->comm)) goto nada;
}
pln = MKPLAN_MPI_RDFT(P, &padt, apply);
pln->cld1 = cld1;
pln->cldt = cldt;
pln->cld2 = cld2;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
X(ops_add2)(&cldt->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cldt);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(rdft_rank_geq2_transposed_register)(planner *p)
{
int preserve_input;
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(preserve_input));
}

View File

@@ -0,0 +1,179 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Complex RDFTs of rank >= 2, for the case where we are distributed
across the first dimension only, and the output is not transposed. */
#include "mpi-rdft.h"
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_rdft super;
plan *cld1, *cld2;
int preserve_input;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft *cld1, *cld2;
/* RDFT local dimensions */
cld1 = (plan_rdft *) ego->cld1;
if (ego->preserve_input) {
cld1->apply(ego->cld1, I, O);
I = O;
}
else
cld1->apply(ego->cld1, I, I);
/* RDFT non-local dimension (via rdft-rank1-bigvec, usually): */
cld2 = (plan_rdft *) ego->cld2;
cld2->apply(ego->cld2, I, O);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
return (1
&& p->sz->rnk > 1
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& XM(is_local_after)(1, p->sz, IB)
&& XM(is_local_after)(1, p->sz, OB)
&& (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
|| !XM(rdft_serial_applicable)(p))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cld2, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld2);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-rdft-rank-geq2%s%(%p%)%(%p%))",
ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_rdft *p;
P *pln;
plan *cld1 = 0, *cld2 = 0;
R *I, *O, *I2;
tensor *sz;
dtensor *sz2;
int i, my_pe, n_pes;
INT nrest;
static const plan_adt padt = {
XM(rdft_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_rdft *) p_;
I2 = I = p->I;
O = p->O;
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
I = O;
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
i = p->sz->rnk - 2; A(i >= 0);
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = p->vn;
for (--i; i >= 0; --i) {
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
}
nrest = X(tensor_sz)(sz);
{
INT is = sz->dims[0].n * sz->dims[0].is;
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
cld1 = X(mkplan_d)(plnr,
X(mkproblem_rdft_d)(sz,
X(mktensor_2d)(b, is, is,
p->vn, 1, 1),
I2, I, p->kind + 1));
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
sz2->dims[0] = p->sz->dims[0];
cld2 = X(mkplan_d)(plnr, XM(mkproblem_rdft_d)(sz2, nrest * p->vn,
I, O,
p->comm, p->kind,
RANK1_BIGVEC_ONLY));
if (XM(any_true)(!cld2, p->comm)) goto nada;
pln = MKPLAN_MPI_RDFT(P, &padt, apply);
pln->cld1 = cld1;
pln->cld2 = cld2;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(rdft_rank_geq2_register)(planner *p)
{
int preserve_input;
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(preserve_input));
}

View File

@@ -0,0 +1,205 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Complex RDFTs of rank == 1 when the vector length vn is >= # processes.
In this case, we don't need to use a six-step type algorithm, and can
instead transpose the RDFT dimension with the vector dimension to
make the RDFT local. */
#include "mpi-rdft.h"
#include "mpi-transpose.h"
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
rearrangement rearrange;
} S;
typedef struct {
plan_mpi_rdft super;
plan *cldt_before, *cld, *cldt_after;
int preserve_input;
rearrangement rearrange;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft *cld, *cldt_before, *cldt_after;
/* global transpose */
cldt_before = (plan_rdft *) ego->cldt_before;
cldt_before->apply(ego->cldt_before, I, O);
if (ego->preserve_input) I = O;
/* 1d RDFT(s) */
cld = (plan_rdft *) ego->cld;
cld->apply(ego->cld, O, I);
/* global transpose */
cldt_after = (plan_rdft *) ego->cldt_after;
cldt_after->apply(ego->cldt_after, I, O);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
int n_pes;
MPI_Comm_size(p->comm, &n_pes);
return (1
&& p->sz->rnk == 1
&& !(p->flags & ~RANK1_BIGVEC_ONLY)
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
#if 0 /* don't need this check since no other rank-1 rdft solver */
&& (p->vn >= n_pes /* TODO: relax this, using more memory? */
|| (p->flags & RANK1_BIGVEC_ONLY))
#endif
&& XM(rearrange_applicable)(ego->rearrange,
p->sz->dims[0], p->vn, n_pes)
&& (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
|| !XM(rdft_serial_applicable)(p))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cldt_before, wakefulness);
X(plan_awake)(ego->cld, wakefulness);
X(plan_awake)(ego->cldt_after, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cldt_after);
X(plan_destroy_internal)(ego->cld);
X(plan_destroy_internal)(ego->cldt_before);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
const char descrip[][16] = { "contig", "discontig", "square-after",
"square-middle", "square-before" };
p->print(p, "(mpi-rdft-rank1-bigvec/%s%s %(%p%) %(%p%) %(%p%))",
descrip[ego->rearrange], ego->preserve_input==2 ?"/p":"",
ego->cldt_before, ego->cld, ego->cldt_after);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_rdft *p;
P *pln;
plan *cld = 0, *cldt_before = 0, *cldt_after = 0;
R *I, *O;
INT yblock, yb, nx, ny, vn;
int my_pe, n_pes;
static const plan_adt padt = {
XM(rdft_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_rdft *) p_;
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
nx = p->sz->dims[0].n;
if (!(ny = XM(rearrange_ny)(ego->rearrange, p->sz->dims[0],p->vn,n_pes)))
return (plan *) 0;
vn = p->vn / ny;
A(ny * vn == p->vn);
yblock = XM(default_block)(ny, n_pes);
cldt_before = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
nx, ny, vn,
I = p->I, O = p->O,
p->sz->dims[0].b[IB], yblock,
p->comm, 0));
if (XM(any_true)(!cldt_before, p->comm)) goto nada;
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) { I = O; }
yb = XM(block)(ny, yblock, my_pe);
cld = X(mkplan_d)(plnr,
X(mkproblem_rdft_1_d)(X(mktensor_1d)(nx, vn, vn),
X(mktensor_2d)(yb, vn*nx, vn*nx,
vn, 1, 1),
O, I, p->kind[0]));
if (XM(any_true)(!cld, p->comm)) goto nada;
cldt_after = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
ny, nx, vn,
I, O,
yblock, p->sz->dims[0].b[OB],
p->comm, 0));
if (XM(any_true)(!cldt_after, p->comm)) goto nada;
pln = MKPLAN_MPI_RDFT(P, &padt, apply);
pln->cldt_before = cldt_before;
pln->cld = cld;
pln->cldt_after = cldt_after;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
pln->rearrange = ego->rearrange;
X(ops_add)(&cldt_before->ops, &cld->ops, &pln->super.super.ops);
X(ops_add2)(&cldt_after->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cldt_after);
X(plan_destroy_internal)(cld);
X(plan_destroy_internal)(cldt_before);
return (plan *) 0;
}
static solver *mksolver(rearrangement rearrange, int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->rearrange = rearrange;
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(rdft_rank1_bigvec_register)(planner *p)
{
rearrangement rearrange;
int preserve_input;
FORALL_REARRANGE(rearrange)
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(rearrange, preserve_input));
}

View File

@@ -0,0 +1,124 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* "MPI" RDFTs where all of the data is on one processor...just
call through to serial API. */
#include "mpi-rdft.h"
typedef struct {
plan_mpi_rdft super;
plan *cld;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft *cld = (plan_rdft *) ego->cld;
cld->apply(ego->cld, I, O);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-rdft-serial %(%p%))", ego->cld);
}
int XM(rdft_serial_applicable)(const problem_mpi_rdft *p)
{
return (1
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
&& ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
|| p->vn == 0));
}
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
P *pln;
plan *cld;
int my_pe;
static const plan_adt padt = {
XM(rdft_solve), awake, print, destroy
};
UNUSED(ego);
/* check whether applicable: */
if (!XM(rdft_serial_applicable)(p))
return (plan *) 0;
MPI_Comm_rank(p->comm, &my_pe);
if (my_pe == 0 && p->vn > 0) {
int i, rnk = p->sz->rnk;
tensor *sz = X(mktensor)(rnk);
rdft_kind *kind
= (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = p->vn;
sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
for (i = rnk - 1; i > 0; --i) {
sz->dims[i - 1].is = sz->dims[i - 1].os =
sz->dims[i].is * sz->dims[i].n;
sz->dims[i - 1].n = p->sz->dims[i - 1].n;
}
for (i = 0; i < rnk; ++i)
kind[i] = p->kind[i];
cld = X(mkplan_d)(plnr,
X(mkproblem_rdft_d)(sz,
X(mktensor_1d)(p->vn, 1, 1),
p->I, p->O, kind));
X(ifree0)(kind);
}
else { /* idle process: make nop plan */
cld = X(mkplan_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_1d)(0,0,0),
p->I, p->O));
}
if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
pln = MKPLAN_MPI_RDFT(P, &padt, apply);
pln->cld = cld;
X(ops_cpy)(&cld->ops, &pln->super.super.ops);
return &(pln->super.super);
}
static solver *mksolver(void)
{
static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
return MKSOLVER(solver, &sadt);
}
void XM(rdft_serial_register)(planner *p)
{
REGISTER_SOLVER(p, mksolver());
}

View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-rdft.h"
/* use the apply() operation for MPI_RDFT problems */
void XM(rdft_solve)(const plan *ego_, const problem *p_)
{
const plan_mpi_rdft *ego = (const plan_mpi_rdft *) ego_;
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
}

View File

@@ -0,0 +1,139 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-rdft2.h"
static void destroy(problem *ego_)
{
problem_mpi_rdft2 *ego = (problem_mpi_rdft2 *) ego_;
XM(dtensor_destroy)(ego->sz);
MPI_Comm_free(&ego->comm);
X(ifree)(ego_);
}
static void hash(const problem *p_, md5 *m)
{
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
int i;
X(md5puts)(m, "mpi-rdft2");
X(md5int)(m, p->I == p->O);
/* don't include alignment -- may differ between processes
X(md5int)(m, X(ialignment_of)(p->I));
X(md5int)(m, X(ialignment_of)(p->O));
... note that applicability of MPI plans does not depend
on alignment (although optimality may, in principle). */
XM(dtensor_md5)(m, p->sz);
X(md5INT)(m, p->vn);
X(md5int)(m, p->kind);
X(md5int)(m, p->flags);
MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
A(XM(md5_equal)(*m, p->comm));
}
static void print(const problem *ego_, printer *p)
{
const problem_mpi_rdft2 *ego = (const problem_mpi_rdft2 *) ego_;
int i;
p->print(p, "(mpi-rdft2 %d %d %d ",
ego->I == ego->O,
X(ialignment_of)(ego->I),
X(ialignment_of)(ego->O));
XM(dtensor_print)(ego->sz, p);
p->print(p, " %D %d %d", ego->vn, (int) ego->kind, ego->flags);
MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
}
static void zero(const problem *ego_)
{
const problem_mpi_rdft2 *ego = (const problem_mpi_rdft2 *) ego_;
R *I = ego->I;
dtensor *sz;
INT i, N;
int my_pe;
sz = XM(dtensor_copy)(ego->sz);
sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n / 2 + 1;
MPI_Comm_rank(ego->comm, &my_pe);
N = 2 * ego->vn * XM(total_block)(sz, IB, my_pe);
XM(dtensor_destroy)(sz);
for (i = 0; i < N; ++i) I[i] = K(0.0);
}
static const problem_adt padt =
{
PROBLEM_MPI_RDFT2,
hash,
zero,
print,
destroy
};
problem *XM(mkproblem_rdft2)(const dtensor *sz, INT vn,
R *I, R *O,
MPI_Comm comm,
rdft_kind kind,
unsigned flags)
{
problem_mpi_rdft2 *ego =
(problem_mpi_rdft2 *)X(mkproblem)(sizeof(problem_mpi_rdft2), &padt);
int n_pes;
A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk) && sz->rnk > 1);
MPI_Comm_size(comm, &n_pes);
A(vn >= 0);
A(kind == R2HC || kind == HC2R);
/* enforce pointer equality if untainted pointers are equal */
if (UNTAINT(I) == UNTAINT(O))
I = O = JOIN_TAINT(I, O);
ego->sz = XM(dtensor_canonical)(sz, 0);
#ifdef FFTW_DEBUG
ego->sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n / 2 + 1;
A(n_pes >= XM(num_blocks_total)(ego->sz, IB)
&& n_pes >= XM(num_blocks_total)(ego->sz, OB));
ego->sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n;
#endif
ego->vn = vn;
ego->I = I;
ego->O = O;
ego->kind = kind;
/* We only support TRANSPOSED_OUT for r2c and TRANSPOSED_IN for
c2r transforms. */
ego->flags = flags;
MPI_Comm_dup(comm, &ego->comm);
return &(ego->super);
}
problem *XM(mkproblem_rdft2_d)(dtensor *sz, INT vn,
R *I, R *O,
MPI_Comm comm,
rdft_kind kind,
unsigned flags)
{
problem *p = XM(mkproblem_rdft2)(sz, vn, I, O, comm, kind, flags);
XM(dtensor_destroy)(sz);
return p;
}

View File

@@ -0,0 +1,287 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Real-input (r2c) DFTs of rank >= 2, for the case where we are distributed
across the first dimension only, and the output is transposed both
in data distribution and in ordering (for the first 2 dimensions).
Conversely, real-output (c2r) DFTs where the input is transposed.
We don't currently support transposed-input r2c or transposed-output
c2r transforms. */
#include "mpi-rdft2.h"
#include "mpi-transpose.h"
#include "rdft/rdft.h"
#include "dft/dft.h"
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_rdft2 super;
plan *cld1, *cldt, *cld2;
INT vn;
int preserve_input;
} P;
static void apply_r2c(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft2 *cld1;
plan_dft *cld2;
plan_rdft *cldt;
/* RDFT2 local dimensions */
cld1 = (plan_rdft2 *) ego->cld1;
if (ego->preserve_input) {
cld1->apply(ego->cld1, I, I+ego->vn, O, O+1);
I = O;
}
else
cld1->apply(ego->cld1, I, I+ego->vn, I, I+1);
/* global transpose */
cldt = (plan_rdft *) ego->cldt;
cldt->apply(ego->cldt, I, O);
/* DFT final local dimension */
cld2 = (plan_dft *) ego->cld2;
cld2->apply(ego->cld2, O, O+1, O, O+1);
}
static void apply_c2r(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft2 *cld1;
plan_dft *cld2;
plan_rdft *cldt;
/* IDFT local dimensions */
cld2 = (plan_dft *) ego->cld2;
if (ego->preserve_input) {
cld2->apply(ego->cld2, I+1, I, O+1, O);
I = O;
}
else
cld2->apply(ego->cld2, I+1, I, I+1, I);
/* global transpose */
cldt = (plan_rdft *) ego->cldt;
cldt->apply(ego->cldt, I, O);
/* RDFT2 final local dimension */
cld1 = (plan_rdft2 *) ego->cld1;
cld1->apply(ego->cld1, O, O+ego->vn, O, O+1);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
return (1
&& p->sz->rnk > 1
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& ((p->flags == TRANSPOSED_OUT && p->kind == R2HC
&& XM(is_local_after)(1, p->sz, IB)
&& XM(is_local_after)(2, p->sz, OB)
&& XM(num_blocks)(p->sz->dims[0].n,
p->sz->dims[0].b[OB]) == 1)
||
(p->flags == TRANSPOSED_IN && p->kind == HC2R
&& XM(is_local_after)(1, p->sz, OB)
&& XM(is_local_after)(2, p->sz, IB)
&& XM(num_blocks)(p->sz->dims[0].n,
p->sz->dims[0].b[IB]) == 1))
&& (!NO_SLOWP(plnr) /* slow if rdft2-serial is applicable */
|| !XM(rdft2_serial_applicable)(p))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cldt, wakefulness);
X(plan_awake)(ego->cld2, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld2);
X(plan_destroy_internal)(ego->cldt);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-rdft2-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))",
ego->preserve_input==2 ?"/p":"",
ego->cld1, ego->cldt, ego->cld2);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_rdft2 *p;
P *pln;
plan *cld1 = 0, *cldt = 0, *cld2 = 0;
R *r0, *r1, *cr, *ci, *ri, *ii, *ro, *io, *I, *O;
tensor *sz;
int i, my_pe, n_pes;
INT nrest, n1, b1;
static const plan_adt padt = {
XM(rdft2_solve), awake, print, destroy
};
block_kind k1, k2;
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_rdft2 *) p_;
I = p->I; O = p->O;
if (p->kind == R2HC) {
k1 = IB; k2 = OB;
r1 = (r0 = I) + p->vn;
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
ci = (cr = O) + 1;
I = O;
}
else
ci = (cr = I) + 1;
io = ii = (ro = ri = O) + 1;
}
else {
k1 = OB; k2 = IB;
r1 = (r0 = O) + p->vn;
ci = (cr = O) + 1;
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
ri = (ii = I) + 1;
ro = (io = O) + 1;
I = O;
}
else
ro = ri = (io = ii = I) + 1;
}
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
i = p->sz->rnk - 2; A(i >= 0);
sz->dims[i].n = p->sz->dims[i+1].n / 2 + 1;
sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
for (--i; i >= 0; --i) {
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
}
nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
{
INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
INT is = sz->dims[0].n * sz->dims[0].is;
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[k1], my_pe);
sz->dims[p->sz->rnk - 2].n = p->sz->dims[p->sz->rnk - 1].n;
cld1 = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(sz,
X(mktensor_2d)(b, is, is,
p->vn,ivs,ovs),
r0, r1, cr, ci, p->kind));
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
nrest *= p->vn;
n1 = p->sz->dims[1].n;
b1 = p->sz->dims[1].b[k2];
if (p->sz->rnk == 2) { /* n1 dimension is cut in ~half */
n1 = n1 / 2 + 1;
b1 = b1 == p->sz->dims[1].n ? n1 : b1;
}
if (p->kind == R2HC)
cldt = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
p->sz->dims[0].n, n1, nrest * 2,
I, O,
p->sz->dims[0].b[IB], b1,
p->comm, 0));
else
cldt = X(mkplan_d)(plnr,
XM(mkproblem_transpose)(
n1, p->sz->dims[0].n, nrest * 2,
I, O,
b1, p->sz->dims[0].b[OB],
p->comm, 0));
if (XM(any_true)(!cldt, p->comm)) goto nada;
{
INT is = p->sz->dims[0].n * nrest * 2;
INT b = XM(block)(n1, b1, my_pe);
cld2 = X(mkplan_d)(plnr,
X(mkproblem_dft_d)(X(mktensor_1d)(
p->sz->dims[0].n,
nrest * 2, nrest * 2),
X(mktensor_2d)(b, is, is,
nrest, 2, 2),
ri, ii, ro, io));
if (XM(any_true)(!cld2, p->comm)) goto nada;
}
pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
pln->cld1 = cld1;
pln->cldt = cldt;
pln->cld2 = cld2;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
pln->vn = p->vn;
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
X(ops_add2)(&cldt->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cldt);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(rdft2_rank_geq2_transposed_register)(planner *p)
{
int preserve_input;
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(preserve_input));
}

View File

@@ -0,0 +1,215 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Complex RDFT2s of rank >= 2, for the case where we are distributed
across the first dimension only, and the output is not transposed. */
#include "mpi-dft.h"
#include "mpi-rdft2.h"
#include "rdft/rdft.h"
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_rdft2 super;
plan *cld1, *cld2;
INT vn;
int preserve_input;
} P;
static void apply_r2c(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft2 *cld1;
plan_rdft *cld2;
/* RDFT2 local dimensions */
cld1 = (plan_rdft2 *) ego->cld1;
if (ego->preserve_input) {
cld1->apply(ego->cld1, I, I+ego->vn, O, O+1);
I = O;
}
else
cld1->apply(ego->cld1, I, I+ego->vn, I, I+1);
/* DFT non-local dimension (via dft-rank1-bigvec, usually): */
cld2 = (plan_rdft *) ego->cld2;
cld2->apply(ego->cld2, I, O);
}
static void apply_c2r(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft2 *cld1;
plan_rdft *cld2;
/* DFT non-local dimension (via dft-rank1-bigvec, usually): */
cld2 = (plan_rdft *) ego->cld2;
cld2->apply(ego->cld2, I, O);
/* RDFT2 local dimensions */
cld1 = (plan_rdft2 *) ego->cld1;
cld1->apply(ego->cld1, O, O+ego->vn, O, O+1);
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
return (1
&& p->sz->rnk > 1
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O
&& p->kind == R2HC))
&& XM(is_local_after)(1, p->sz, IB)
&& XM(is_local_after)(1, p->sz, OB)
&& (!NO_SLOWP(plnr) /* slow if rdft2-serial is applicable */
|| !XM(rdft2_serial_applicable)(p))
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cld2, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld2);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-rdft2-rank-geq2%s%(%p%)%(%p%))",
ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_rdft2 *p;
P *pln;
plan *cld1 = 0, *cld2 = 0;
R *r0, *r1, *cr, *ci, *I, *O;
tensor *sz;
dtensor *sz2;
int i, my_pe, n_pes;
INT nrest;
static const plan_adt padt = {
XM(rdft2_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_rdft2 *) p_;
I = p->I; O = p->O;
if (p->kind == R2HC) {
r1 = (r0 = p->I) + p->vn;
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
ci = (cr = p->O) + 1;
I = O;
}
else
ci = (cr = p->I) + 1;
}
else {
r1 = (r0 = p->O) + p->vn;
ci = (cr = p->O) + 1;
}
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
i = p->sz->rnk - 2; A(i >= 0);
sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
sz->dims[i].n = p->sz->dims[i+1].n / 2 + 1;
for (--i; i >= 0; --i) {
sz->dims[i].n = p->sz->dims[i+1].n;
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
}
nrest = X(tensor_sz)(sz);
{
INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
INT is = sz->dims[0].n * sz->dims[0].is;
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
sz->dims[p->sz->rnk - 2].n = p->sz->dims[p->sz->rnk - 1].n;
cld1 = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(sz,
X(mktensor_2d)(b, is, is,
p->vn,ivs,ovs),
r0, r1, cr, ci, p->kind));
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
sz2->dims[0] = p->sz->dims[0];
cld2 = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz2, nrest * p->vn,
I, O, p->comm,
p->kind == R2HC ?
FFT_SIGN : -FFT_SIGN,
RANK1_BIGVEC_ONLY));
if (XM(any_true)(!cld2, p->comm)) goto nada;
pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
pln->cld1 = cld1;
pln->cld2 = cld2;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
pln->vn = p->vn;
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(rdft2_rank_geq2_register)(planner *p)
{
int preserve_input;
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(preserve_input));
}

View File

@@ -0,0 +1,144 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* "MPI" DFTs where all of the data is on one processor...just
call through to serial API. */
#include "mpi-rdft2.h"
#include "rdft/rdft.h"
typedef struct {
plan_mpi_rdft2 super;
plan *cld;
INT vn;
} P;
static void apply_r2c(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft2 *cld;
cld = (plan_rdft2 *) ego->cld;
cld->apply(ego->cld, I, I+ego->vn, O, O+1);
}
static void apply_c2r(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft2 *cld;
cld = (plan_rdft2 *) ego->cld;
cld->apply(ego->cld, O, O+ego->vn, I, I+1);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cld);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-rdft2-serial %(%p%))", ego->cld);
}
int XM(rdft2_serial_applicable)(const problem_mpi_rdft2 *p)
{
return (1
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
&& ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
|| p->vn == 0));
}
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
P *pln;
plan *cld;
int my_pe;
R *r0, *r1, *cr, *ci;
static const plan_adt padt = {
XM(rdft2_solve), awake, print, destroy
};
UNUSED(ego);
/* check whether applicable: */
if (!XM(rdft2_serial_applicable)(p))
return (plan *) 0;
if (p->kind == R2HC) {
r1 = (r0 = p->I) + p->vn;
ci = (cr = p->O) + 1;
}
else {
r1 = (r0 = p->O) + p->vn;
ci = (cr = p->I) + 1;
}
MPI_Comm_rank(p->comm, &my_pe);
if (my_pe == 0 && p->vn > 0) {
INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
int i, rnk = p->sz->rnk;
tensor *sz = X(mktensor)(p->sz->rnk);
sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n / 2 + 1;
for (i = rnk - 1; i > 0; --i) {
sz->dims[i - 1].is = sz->dims[i - 1].os =
sz->dims[i].is * sz->dims[i].n;
sz->dims[i - 1].n = p->sz->dims[i - 1].n;
}
sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
cld = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(sz,
X(mktensor_1d)(p->vn,ivs,ovs),
r0, r1, cr, ci, p->kind));
}
else { /* idle process: make nop plan */
cld = X(mkplan_d)(plnr,
X(mkproblem_rdft2_d)(X(mktensor_0d)(),
X(mktensor_1d)(0,0,0),
cr, ci, cr, ci, HC2R));
}
if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
pln->cld = cld;
pln->vn = p->vn;
X(ops_cpy)(&cld->ops, &pln->super.super.ops);
return &(pln->super.super);
}
static solver *mksolver(void)
{
static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
return MKSOLVER(solver, &sadt);
}
void XM(rdft2_serial_register)(planner *p)
{
REGISTER_SOLVER(p, mksolver());
}

View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-rdft2.h"
/* use the apply() operation for MPI_RDFT2 problems */
void XM(rdft2_solve)(const plan *ego_, const problem *p_)
{
const plan_mpi_rdft2 *ego = (const plan_mpi_rdft2 *) ego_;
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
}

View File

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
/* common functions for rearrangements of the data for the *-rank1-bigvec
solvers */
static int div_mult(INT b, INT a) {
return (a > b && a % b == 0);
}
static int div_mult2(INT b, INT a, INT n) {
return (div_mult(b, a) && div_mult(n, b));
}
int XM(rearrange_applicable)(rearrangement rearrange,
ddim dim0, INT vn, int n_pes)
{
/* note: it is important that cases other than CONTIG be
applicable only when the resulting transpose dimension
is divisible by n_pes; otherwise, the allocation size
returned by the API will be incorrect */
return ((rearrange != DISCONTIG || div_mult(n_pes, vn))
&& (rearrange != SQUARE_BEFORE
|| div_mult2(dim0.b[IB], vn, n_pes))
&& (rearrange != SQUARE_AFTER
|| (dim0.b[IB] != dim0.b[OB]
&& div_mult2(dim0.b[OB], vn, n_pes)))
&& (rearrange != SQUARE_MIDDLE
|| div_mult(dim0.n * n_pes, vn)));
}
INT XM(rearrange_ny)(rearrangement rearrange, ddim dim0, INT vn, int n_pes)
{
switch (rearrange) {
case CONTIG:
return vn;
case DISCONTIG:
return n_pes;
case SQUARE_BEFORE:
return dim0.b[IB];
case SQUARE_AFTER:
return dim0.b[OB];
case SQUARE_MIDDLE:
return dim0.n * n_pes;
}
return 0;
}

552
fftw-3.3.10/mpi/testsched.c Normal file
View File

@@ -0,0 +1,552 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/**********************************************************************/
/* This is a modified and combined version of the sched.c and
test_sched.c files shipped with FFTW 2, written to implement and
test various all-to-all communications scheduling patterns.
It is not used in FFTW 3, but I keep it around in case we ever want
to play with this again or to change algorithms. In particular, I
used it to implement and test the fill1_comm_sched routine in
transpose-pairwise.c, which allows us to create a schedule for one
process at a time and is much more compact than the FFTW 2 code.
Note that the scheduling algorithm is somewhat modified from that
of FFTW 2. Originally, I thought that one "stall" in the schedule
was unavoidable for odd numbers of processes, since this is the
case for the soccer-timetabling problem. However, because of the
self-communication step, we can use the self-communication to fill
in the stalls. (Thanks to Ralf Wildenhues for pointing this out.)
This greatly simplifies the process re-sorting algorithm. */
/**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
/* This file contains routines to compute communications schedules for
all-to-all communications (complete exchanges) that are performed
in-place. (That is, the block that processor x sends to processor
y gets replaced on processor x by a block received from processor y.)
A schedule, int **sched, is a two-dimensional array where
sched[pe][i] is the processor that pe expects to exchange a message
with on the i-th step of the exchange. sched[pe][i] == -1 for the
i after the last exchange scheduled on pe.
Here, processors (pe's, for processing elements), are numbered from
0 to npes-1.
There are a couple of constraints that a schedule should satisfy
(besides the obvious one that every processor has to communicate
with every other processor exactly once).
* First, and most importantly, there must be no deadlocks.
* Second, we would like to overlap communications as much as possible,
so that all exchanges occur in parallel. It turns out that perfect
overlap is possible for all number of processes (npes).
It turns out that this scheduling problem is actually well-studied,
and good solutions are known. The problem is known as a
"time-tabling" problem, and is specifically the problem of
scheduling a sports competition (where n teams must compete exactly
once with every other team). The problem is discussed and
algorithms are presented in:
[1] J. A. M. Schreuder, "Constructing Timetables for Sport
Competitions," Mathematical Programming Study 13, pp. 58-67 (1980).
[2] A. Schaerf, "Scheduling Sport Tournaments using Constraint
Logic Programming," Proc. of 12th Europ. Conf. on
Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996).
http://hermes.dis.uniromal.it/~aschaerf/publications.html
(These people actually impose a lot of additional constraints that
we don't care about, so they are solving harder problems. [1] gives
a simple enough algorithm for our purposes, though.)
In the timetabling problem, N teams can all play one another in N-1
steps if N is even, and N steps if N is odd. Here, however,
there is a "self-communication" step (a team must also "play itself")
and so we can always make an optimal N-step schedule regardless of N.
However, we have to do more: for a particular processor, the
communications schedule must be sorted in ascending or descending
order of processor index. (This is necessary so that the data
coming in for the transpose does not overwrite data that will be
sent later; for that processor the incoming and outgoing blocks are
of different non-zero sizes.) Fortunately, because the schedule
is stall free, each parallel step of the schedule is independent
of every other step, and we can reorder the steps arbitrarily
to achieve any desired order on a particular process.
*/
void free_comm_schedule(int **sched, int npes)
{
if (sched) {
int i;
for (i = 0; i < npes; ++i)
free(sched[i]);
free(sched);
}
}
void empty_comm_schedule(int **sched, int npes)
{
int i;
for (i = 0; i < npes; ++i)
sched[i][0] = -1;
}
extern void fill_comm_schedule(int **sched, int npes);
/* Create a new communications schedule for a given number of processors.
The schedule is initialized to a deadlock-free, maximum overlap
schedule. Returns NULL on an error (may print a message to
stderr if there is a program bug detected). */
int **make_comm_schedule(int npes)
{
int **sched;
int i;
sched = (int **) malloc(sizeof(int *) * npes);
if (!sched)
return NULL;
for (i = 0; i < npes; ++i)
sched[i] = NULL;
for (i = 0; i < npes; ++i) {
sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1));
if (!sched[i]) {
free_comm_schedule(sched,npes);
return NULL;
}
}
empty_comm_schedule(sched,npes);
fill_comm_schedule(sched,npes);
if (!check_comm_schedule(sched,npes)) {
free_comm_schedule(sched,npes);
return NULL;
}
return sched;
}
static void add_dest_to_comm_schedule(int **sched, int pe, int dest)
{
int i;
for (i = 0; sched[pe][i] != -1; ++i)
;
sched[pe][i] = dest;
sched[pe][i+1] = -1;
}
static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2)
{
add_dest_to_comm_schedule(sched, pe1, pe2);
if (pe1 != pe2)
add_dest_to_comm_schedule(sched, pe2, pe1);
}
/* Simplification of algorithm presented in [1] (we have fewer
constraints). Produces a perfect schedule (npes steps). */
void fill_comm_schedule(int **sched, int npes)
{
int pe, i, n;
if (npes % 2 == 0) {
n = npes;
for (pe = 0; pe < npes; ++pe)
add_pair_to_comm_schedule(sched,pe,pe);
}
else
n = npes + 1;
for (pe = 0; pe < n - 1; ++pe) {
add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe);
for (i = 1; i < n/2; ++i) {
int pe_a, pe_b;
pe_a = pe - i;
if (pe_a < 0)
pe_a += n - 1;
pe_b = (pe + i) % (n - 1);
add_pair_to_comm_schedule(sched,pe_a,pe_b);
}
}
}
/* given an array sched[npes], fills it with the communications
schedule for process pe. */
void fill1_comm_sched(int *sched, int which_pe, int npes)
{
int pe, i, n, s = 0;
if (npes % 2 == 0) {
n = npes;
sched[s++] = which_pe;
}
else
n = npes + 1;
for (pe = 0; pe < n - 1; ++pe) {
if (npes % 2 == 0) {
if (pe == which_pe) sched[s++] = npes - 1;
else if (npes - 1 == which_pe) sched[s++] = pe;
}
else if (pe == which_pe) sched[s++] = pe;
if (pe != which_pe && which_pe < n - 1) {
i = (pe - which_pe + (n - 1)) % (n - 1);
if (i < n/2)
sched[s++] = (pe + i) % (n - 1);
i = (which_pe - pe + (n - 1)) % (n - 1);
if (i < n/2)
sched[s++] = (pe - i + (n - 1)) % (n - 1);
}
}
if (s != npes) {
fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n",
s, which_pe, npes);
exit(EXIT_FAILURE);
}
}
/* sort the communication schedule sched for npes so that the schedule
on process sortpe is ascending or descending (!ascending). */
static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
{
int *sortsched, i;
sortsched = (int *) malloc(npes * sizeof(int) * 2);
fill1_comm_sched(sortsched, sortpe, npes);
if (ascending)
for (i = 0; i < npes; ++i)
sortsched[npes + sortsched[i]] = sched[i];
else
for (i = 0; i < npes; ++i)
sortsched[2*npes - 1 - sortsched[i]] = sched[i];
for (i = 0; i < npes; ++i)
sched[i] = sortsched[npes + i];
free(sortsched);
}
/* Below, we have various checks in case of bugs: */
/* check for deadlocks by simulating the schedule and looking for
cycles in the dependency list; returns 0 if there are deadlocks
(or other errors) */
static int check_schedule_deadlock(int **sched, int npes)
{
int *step, *depend, *visited, pe, pe2, period, done = 0;
int counter = 0;
/* step[pe] is the step in the schedule that a given pe is on */
step = (int *) malloc(sizeof(int) * npes);
/* depend[pe] is the pe' that pe is currently waiting for a message
from (-1 if none) */
depend = (int *) malloc(sizeof(int) * npes);
/* visited[pe] tells whether we have visited the current pe already
when we are looking for cycles. */
visited = (int *) malloc(sizeof(int) * npes);
if (!step || !depend || !visited) {
free(step); free(depend); free(visited);
return 0;
}
for (pe = 0; pe < npes; ++pe)
step[pe] = 0;
while (!done) {
++counter;
for (pe = 0; pe < npes; ++pe)
depend[pe] = sched[pe][step[pe]];
/* now look for cycles in the dependencies with period > 2: */
for (pe = 0; pe < npes; ++pe)
if (depend[pe] != -1) {
for (pe2 = 0; pe2 < npes; ++pe2)
visited[pe2] = 0;
period = 0;
pe2 = pe;
do {
visited[pe2] = period + 1;
pe2 = depend[pe2];
period++;
} while (pe2 != -1 && !visited[pe2]);
if (pe2 == -1) {
fprintf(stderr,
"BUG: unterminated cycle in schedule!\n");
free(step); free(depend);
free(visited);
return 0;
}
if (period - (visited[pe2] - 1) > 2) {
fprintf(stderr,"BUG: deadlock in schedule!\n");
free(step); free(depend);
free(visited);
return 0;
}
if (pe2 == pe)
step[pe]++;
}
done = 1;
for (pe = 0; pe < npes; ++pe)
if (sched[pe][step[pe]] != -1) {
done = 0;
break;
}
}
free(step); free(depend); free(visited);
return (counter > 0 ? counter : 1);
}
/* sanity checks; prints message and returns 0 on failure.
undocumented feature: the return value on success is actually the
number of steps required for the schedule to complete, counting
stalls. */
int check_comm_schedule(int **sched, int npes)
{
int pe, i, comm_pe;
for (pe = 0; pe < npes; ++pe) {
for (comm_pe = 0; comm_pe < npes; ++comm_pe) {
for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i)
;
if (sched[pe][i] == -1) {
fprintf(stderr,"BUG: schedule never sends message from "
"%d to %d.\n",pe,comm_pe);
return 0; /* never send message to comm_pe */
}
}
for (i = 0; sched[pe][i] != -1; ++i)
;
if (i != npes) {
fprintf(stderr,"BUG: schedule sends too many messages from "
"%d\n",pe);
return 0;
}
}
return check_schedule_deadlock(sched,npes);
}
/* invert the order of all the schedules; this has no effect on
its required properties. */
void invert_comm_schedule(int **sched, int npes)
{
int pe, i;
for (pe = 0; pe < npes; ++pe)
for (i = 0; i < npes/2; ++i) {
int dummy = sched[pe][i];
sched[pe][i] = sched[pe][npes-1-i];
sched[pe][npes-1-i] = dummy;
}
}
/* Sort the schedule for sort_pe in ascending order of processor
index. Unfortunately, for odd npes (when schedule has a stall
to begin with) this will introduce an extra stall due to
the motion of the self-communication past a stall. We could
fix this if it were really important. Actually, we don't
get an extra stall when sort_pe == 0 or npes-1, which is sufficient
for our purposes. */
void sort_comm_schedule(int **sched, int npes, int sort_pe)
{
int i,j,pe;
/* Note that we can do this sort in O(npes) swaps because we know
that the numbers we are sorting are just 0...npes-1. But we'll
just do a bubble sort for simplicity here. */
for (i = 0; i < npes - 1; ++i)
for (j = i + 1; j < npes; ++j)
if (sched[sort_pe][i] > sched[sort_pe][j]) {
for (pe = 0; pe < npes; ++pe) {
int s = sched[pe][i];
sched[pe][i] = sched[pe][j];
sched[pe][j] = s;
}
}
}
/* print the schedule (for debugging purposes) */
void print_comm_schedule(int **sched, int npes)
{
int pe, i, width;
if (npes < 10)
width = 1;
else if (npes < 100)
width = 2;
else
width = 3;
for (pe = 0; pe < npes; ++pe) {
printf("pe %*d schedule:", width, pe);
for (i = 0; sched[pe][i] != -1; ++i)
printf(" %*d",width,sched[pe][i]);
printf("\n");
}
}
int main(int argc, char **argv)
{
int **sched;
int npes = -1, sortpe = -1, steps, i;
if (argc >= 2) {
npes = atoi(argv[1]);
if (npes <= 0) {
fprintf(stderr,"npes must be positive!");
return 1;
}
}
if (argc >= 3) {
sortpe = atoi(argv[2]);
if (sortpe < 0 || sortpe >= npes) {
fprintf(stderr,"sortpe must be between 0 and npes-1.\n");
return 1;
}
}
if (npes != -1) {
printf("Computing schedule for npes = %d:\n",npes);
sched = make_comm_schedule(npes);
if (!sched) {
fprintf(stderr,"Out of memory!");
return 6;
}
if (steps = check_comm_schedule(sched,npes))
printf("schedule OK (takes %d steps to complete).\n", steps);
else
printf("schedule not OK.\n");
print_comm_schedule(sched, npes);
if (sortpe != -1) {
printf("\nRe-creating schedule for pe = %d...\n", sortpe);
int *sched1 = (int*) malloc(sizeof(int) * npes);
for (i = 0; i < npes; ++i) sched1[i] = -1;
fill1_comm_sched(sched1, sortpe, npes);
printf(" =");
for (i = 0; i < npes; ++i)
printf(" %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3),
sched1[i]);
printf("\n");
printf("\nSorting schedule for sortpe = %d...\n", sortpe);
sort_comm_schedule(sched,npes,sortpe);
if (steps = check_comm_schedule(sched,npes))
printf("schedule OK (takes %d steps to complete).\n",
steps);
else
printf("schedule not OK.\n");
print_comm_schedule(sched, npes);
printf("\nInverting schedule...\n");
invert_comm_schedule(sched,npes);
if (steps = check_comm_schedule(sched,npes))
printf("schedule OK (takes %d steps to complete).\n",
steps);
else
printf("schedule not OK.\n");
print_comm_schedule(sched, npes);
free_comm_schedule(sched,npes);
free(sched1);
}
}
else {
printf("Doing infinite tests...\n");
for (npes = 1; ; ++npes) {
int *sched1 = (int*) malloc(sizeof(int) * npes);
printf("npes = %d...",npes);
sched = make_comm_schedule(npes);
if (!sched) {
fprintf(stderr,"Out of memory!\n");
return 5;
}
for (sortpe = 0; sortpe < npes; ++sortpe) {
empty_comm_schedule(sched,npes);
fill_comm_schedule(sched,npes);
if (!check_comm_schedule(sched,npes)) {
fprintf(stderr,
"\n -- fill error for sortpe = %d!\n",sortpe);
return 2;
}
for (i = 0; i < npes; ++i) sched1[i] = -1;
fill1_comm_sched(sched1, sortpe, npes);
for (i = 0; i < npes; ++i)
if (sched1[i] != sched[sortpe][i])
fprintf(stderr,
"\n -- fill1 error for pe = %d!\n",
sortpe);
sort_comm_schedule(sched,npes,sortpe);
if (!check_comm_schedule(sched,npes)) {
fprintf(stderr,
"\n -- sort error for sortpe = %d!\n",sortpe);
return 3;
}
invert_comm_schedule(sched,npes);
if (!check_comm_schedule(sched,npes)) {
fprintf(stderr,
"\n -- invert error for sortpe = %d!\n",
sortpe);
return 4;
}
}
free_comm_schedule(sched,npes);
printf("OK\n");
if (npes % 50 == 0)
printf("(...Hit Ctrl-C to stop...)\n");
free(sched1);
}
}
return 0;
}

View File

@@ -0,0 +1,265 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* plans for distributed out-of-place transpose using MPI_Alltoall,
and which destroy the input array (unless TRANSPOSED_IN is used) */
#include "mpi-transpose.h"
#include <string.h>
typedef struct {
solver super;
int copy_transposed_in; /* whether to copy the input for TRANSPOSED_IN,
which makes the final transpose out-of-place
but costs an extra copy and requires us
to destroy the input */
} S;
typedef struct {
plan_mpi_transpose super;
plan *cld1, *cld2, *cld2rest, *cld3;
MPI_Comm comm;
int *send_block_sizes, *send_block_offsets;
int *recv_block_sizes, *recv_block_offsets;
INT rest_Ioff, rest_Ooff;
int equal_blocks;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft *cld1, *cld2, *cld2rest, *cld3;
/* transpose locally to get contiguous chunks */
cld1 = (plan_rdft *) ego->cld1;
if (cld1) {
cld1->apply(ego->cld1, I, O);
/* transpose chunks globally */
if (ego->equal_blocks)
MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE,
I, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
ego->comm);
else
MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets,
FFTW_MPI_TYPE,
I, ego->recv_block_sizes, ego->recv_block_offsets,
FFTW_MPI_TYPE,
ego->comm);
}
else { /* TRANSPOSED_IN, no need to destroy input */
/* transpose chunks globally */
if (ego->equal_blocks)
MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE,
O, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
ego->comm);
else
MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets,
FFTW_MPI_TYPE,
O, ego->recv_block_sizes, ego->recv_block_offsets,
FFTW_MPI_TYPE,
ego->comm);
I = O; /* final transpose (if any) is in-place */
}
/* transpose locally, again, to get ordinary row-major */
cld2 = (plan_rdft *) ego->cld2;
if (cld2) {
cld2->apply(ego->cld2, I, O);
cld2rest = (plan_rdft *) ego->cld2rest;
if (cld2rest) { /* leftover from unequal block sizes */
cld2rest->apply(ego->cld2rest,
I + ego->rest_Ioff, O + ego->rest_Ooff);
cld3 = (plan_rdft *) ego->cld3;
if (cld3)
cld3->apply(ego->cld3, O, O);
/* else TRANSPOSED_OUT is true and user wants O transposed */
}
}
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
return (1
&& p->I != p->O
&& (!NO_DESTROY_INPUTP(plnr) ||
((p->flags & TRANSPOSED_IN) && !ego->copy_transposed_in))
&& ((p->flags & TRANSPOSED_IN) || !ego->copy_transposed_in)
&& ONLY_TRANSPOSEDP(p->flags)
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cld2, wakefulness);
X(plan_awake)(ego->cld2rest, wakefulness);
X(plan_awake)(ego->cld3, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(ifree0)(ego->send_block_sizes);
MPI_Comm_free(&ego->comm);
X(plan_destroy_internal)(ego->cld3);
X(plan_destroy_internal)(ego->cld2rest);
X(plan_destroy_internal)(ego->cld2);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-transpose-alltoall%s%(%p%)%(%p%)%(%p%)%(%p%))",
ego->equal_blocks ? "/e" : "",
ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_transpose *p;
P *pln;
plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
INT b, bt, vn, rest_Ioff, rest_Ooff;
R *I;
int *sbs, *sbo, *rbs, *rbo;
int pe, my_pe, n_pes;
int equal_blocks = 1;
static const plan_adt padt = {
XM(transpose_solve), awake, print, destroy
};
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_transpose *) p_;
vn = p->vn;
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
b = XM(block)(p->nx, p->block, my_pe);
if (p->flags & TRANSPOSED_IN) { /* I is already transposed */
if (ego->copy_transposed_in) {
cld1 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_1d)
(b * p->ny * vn, 1, 1),
I = p->I, p->O),
0, 0, NO_SLOW);
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
else
I = p->O; /* final transpose is in-place */
}
else { /* transpose b x ny x vn -> ny x b x vn */
cld1 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_3d)
(b, p->ny * vn, vn,
p->ny, vn, b * vn,
vn, 1, 1),
I = p->I, p->O),
0, 0, NO_SLOW);
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, p->O, my_pe,
&cld2, &cld2rest, &cld3,
&rest_Ioff, &rest_Ooff),
p->comm)) goto nada;
pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
pln->cld1 = cld1;
pln->cld2 = cld2;
pln->cld2rest = cld2rest;
pln->rest_Ioff = rest_Ioff;
pln->rest_Ooff = rest_Ooff;
pln->cld3 = cld3;
MPI_Comm_dup(p->comm, &pln->comm);
/* Compute sizes/offsets of blocks to send for all-to-all command. */
sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS);
sbo = sbs + n_pes;
rbs = sbo + n_pes;
rbo = rbs + n_pes;
b = XM(block)(p->nx, p->block, my_pe);
bt = XM(block)(p->ny, p->tblock, my_pe);
for (pe = 0; pe < n_pes; ++pe) {
INT db, dbt; /* destination block sizes */
db = XM(block)(p->nx, p->block, pe);
dbt = XM(block)(p->ny, p->tblock, pe);
if (db != p->block || dbt != p->tblock)
equal_blocks = 0;
/* MPI requires type "int" here; apparently it
has no 64-bit API? Grrr. */
sbs[pe] = (int) (b * dbt * vn);
sbo[pe] = (int) (pe * (b * p->tblock) * vn);
rbs[pe] = (int) (db * bt * vn);
rbo[pe] = (int) (pe * (p->block * bt) * vn);
}
pln->send_block_sizes = sbs;
pln->send_block_offsets = sbo;
pln->recv_block_sizes = rbs;
pln->recv_block_offsets = rbo;
pln->equal_blocks = equal_blocks;
X(ops_zero)(&pln->super.super.ops);
if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
/* FIXME: should MPI operations be counted in "other" somehow? */
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cld3);
X(plan_destroy_internal)(cld2rest);
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int copy_transposed_in)
{
static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->copy_transposed_in = copy_transposed_in;
return &(slv->super);
}
void XM(transpose_alltoall_register)(planner *p)
{
int cti;
for (cti = 0; cti <= 1; ++cti)
REGISTER_SOLVER(p, mksolver(cti));
}

View File

@@ -0,0 +1,487 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Distributed transposes using a sequence of carefully scheduled
pairwise exchanges. This has the advantage that it can be done
in-place, or out-of-place while preserving the input, using buffer
space proportional to the local size divided by the number of
processes (i.e. to the total array size divided by the number of
processes squared). */
#include "mpi-transpose.h"
#include <string.h>
typedef struct {
solver super;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_transpose super;
plan *cld1, *cld2, *cld2rest, *cld3;
INT rest_Ioff, rest_Ooff;
int n_pes, my_pe, *sched;
INT *send_block_sizes, *send_block_offsets;
INT *recv_block_sizes, *recv_block_offsets;
MPI_Comm comm;
int preserve_input;
} P;
static void transpose_chunks(int *sched, int n_pes, int my_pe,
INT *sbs, INT *sbo, INT *rbs, INT *rbo,
MPI_Comm comm,
R *I, R *O)
{
if (sched) {
int i;
MPI_Status status;
/* TODO: explore non-synchronous send/recv? */
if (I == O) {
R *buf = (R*) MALLOC(sizeof(R) * sbs[0], BUFFERS);
for (i = 0; i < n_pes; ++i) {
int pe = sched[i];
if (my_pe == pe) {
if (rbo[pe] != sbo[pe])
memmove(O + rbo[pe], O + sbo[pe],
sbs[pe] * sizeof(R));
}
else {
memcpy(buf, O + sbo[pe], sbs[pe] * sizeof(R));
MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
pe, (my_pe * n_pes + pe) & 0x7fff,
O + rbo[pe], (int) (rbs[pe]),
FFTW_MPI_TYPE,
pe, (pe * n_pes + my_pe) & 0x7fff,
comm, &status);
}
}
X(ifree)(buf);
}
else { /* I != O */
for (i = 0; i < n_pes; ++i) {
int pe = sched[i];
if (my_pe == pe)
memcpy(O + rbo[pe], I + sbo[pe], sbs[pe] * sizeof(R));
else
MPI_Sendrecv(I + sbo[pe], (int) (sbs[pe]),
FFTW_MPI_TYPE,
pe, (my_pe * n_pes + pe) & 0x7fff,
O + rbo[pe], (int) (rbs[pe]),
FFTW_MPI_TYPE,
pe, (pe * n_pes + my_pe) & 0x7fff,
comm, &status);
}
}
}
}
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft *cld1, *cld2, *cld2rest, *cld3;
/* transpose locally to get contiguous chunks */
cld1 = (plan_rdft *) ego->cld1;
if (cld1) {
cld1->apply(ego->cld1, I, O);
if (ego->preserve_input) I = O;
/* transpose chunks globally */
transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
ego->send_block_sizes, ego->send_block_offsets,
ego->recv_block_sizes, ego->recv_block_offsets,
ego->comm, O, I);
}
else if (ego->preserve_input) {
/* transpose chunks globally */
transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
ego->send_block_sizes, ego->send_block_offsets,
ego->recv_block_sizes, ego->recv_block_offsets,
ego->comm, I, O);
I = O;
}
else {
/* transpose chunks globally */
transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
ego->send_block_sizes, ego->send_block_offsets,
ego->recv_block_sizes, ego->recv_block_offsets,
ego->comm, I, I);
}
/* transpose locally, again, to get ordinary row-major;
this may take two transposes if the block sizes are unequal
(3 subplans, two of which operate on disjoint data) */
cld2 = (plan_rdft *) ego->cld2;
cld2->apply(ego->cld2, I, O);
cld2rest = (plan_rdft *) ego->cld2rest;
if (cld2rest) {
cld2rest->apply(ego->cld2rest,
I + ego->rest_Ioff, O + ego->rest_Ooff);
cld3 = (plan_rdft *) ego->cld3;
if (cld3)
cld3->apply(ego->cld3, O, O);
/* else TRANSPOSED_OUT is true and user wants O transposed */
}
}
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
/* Note: this is *not* UGLY for out-of-place, destroy-input plans;
the planner often prefers transpose-pairwise to transpose-alltoall,
at least with LAM MPI on my machine. */
return (1
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& ONLY_TRANSPOSEDP(p->flags));
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cld2, wakefulness);
X(plan_awake)(ego->cld2rest, wakefulness);
X(plan_awake)(ego->cld3, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(ifree0)(ego->sched);
X(ifree0)(ego->send_block_sizes);
MPI_Comm_free(&ego->comm);
X(plan_destroy_internal)(ego->cld3);
X(plan_destroy_internal)(ego->cld2rest);
X(plan_destroy_internal)(ego->cld2);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-transpose-pairwise%s%(%p%)%(%p%)%(%p%)%(%p%))",
ego->preserve_input==2 ?"/p":"",
ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
}
/* Given a process which_pe and a number of processes npes, fills
the array sched[npes] with a sequence of processes to communicate
with for a deadlock-free, optimum-overlap all-to-all communication.
(All processes must call this routine to get their own schedules.)
The schedule can be re-ordered arbitrarily as long as all processes
apply the same permutation to their schedules.
The algorithm here is based upon the one described in:
J. A. M. Schreuder, "Constructing timetables for sport
competitions," Mathematical Programming Study 13, pp. 58-67 (1980).
In a sport competition, you have N teams and want every team to
play every other team in as short a time as possible (maximum overlap
between games). This timetabling problem is therefore identical
to that of an all-to-all communications problem. In our case, there
is one wrinkle: as part of the schedule, the process must do
some data transfer with itself (local data movement), analogous
to a requirement that each team "play itself" in addition to other
teams. With this wrinkle, it turns out that an optimal timetable
(N parallel games) can be constructed for any N, not just for even
N as in the original problem described by Schreuder.
*/
static void fill1_comm_sched(int *sched, int which_pe, int npes)
{
int pe, i, n, s = 0;
A(which_pe >= 0 && which_pe < npes);
if (npes % 2 == 0) {
n = npes;
sched[s++] = which_pe;
}
else
n = npes + 1;
for (pe = 0; pe < n - 1; ++pe) {
if (npes % 2 == 0) {
if (pe == which_pe) sched[s++] = npes - 1;
else if (npes - 1 == which_pe) sched[s++] = pe;
}
else if (pe == which_pe) sched[s++] = pe;
if (pe != which_pe && which_pe < n - 1) {
i = (pe - which_pe + (n - 1)) % (n - 1);
if (i < n/2)
sched[s++] = (pe + i) % (n - 1);
i = (which_pe - pe + (n - 1)) % (n - 1);
if (i < n/2)
sched[s++] = (pe - i + (n - 1)) % (n - 1);
}
}
A(s == npes);
}
/* Sort the communication schedule sched for npes so that the schedule
on process sortpe is ascending or descending (!ascending). This is
necessary to allow in-place transposes when the problem does not
divide equally among the processes. In this case there is one
process where the incoming blocks are bigger/smaller than the
outgoing blocks and thus have to be received in
descending/ascending order, respectively, to avoid overwriting data
before it is sent. */
static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
{
int *sortsched, i;
sortsched = (int *) MALLOC(npes * sizeof(int) * 2, OTHER);
fill1_comm_sched(sortsched, sortpe, npes);
if (ascending)
for (i = 0; i < npes; ++i)
sortsched[npes + sortsched[i]] = sched[i];
else
for (i = 0; i < npes; ++i)
sortsched[2*npes - 1 - sortsched[i]] = sched[i];
for (i = 0; i < npes; ++i)
sched[i] = sortsched[npes + i];
X(ifree)(sortsched);
}
/* make the plans to do the post-MPI transpositions (shared with
transpose-alltoall) */
int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
R *I, R *O, int my_pe,
plan **cld2, plan **cld2rest, plan **cld3,
INT *rest_Ioff, INT *rest_Ooff)
{
INT vn = p->vn;
INT b = p->block;
INT bt = XM(block)(p->ny, p->tblock, my_pe);
INT nxb = p->nx / b; /* number of equal-sized blocks */
INT nxr = p->nx - nxb * b; /* leftover rows after equal blocks */
*cld2 = *cld2rest = *cld3 = NULL;
*rest_Ioff = *rest_Ooff = 0;
if (!(p->flags & TRANSPOSED_OUT) && (nxr == 0 || I != O)) {
INT nx = p->nx * vn;
b *= vn;
*cld2 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_3d)
(nxb, bt * b, b,
bt, b, nx,
b, 1, 1),
I, O),
0, 0, NO_SLOW);
if (!*cld2) goto nada;
if (nxr > 0) {
*rest_Ioff = nxb * bt * b;
*rest_Ooff = nxb * b;
b = nxr * vn;
*cld2rest = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_2d)
(bt, b, nx,
b, 1, 1),
I + *rest_Ioff,
O + *rest_Ooff),
0, 0, NO_SLOW);
if (!*cld2rest) goto nada;
}
}
else {
*cld2 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(
X(mktensor_4d)
(nxb, bt * b * vn, bt * b * vn,
bt, b * vn, vn,
b, vn, bt * vn,
vn, 1, 1),
I, O),
0, 0, NO_SLOW);
if (!*cld2) goto nada;
*rest_Ioff = *rest_Ooff = nxb * bt * b * vn;
*cld2rest = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(
X(mktensor_3d)
(bt, nxr * vn, vn,
nxr, vn, bt * vn,
vn, 1, 1),
I + *rest_Ioff, O + *rest_Ooff),
0, 0, NO_SLOW);
if (!*cld2rest) goto nada;
if (!(p->flags & TRANSPOSED_OUT)) {
*cld3 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(
X(mktensor_3d)
(p->nx, bt * vn, vn,
bt, vn, p->nx * vn,
vn, 1, 1),
O, O),
0, 0, NO_SLOW);
if (!*cld3) goto nada;
}
}
return 1;
nada:
X(plan_destroy_internal)(*cld3);
X(plan_destroy_internal)(*cld2rest);
X(plan_destroy_internal)(*cld2);
*cld2 = *cld2rest = *cld3 = NULL;
return 0;
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_transpose *p;
P *pln;
plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
INT b, bt, vn, rest_Ioff, rest_Ooff;
INT *sbs, *sbo, *rbs, *rbo;
int pe, my_pe, n_pes, sort_pe = -1, ascending = 1;
R *I, *O;
static const plan_adt padt = {
XM(transpose_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr))
return (plan *) 0;
p = (const problem_mpi_transpose *) p_;
vn = p->vn;
I = p->I; O = p->O;
MPI_Comm_rank(p->comm, &my_pe);
MPI_Comm_size(p->comm, &n_pes);
b = XM(block)(p->nx, p->block, my_pe);
if (!(p->flags & TRANSPOSED_IN)) { /* b x ny x vn -> ny x b x vn */
cld1 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_3d)
(b, p->ny * vn, vn,
p->ny, vn, b * vn,
vn, 1, 1),
I, O),
0, 0, NO_SLOW);
if (XM(any_true)(!cld1, p->comm)) goto nada;
}
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, O, my_pe,
&cld2, &cld2rest, &cld3,
&rest_Ioff, &rest_Ooff),
p->comm)) goto nada;
pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
pln->cld1 = cld1;
pln->cld2 = cld2;
pln->cld2rest = cld2rest;
pln->rest_Ioff = rest_Ioff;
pln->rest_Ooff = rest_Ooff;
pln->cld3 = cld3;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
MPI_Comm_dup(p->comm, &pln->comm);
n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
XM(num_blocks)(p->ny, p->tblock));
/* Compute sizes/offsets of blocks to exchange between processors */
sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS);
sbo = sbs + n_pes;
rbs = sbo + n_pes;
rbo = rbs + n_pes;
b = XM(block)(p->nx, p->block, my_pe);
bt = XM(block)(p->ny, p->tblock, my_pe);
for (pe = 0; pe < n_pes; ++pe) {
INT db, dbt; /* destination block sizes */
db = XM(block)(p->nx, p->block, pe);
dbt = XM(block)(p->ny, p->tblock, pe);
sbs[pe] = b * dbt * vn;
sbo[pe] = pe * (b * p->tblock) * vn;
rbs[pe] = db * bt * vn;
rbo[pe] = pe * (p->block * bt) * vn;
if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
A(sort_pe == -1); /* only one process should need sorting */
sort_pe = pe;
ascending = db * p->tblock > p->block * dbt;
}
}
pln->n_pes = n_pes;
pln->my_pe = my_pe;
pln->send_block_sizes = sbs;
pln->send_block_offsets = sbo;
pln->recv_block_sizes = rbs;
pln->recv_block_offsets = rbo;
if (my_pe >= n_pes) {
pln->sched = 0; /* this process is not doing anything */
}
else {
pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
fill1_comm_sched(pln->sched, my_pe, n_pes);
if (sort_pe >= 0)
sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
}
X(ops_zero)(&pln->super.super.ops);
if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
/* FIXME: should MPI operations be counted in "other" somehow? */
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cld3);
X(plan_destroy_internal)(cld2rest);
X(plan_destroy_internal)(cld2);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int preserve_input)
{
static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->preserve_input = preserve_input;
return &(slv->super);
}
void XM(transpose_pairwise_register)(planner *p)
{
int preserve_input;
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
REGISTER_SOLVER(p, mksolver(preserve_input));
}

View File

@@ -0,0 +1,123 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-transpose.h"
static void destroy(problem *ego_)
{
problem_mpi_transpose *ego = (problem_mpi_transpose *) ego_;
MPI_Comm_free(&ego->comm);
X(ifree)(ego_);
}
static void hash(const problem *p_, md5 *m)
{
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
int i;
X(md5puts)(m, "mpi-transpose");
X(md5int)(m, p->I == p->O);
/* don't include alignment -- may differ between processes
X(md5int)(m, X(ialignment_of)(p->I));
X(md5int)(m, X(ialignment_of)(p->O));
... note that applicability of MPI plans does not depend
on alignment (although optimality may, in principle). */
X(md5INT)(m, p->vn);
X(md5INT)(m, p->nx);
X(md5INT)(m, p->ny);
X(md5INT)(m, p->block);
X(md5INT)(m, p->tblock);
MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
A(XM(md5_equal)(*m, p->comm));
}
static void print(const problem *ego_, printer *p)
{
const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_;
int i;
MPI_Comm_size(ego->comm, &i);
p->print(p, "(mpi-transpose %d %d %d %D %D %D %D %D %d)",
ego->I == ego->O,
X(ialignment_of)(ego->I),
X(ialignment_of)(ego->O),
ego->vn,
ego->nx, ego->ny,
ego->block, ego->tblock,
i);
}
static void zero(const problem *ego_)
{
const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_;
R *I = ego->I;
INT i, N = ego->vn * ego->ny;
int my_pe;
MPI_Comm_rank(ego->comm, &my_pe);
N *= XM(block)(ego->nx, ego->block, my_pe);
for (i = 0; i < N; ++i) I[i] = K(0.0);
}
static const problem_adt padt =
{
PROBLEM_MPI_TRANSPOSE,
hash,
zero,
print,
destroy
};
problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
R *I, R *O,
INT block, INT tblock,
MPI_Comm comm,
unsigned flags)
{
problem_mpi_transpose *ego =
(problem_mpi_transpose *)X(mkproblem)(sizeof(problem_mpi_transpose), &padt);
A(nx > 0 && ny > 0 && vn > 0);
A(block > 0 && XM(num_blocks_ok)(nx, block, comm)
&& tblock > 0 && XM(num_blocks_ok)(ny, tblock, comm));
/* enforce pointer equality if untainted pointers are equal */
if (UNTAINT(I) == UNTAINT(O))
I = O = JOIN_TAINT(I, O);
ego->nx = nx;
ego->ny = ny;
ego->vn = vn;
ego->I = I;
ego->O = O;
ego->block = block > nx ? nx : block;
ego->tblock = tblock > ny ? ny : tblock;
/* canonicalize flags: we can freely assume that the data is
"transposed" if one of the dimensions is 1. */
if (ego->block == 1)
flags |= TRANSPOSED_IN;
if (ego->tblock == 1)
flags |= TRANSPOSED_OUT;
ego->flags = flags;
MPI_Comm_dup(comm, &ego->comm);
return &(ego->super);
}

View File

@@ -0,0 +1,300 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* Recursive "radix-r" distributed transpose, which breaks a transpose
over p processes into p/r transposes over r processes plus r
transposes over p/r processes. If performed recursively, this
produces a total of O(p log p) messages vs. O(p^2) messages for a
direct approach.
However, this is not necessarily an improvement. The total size of
all the messages is actually increased from O(N) to O(N log p)
where N is the total data size. Also, the amount of local data
rearrangement is increased. So, it's not clear, a priori, what the
best algorithm will be, and we'll leave it to the planner. (In
theory and practice, it looks like this becomes advantageous for
large p, in the limit where the message sizes are small and
latency-dominated.)
*/
#include "mpi-transpose.h"
#include <string.h>
typedef struct {
solver super;
int (*radix)(int np);
const char *nam;
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;
typedef struct {
plan_mpi_transpose super;
plan *cld1, *cldtr, *cldtm;
int preserve_input;
int r; /* "radix" */
const char *nam;
} P;
static void apply(const plan *ego_, R *I, R *O)
{
const P *ego = (const P *) ego_;
plan_rdft *cld1, *cldtr, *cldtm;
cld1 = (plan_rdft *) ego->cld1;
if (cld1) cld1->apply((plan *) cld1, I, O);
if (ego->preserve_input) I = O;
cldtr = (plan_rdft *) ego->cldtr;
if (cldtr) cldtr->apply((plan *) cldtr, O, I);
cldtm = (plan_rdft *) ego->cldtm;
if (cldtm) cldtm->apply((plan *) cldtm, I, O);
}
static int radix_sqrt(int np)
{
int r;
for (r = (int) (X(isqrt)(np)); np % r != 0; ++r)
;
return r;
}
static int radix_first(int np)
{
int r = (int) (X(first_divisor)(np));
return (r >= (int) (X(isqrt)(np)) ? 0 : r);
}
/* the local allocated space on process pe required for the given transpose
dimensions and block sizes */
static INT transpose_space(INT nx, INT ny, INT block, INT tblock, int pe)
{
return X(imax)(XM(block)(nx, block, pe) * ny,
nx * XM(block)(ny, tblock, pe));
}
/* check whether the recursive transposes fit within the space
that must have been allocated on each process for this transpose;
this must be modified if the subdivision in mkplan is changed! */
static int enough_space(INT nx, INT ny, INT block, INT tblock,
int r, int n_pes)
{
int pe;
int m = n_pes / r;
for (pe = 0; pe < n_pes; ++pe) {
INT space = transpose_space(nx, ny, block, tblock, pe);
INT b1 = XM(block)(nx, r * block, pe / r);
INT b2 = XM(block)(ny, m * tblock, pe % r);
if (transpose_space(b1, ny, block, m*tblock, pe % r) > space
|| transpose_space(nx, b2, r*block, tblock, pe / r) > space)
return 0;
}
return 1;
}
/* In theory, transpose-recurse becomes advantageous for message sizes
below some minimum, assuming that the time is dominated by
communications. In practice, we want to constrain the minimum
message size for transpose-recurse to keep the planning time down.
I've set this conservatively according to some simple experiments
on a Cray XT3 where the crossover message size was 128, although on
a larger-latency machine the crossover will be larger. */
#define SMALL_MESSAGE 2048
static int applicable(const S *ego, const problem *p_,
const planner *plnr, int *r)
{
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
int n_pes;
MPI_Comm_size(p->comm, &n_pes);
return (1
&& p->tblock * n_pes == p->ny
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
&& p->I != p->O))
&& (*r = ego->radix(n_pes)) && *r < n_pes && *r > 1
&& enough_space(p->nx, p->ny, p->block, p->tblock, *r, n_pes)
&& (!CONSERVE_MEMORYP(plnr) || *r > 8
|| !X(toobig)((p->nx * (p->ny / n_pes) * p->vn) / *r))
&& (!NO_SLOWP(plnr) ||
(p->nx * (p->ny / n_pes) * p->vn) / n_pes <= SMALL_MESSAGE)
&& ONLY_TRANSPOSEDP(p->flags)
);
}
static void awake(plan *ego_, enum wakefulness wakefulness)
{
P *ego = (P *) ego_;
X(plan_awake)(ego->cld1, wakefulness);
X(plan_awake)(ego->cldtr, wakefulness);
X(plan_awake)(ego->cldtm, wakefulness);
}
static void destroy(plan *ego_)
{
P *ego = (P *) ego_;
X(plan_destroy_internal)(ego->cldtm);
X(plan_destroy_internal)(ego->cldtr);
X(plan_destroy_internal)(ego->cld1);
}
static void print(const plan *ego_, printer *p)
{
const P *ego = (const P *) ego_;
p->print(p, "(mpi-transpose-recurse/%s/%d%s%(%p%)%(%p%)%(%p%))",
ego->nam, ego->r, ego->preserve_input==2 ?"/p":"",
ego->cld1, ego->cldtr, ego->cldtm);
}
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *) ego_;
const problem_mpi_transpose *p;
P *pln;
plan *cld1 = 0, *cldtr = 0, *cldtm = 0;
R *I, *O;
int me, np, r, m;
INT b;
MPI_Comm comm2;
static const plan_adt padt = {
XM(transpose_solve), awake, print, destroy
};
UNUSED(ego);
if (!applicable(ego, p_, plnr, &r))
return (plan *) 0;
p = (const problem_mpi_transpose *) p_;
MPI_Comm_size(p->comm, &np);
MPI_Comm_rank(p->comm, &me);
m = np / r;
A(r * m == np);
I = p->I; O = p->O;
b = XM(block)(p->nx, p->block, me);
A(p->tblock * np == p->ny); /* this is currently required for cld1 */
if (p->flags & TRANSPOSED_IN) {
/* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */
INT vn = p->vn * b * p->tblock;
cld1 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_3d)
(m, r*vn, vn,
r, vn, m*vn,
vn, 1, 1),
I, O),
0, 0, NO_SLOW);
}
else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */
/* b x m x r x bt x vn -> r x m x bt x b x vn */
INT vn = p->vn;
INT bt = p->tblock;
cld1 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_5d)
(b, m*r*bt*vn, vn,
m, r*bt*vn, bt*b*vn,
r, bt*vn, m*bt*b*vn,
bt, vn, b*vn,
vn, 1, 1),
I, O),
0, 0, NO_SLOW);
}
else { /* TRANSPOSED_IN permutation must be separate for in-place */
/* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */
INT vn = p->vn * p->tblock;
cld1 = X(mkplan_f_d)(plnr,
X(mkproblem_rdft_0_d)(X(mktensor_4d)
(m, r*vn, vn,
r, vn, m*vn,
vn, 1, 1,
b, np*vn, np*vn),
I, O),
0, 0, NO_SLOW);
}
if (XM(any_true)(!cld1, p->comm)) goto nada;
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
b = XM(block)(p->nx, r * p->block, me / r);
MPI_Comm_split(p->comm, me / r, me, &comm2);
if (b)
cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose)
(b, p->ny, p->vn,
O, I, p->block, m * p->tblock, comm2,
p->I != p->O
? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN)));
MPI_Comm_free(&comm2);
if (XM(any_true)(b && !cldtr, p->comm)) goto nada;
b = XM(block)(p->ny, m * p->tblock, me % r);
MPI_Comm_split(p->comm, me % r, me, &comm2);
if (b)
cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose)
(p->nx, b, p->vn,
I, O, r * p->block, p->tblock, comm2,
TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT)));
MPI_Comm_free(&comm2);
if (XM(any_true)(b && !cldtm, p->comm)) goto nada;
pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
pln->cld1 = cld1;
pln->cldtr = cldtr;
pln->cldtm = cldtm;
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
pln->r = r;
pln->nam = ego->nam;
pln->super.super.ops = cld1->ops;
if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops);
if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops);
return &(pln->super.super);
nada:
X(plan_destroy_internal)(cldtm);
X(plan_destroy_internal)(cldtr);
X(plan_destroy_internal)(cld1);
return (plan *) 0;
}
static solver *mksolver(int preserve_input,
int (*radix)(int np), const char *nam)
{
static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
S *slv = MKSOLVER(S, &sadt);
slv->preserve_input = preserve_input;
slv->radix = radix;
slv->nam = nam;
return &(slv->super);
}
void XM(transpose_recurse_register)(planner *p)
{
int preserve_input;
for (preserve_input = 0; preserve_input <= 1; ++preserve_input) {
REGISTER_SOLVER(p, mksolver(preserve_input, radix_sqrt, "sqrt"));
REGISTER_SOLVER(p, mksolver(preserve_input, radix_first, "first"));
}
}

View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "mpi-transpose.h"
/* use the apply() operation for MPI_TRANSPOSE problems */
void XM(transpose_solve)(const plan *ego_, const problem *p_)
{
const plan_mpi_transpose *ego = (const plan_mpi_transpose *) ego_;
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
}

View File

@@ -0,0 +1,112 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "fftw3-mpi.h"
#include "ifftw-mpi.h"
#include <string.h>
#if SIZEOF_SIZE_T == SIZEOF_UNSIGNED_INT
# define FFTW_MPI_SIZE_T MPI_UNSIGNED
#elif SIZEOF_SIZE_T == SIZEOF_UNSIGNED_LONG
# define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG
#elif SIZEOF_SIZE_T == SIZEOF_UNSIGNED_LONG_LONG
# define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG_LONG
#else
# error MPI type for size_t is unknown
# define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG
#endif
/* Import wisdom from all processes to process 0, as prelude to
exporting a single wisdom file (this is convenient when we are
running on identical processors, to avoid the annoyance of having
per-process wisdom files). In order to make the time for this
operation logarithmic in the number of processors (rather than
linear), we employ a tree reduction algorithm. This means that the
wisdom is modified on processes other than root, which shouldn't
matter in practice. */
void XM(gather_wisdom)(MPI_Comm comm_)
{
MPI_Comm comm, comm2;
int my_pe, n_pes;
char *wis;
size_t wislen;
MPI_Status status;
MPI_Comm_dup(comm_, &comm);
MPI_Comm_rank(comm, &my_pe);
MPI_Comm_size(comm, &n_pes);
if (n_pes > 2) { /* recursively split into even/odd processes */
MPI_Comm_split(comm, my_pe % 2, my_pe, &comm2);
XM(gather_wisdom)(comm2);
MPI_Comm_free(&comm2);
}
if (n_pes > 1 && my_pe < 2) { /* import process 1 -> 0 */
if (my_pe == 1) {
wis = X(export_wisdom_to_string)();
wislen = strlen(wis) + 1;
MPI_Send(&wislen, 1, FFTW_MPI_SIZE_T, 0, 111, comm);
MPI_Send(wis, wislen, MPI_CHAR, 0, 222, comm);
free(wis);
}
else /* my_pe == 0 */ {
MPI_Recv(&wislen, 1, FFTW_MPI_SIZE_T, 1, 111, comm, &status);
wis = (char *) MALLOC(wislen * sizeof(char), OTHER);
MPI_Recv(wis, wislen, MPI_CHAR, 1, 222, comm, &status);
if (!X(import_wisdom_from_string)(wis))
MPI_Abort(comm, 1);
X(ifree)(wis);
}
}
MPI_Comm_free(&comm);
}
/* broadcast wisdom from process 0 to all other processes; this
is useful so that we can import wisdom once and not worry
about parallel I/O or process-specific wisdom, although of
course it assumes that all the processes have identical
performance characteristics (i.e. identical hardware). */
void XM(broadcast_wisdom)(MPI_Comm comm_)
{
MPI_Comm comm;
int my_pe;
char *wis;
size_t wislen;
MPI_Comm_dup(comm_, &comm);
MPI_Comm_rank(comm, &my_pe);
if (my_pe != 0) {
MPI_Bcast(&wislen, 1, FFTW_MPI_SIZE_T, 0, comm);
wis = (char *) MALLOC(wislen * sizeof(char), OTHER);
MPI_Bcast(wis, wislen, MPI_CHAR, 0, comm);
if (!X(import_wisdom_from_string)(wis))
MPI_Abort(comm, 1);
X(ifree)(wis);
}
else /* my_pe == 0 */ {
wis = X(export_wisdom_to_string)();
wislen = strlen(wis) + 1;
MPI_Bcast(&wislen, 1, FFTW_MPI_SIZE_T, 0, comm);
MPI_Bcast(wis, wislen, MPI_CHAR, 0, comm);
X(free)(wis);
}
MPI_Comm_free(&comm);
}