Updates
This commit is contained in:
101
fftw-3.3.10/mpi/Makefile.am
Normal file
101
fftw-3.3.10/mpi/Makefile.am
Normal file
@@ -0,0 +1,101 @@
|
||||
# -I $(top_srcdir)/api is necessary because fftw3-mpi.h includes
|
||||
# "fftw3.h", and we cannot change the latter to "api/fftw3.h" because
|
||||
# fftw3-mpi.h is installed in /usr/include.
|
||||
AM_CPPFLAGS = -I $(top_srcdir) -I $(top_srcdir)/api
|
||||
|
||||
if MPI
|
||||
lib_LTLIBRARIES = libfftw3@PREC_SUFFIX@_mpi.la
|
||||
include_HEADERS = fftw3-mpi.h
|
||||
nodist_include_HEADERS = fftw3-mpi.f03 fftw3l-mpi.f03
|
||||
noinst_PROGRAMS = mpi-bench
|
||||
endif
|
||||
|
||||
CC=@MPICC@
|
||||
|
||||
EXTRA_DIST = testsched.c f03api.sh f03-wrap.sh genf03-wrap.pl fftw3-mpi.f03.in fftw3l-mpi.f03.in
|
||||
BUILT_SOURCES = fftw3-mpi.f03.in fftw3-mpi.f03 fftw3l-mpi.f03.in fftw3l-mpi.f03 f03-wrap.c
|
||||
CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03
|
||||
|
||||
TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-problem.c transpose-solve.c mpi-transpose.h
|
||||
DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
|
||||
RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
|
||||
RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h
|
||||
SRC = any-true.c api.c block.c choose-radix.c conf.c dtensor.c fftw3-mpi.h ifftw-mpi.h rearrange.c wisdom-api.c f03-wrap.c
|
||||
|
||||
libfftw3@PREC_SUFFIX@_mpi_la_SOURCES = $(SRC) $(TRANSPOSE_SRC) $(DFT_SRC) $(RDFT_SRC) $(RDFT2_SRC)
|
||||
|
||||
libfftw3@PREC_SUFFIX@_mpi_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@
|
||||
libfftw3@PREC_SUFFIX@_mpi_la_LIBADD = ../libfftw3@PREC_SUFFIX@.la @MPILIBS@
|
||||
|
||||
if THREADS
|
||||
mpi_bench_CFLAGS = $(PTHREAD_CFLAGS)
|
||||
if !COMBINED_THREADS
|
||||
LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_threads.la
|
||||
endif
|
||||
else
|
||||
if OPENMP
|
||||
mpi_bench_CFLAGS = $(OPENMP_CFLAGS)
|
||||
LIBFFTWTHREADS = $(top_builddir)/threads/libfftw3@PREC_SUFFIX@_omp.la
|
||||
endif
|
||||
endif
|
||||
|
||||
mpi_bench_SOURCES = mpi-bench.c $(top_srcdir)/tests/fftw-bench.c $(top_srcdir)/tests/hook.c
|
||||
mpi_bench_LDADD = libfftw3@PREC_SUFFIX@_mpi.la $(LIBFFTWTHREADS) $(top_builddir)/libfftw3@PREC_SUFFIX@.la $(top_builddir)/libbench2/libbench2.a $(MPILIBS) $(THREADLIBS)
|
||||
|
||||
CHECK = $(top_srcdir)/tests/check.pl
|
||||
NUMCHECK=10
|
||||
CHECKSIZE=10000
|
||||
CHECKOPTS = --verbose --random --maxsize=$(CHECKSIZE) -c=$(NUMCHECK) $(CHECK_PL_OPTS)
|
||||
|
||||
if MPI
|
||||
|
||||
check-local: mpi-bench$(EXEEXT)
|
||||
perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 1 `pwd`/mpi-bench"
|
||||
@echo "--------------------------------------------------------------"
|
||||
@echo " MPI FFTW transforms passed "$(NUMCHECK)" tests, 1 CPU"
|
||||
@echo "--------------------------------------------------------------"
|
||||
perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 2 `pwd`/mpi-bench"
|
||||
@echo "--------------------------------------------------------------"
|
||||
@echo " MPI FFTW transforms passed "$(NUMCHECK)" tests, 2 CPUs"
|
||||
@echo "--------------------------------------------------------------"
|
||||
perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 3 `pwd`/mpi-bench"
|
||||
@echo "--------------------------------------------------------------"
|
||||
@echo " MPI FFTW transforms passed "$(NUMCHECK)" tests, 3 CPUs"
|
||||
@echo "--------------------------------------------------------------"
|
||||
perl -w $(CHECK) $(CHECKOPTS) --mpi "$(MPIRUN) -np 4 `pwd`/mpi-bench"
|
||||
@echo "--------------------------------------------------------------"
|
||||
@echo " MPI FFTW transforms passed "$(NUMCHECK)" tests, 4 CPUs"
|
||||
@echo "--------------------------------------------------------------"
|
||||
if SMP
|
||||
perl -w $(CHECK) $(CHECKOPTS) --mpi --nthreads=2 "$(MPIRUN) -np 3 `pwd`/mpi-bench"
|
||||
@echo "--------------------------------------------------------------"
|
||||
@echo " MPI FFTW threaded transforms passed "$(NUMCHECK)" tests!"
|
||||
@echo "--------------------------------------------------------------"
|
||||
endif
|
||||
|
||||
bigcheck: mpi-bench$(EXEEXT)
|
||||
$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=100 CHECKSIZE=60000 check-local
|
||||
|
||||
smallcheck: mpi-bench$(EXEEXT)
|
||||
$(MAKE) $(AM_MAKEFLAGS) NUMCHECK=2 check-local
|
||||
|
||||
endif
|
||||
|
||||
fftw3-mpi.f03: fftw3-mpi.f03.in Makefile
|
||||
sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3-mpi.f03.in > $@
|
||||
|
||||
fftw3l-mpi.f03: fftw3l-mpi.f03.in Makefile
|
||||
sed 's/C_MPI_FINT/@C_MPI_FINT@/' $(srcdir)/fftw3l-mpi.f03.in > $@
|
||||
|
||||
if MAINTAINER_MODE
|
||||
|
||||
fftw3-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
|
||||
sh $(srcdir)/f03api.sh d f > $@
|
||||
|
||||
fftw3l-mpi.f03.in: fftw3-mpi.h f03api.sh $(top_srcdir)/api/genf03.pl
|
||||
sh $(srcdir)/f03api.sh l | grep -v parameter | sed 's/fftw3.f03/fftw3l.f03/' > $@
|
||||
|
||||
f03-wrap.c: fftw3-mpi.h f03-wrap.sh genf03-wrap.pl
|
||||
sh $(srcdir)/f03-wrap.sh > $@
|
||||
|
||||
endif
|
||||
1037
fftw-3.3.10/mpi/Makefile.in
Normal file
1037
fftw-3.3.10/mpi/Makefile.in
Normal file
File diff suppressed because it is too large
Load Diff
59
fftw-3.3.10/mpi/any-true.c
Normal file
59
fftw-3.3.10/mpi/any-true.c
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
/* During planning, if any process fails to create a plan then
|
||||
all of the processes must fail. This synchronization is implemented
|
||||
by the following routine.
|
||||
|
||||
Instead of
|
||||
if (failure) goto nada;
|
||||
we instead do:
|
||||
if (any_true(failure, comm)) goto nada;
|
||||
*/
|
||||
|
||||
int XM(any_true)(int condition, MPI_Comm comm)
|
||||
{
|
||||
int result;
|
||||
MPI_Allreduce(&condition, &result, 1, MPI_INT, MPI_LOR, comm);
|
||||
return result;
|
||||
}
|
||||
|
||||
/***********************************************************************/
|
||||
|
||||
#if defined(FFTW_DEBUG)
|
||||
/* for debugging, we include an assertion to make sure that
|
||||
MPI problems all produce equal hashes, as checked by this routine: */
|
||||
|
||||
int XM(md5_equal)(md5 m, MPI_Comm comm)
|
||||
{
|
||||
unsigned long s0[4];
|
||||
int i, eq_me, eq_all;
|
||||
|
||||
X(md5end)(&m);
|
||||
for (i = 0; i < 4; ++i) s0[i] = m.s[i];
|
||||
MPI_Bcast(s0, 4, MPI_UNSIGNED_LONG, 0, comm);
|
||||
for (i = 0; i < 4 && s0[i] == m.s[i]; ++i) ;
|
||||
eq_me = i == 4;
|
||||
MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
|
||||
return eq_all;
|
||||
}
|
||||
#endif
|
||||
907
fftw-3.3.10/mpi/api.c
Normal file
907
fftw-3.3.10/mpi/api.c
Normal file
@@ -0,0 +1,907 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "api/api.h"
|
||||
#include "fftw3-mpi.h"
|
||||
#include "ifftw-mpi.h"
|
||||
#include "mpi-transpose.h"
|
||||
#include "mpi-dft.h"
|
||||
#include "mpi-rdft.h"
|
||||
#include "mpi-rdft2.h"
|
||||
|
||||
/* Convert API flags to internal MPI flags. */
|
||||
#define MPI_FLAGS(f) ((f) >> 27)
|
||||
|
||||
/*************************************************************************/
|
||||
|
||||
static int mpi_inited = 0;
|
||||
|
||||
static MPI_Comm problem_comm(const problem *p) {
|
||||
switch (p->adt->problem_kind) {
|
||||
case PROBLEM_MPI_DFT:
|
||||
return ((const problem_mpi_dft *) p)->comm;
|
||||
case PROBLEM_MPI_RDFT:
|
||||
return ((const problem_mpi_rdft *) p)->comm;
|
||||
case PROBLEM_MPI_RDFT2:
|
||||
return ((const problem_mpi_rdft2 *) p)->comm;
|
||||
case PROBLEM_MPI_TRANSPOSE:
|
||||
return ((const problem_mpi_transpose *) p)->comm;
|
||||
default:
|
||||
return MPI_COMM_NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* used to synchronize cost measurements (timing or estimation)
|
||||
across all processes for an MPI problem, which is critical to
|
||||
ensure that all processes decide to use the same MPI plans
|
||||
(whereas serial plans need not be syncronized). */
|
||||
static double cost_hook(const problem *p, double t, cost_kind k)
|
||||
{
|
||||
MPI_Comm comm = problem_comm(p);
|
||||
double tsum;
|
||||
if (comm == MPI_COMM_NULL) return t;
|
||||
MPI_Allreduce(&t, &tsum, 1, MPI_DOUBLE,
|
||||
k == COST_SUM ? MPI_SUM : MPI_MAX, comm);
|
||||
return tsum;
|
||||
}
|
||||
|
||||
/* Used to reject wisdom that is not in sync across all processes
|
||||
for an MPI problem, which is critical to ensure that all processes
|
||||
decide to use the same MPI plans. (Even though costs are synchronized,
|
||||
above, out-of-sync wisdom may result from plans being produced
|
||||
by communicators that do not span all processes, either from a
|
||||
user-specified communicator or e.g. from transpose-recurse. */
|
||||
static int wisdom_ok_hook(const problem *p, flags_t flags)
|
||||
{
|
||||
MPI_Comm comm = problem_comm(p);
|
||||
int eq_me, eq_all;
|
||||
/* unpack flags bitfield, since MPI communications may involve
|
||||
byte-order changes and MPI cannot do this for bit fields */
|
||||
#if SIZEOF_UNSIGNED_INT >= 4 /* must be big enough to hold 20-bit fields */
|
||||
unsigned int f[5];
|
||||
#else
|
||||
unsigned long f[5]; /* at least 32 bits as per C standard */
|
||||
#endif
|
||||
|
||||
if (comm == MPI_COMM_NULL) return 1; /* non-MPI wisdom is always ok */
|
||||
|
||||
if (XM(any_true)(0, comm)) return 0; /* some process had nowisdom_hook */
|
||||
|
||||
/* otherwise, check that the flags and solver index are identical
|
||||
on all processes in this problem's communicator.
|
||||
|
||||
TO DO: possibly we can relax strict equality, but it is
|
||||
critical to ensure that any flags which affect what plan is
|
||||
created (and whether the solver is applicable) are the same,
|
||||
e.g. DESTROY_INPUT, NO_UGLY, etcetera. (If the MPI algorithm
|
||||
differs between processes, deadlocks/crashes generally result.) */
|
||||
f[0] = flags.l;
|
||||
f[1] = flags.hash_info;
|
||||
f[2] = flags.timelimit_impatience;
|
||||
f[3] = flags.u;
|
||||
f[4] = flags.slvndx;
|
||||
MPI_Bcast(f, 5,
|
||||
SIZEOF_UNSIGNED_INT >= 4 ? MPI_UNSIGNED : MPI_UNSIGNED_LONG,
|
||||
0, comm);
|
||||
eq_me = f[0] == flags.l && f[1] == flags.hash_info
|
||||
&& f[2] == flags.timelimit_impatience
|
||||
&& f[3] == flags.u && f[4] == flags.slvndx;
|
||||
MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
|
||||
return eq_all;
|
||||
}
|
||||
|
||||
/* This hook is called when wisdom is not found. The any_true here
|
||||
matches up with the any_true in wisdom_ok_hook, in order to handle
|
||||
the case where some processes had wisdom (and called wisdom_ok_hook)
|
||||
and some processes didn't have wisdom (and called nowisdom_hook). */
|
||||
static void nowisdom_hook(const problem *p)
|
||||
{
|
||||
MPI_Comm comm = problem_comm(p);
|
||||
if (comm == MPI_COMM_NULL) return; /* nothing to do for non-MPI p */
|
||||
XM(any_true)(1, comm); /* signal nowisdom to any wisdom_ok_hook */
|
||||
}
|
||||
|
||||
/* needed to synchronize planner bogosity flag, in case non-MPI problems
|
||||
on a subset of processes encountered bogus wisdom */
|
||||
static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p)
|
||||
{
|
||||
MPI_Comm comm = problem_comm(p);
|
||||
if (comm != MPI_COMM_NULL /* an MPI problem */
|
||||
&& XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */
|
||||
return WISDOM_IS_BOGUS;
|
||||
return state;
|
||||
}
|
||||
|
||||
void XM(init)(void)
|
||||
{
|
||||
if (!mpi_inited) {
|
||||
planner *plnr = X(the_planner)();
|
||||
plnr->cost_hook = cost_hook;
|
||||
plnr->wisdom_ok_hook = wisdom_ok_hook;
|
||||
plnr->nowisdom_hook = nowisdom_hook;
|
||||
plnr->bogosity_hook = bogosity_hook;
|
||||
XM(conf_standard)(plnr);
|
||||
mpi_inited = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void XM(cleanup)(void)
|
||||
{
|
||||
X(cleanup)();
|
||||
mpi_inited = 0;
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
|
||||
static dtensor *mkdtensor_api(int rnk, const XM(ddim) *dims0)
|
||||
{
|
||||
dtensor *x = XM(mkdtensor)(rnk);
|
||||
int i;
|
||||
for (i = 0; i < rnk; ++i) {
|
||||
x->dims[i].n = dims0[i].n;
|
||||
x->dims[i].b[IB] = dims0[i].ib;
|
||||
x->dims[i].b[OB] = dims0[i].ob;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
static dtensor *default_sz(int rnk, const XM(ddim) *dims0, int n_pes,
|
||||
int rdft2)
|
||||
{
|
||||
dtensor *sz = XM(mkdtensor)(rnk);
|
||||
dtensor *sz0 = mkdtensor_api(rnk, dims0);
|
||||
block_kind k;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < rnk; ++i)
|
||||
sz->dims[i].n = dims0[i].n;
|
||||
|
||||
if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
|
||||
|
||||
for (i = 0; i < rnk; ++i) {
|
||||
sz->dims[i].b[IB] = dims0[i].ib ? dims0[i].ib : sz->dims[i].n;
|
||||
sz->dims[i].b[OB] = dims0[i].ob ? dims0[i].ob : sz->dims[i].n;
|
||||
}
|
||||
|
||||
/* If we haven't used all of the processes yet, and some of the
|
||||
block sizes weren't specified (i.e. 0), then set the
|
||||
unspecified blocks so as to use as many processes as
|
||||
possible with as few distributed dimensions as possible. */
|
||||
FORALL_BLOCK_KIND(k) {
|
||||
INT nb = XM(num_blocks_total)(sz, k);
|
||||
INT np = n_pes / nb;
|
||||
for (i = 0; i < rnk && np > 1; ++i)
|
||||
if (!sz0->dims[i].b[k]) {
|
||||
sz->dims[i].b[k] = XM(default_block)(sz->dims[i].n, np);
|
||||
nb *= XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]);
|
||||
np = n_pes / nb;
|
||||
}
|
||||
}
|
||||
|
||||
if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n;
|
||||
|
||||
/* punt for 1d prime */
|
||||
if (rnk == 1 && X(is_prime)(sz->dims[0].n))
|
||||
sz->dims[0].b[IB] = sz->dims[0].b[OB] = sz->dims[0].n;
|
||||
|
||||
XM(dtensor_destroy)(sz0);
|
||||
sz0 = XM(dtensor_canonical)(sz, 0);
|
||||
XM(dtensor_destroy)(sz);
|
||||
return sz0;
|
||||
}
|
||||
|
||||
/* allocate simple local (serial) dims array corresponding to n[rnk] */
|
||||
static XM(ddim) *simple_dims(int rnk, const ptrdiff_t *n)
|
||||
{
|
||||
XM(ddim) *dims = (XM(ddim) *) MALLOC(sizeof(XM(ddim)) * rnk,
|
||||
TENSORS);
|
||||
int i;
|
||||
for (i = 0; i < rnk; ++i)
|
||||
dims[i].n = dims[i].ib = dims[i].ob = n[i];
|
||||
return dims;
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
|
||||
static void local_size(int my_pe, const dtensor *sz, block_kind k,
|
||||
ptrdiff_t *local_n, ptrdiff_t *local_start)
|
||||
{
|
||||
int i;
|
||||
if (my_pe >= XM(num_blocks_total)(sz, k))
|
||||
for (i = 0; i < sz->rnk; ++i)
|
||||
local_n[i] = local_start[i] = 0;
|
||||
else {
|
||||
XM(block_coords)(sz, k, my_pe, local_start);
|
||||
for (i = 0; i < sz->rnk; ++i) {
|
||||
local_n[i] = XM(block)(sz->dims[i].n, sz->dims[i].b[k],
|
||||
local_start[i]);
|
||||
local_start[i] *= sz->dims[i].b[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static INT prod(int rnk, const ptrdiff_t *local_n)
|
||||
{
|
||||
int i;
|
||||
INT N = 1;
|
||||
for (i = 0; i < rnk; ++i) N *= local_n[i];
|
||||
return N;
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_guru)(int rnk, const XM(ddim) *dims0,
|
||||
ptrdiff_t howmany, MPI_Comm comm,
|
||||
ptrdiff_t *local_n_in,
|
||||
ptrdiff_t *local_start_in,
|
||||
ptrdiff_t *local_n_out,
|
||||
ptrdiff_t *local_start_out,
|
||||
int sign, unsigned flags)
|
||||
{
|
||||
INT N;
|
||||
int my_pe, n_pes, i;
|
||||
dtensor *sz;
|
||||
|
||||
if (rnk == 0)
|
||||
return howmany;
|
||||
|
||||
MPI_Comm_rank(comm, &my_pe);
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
sz = default_sz(rnk, dims0, n_pes, 0);
|
||||
|
||||
/* Now, we must figure out how much local space the user should
|
||||
allocate (or at least an upper bound). This depends strongly
|
||||
on the exact algorithms we employ...ugh! FIXME: get this info
|
||||
from the solvers somehow? */
|
||||
N = 1; /* never return zero allocation size */
|
||||
if (rnk > 1 && XM(is_block1d)(sz, IB) && XM(is_block1d)(sz, OB)) {
|
||||
INT Nafter;
|
||||
ddim odims[2];
|
||||
|
||||
/* dft-rank-geq2-transposed */
|
||||
odims[0] = sz->dims[0]; odims[1] = sz->dims[1]; /* save */
|
||||
/* we may need extra space for transposed intermediate data */
|
||||
for (i = 0; i < 2; ++i)
|
||||
if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[IB]) == 1 &&
|
||||
XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[OB]) == 1) {
|
||||
sz->dims[i].b[IB]
|
||||
= XM(default_block)(sz->dims[i].n, n_pes);
|
||||
sz->dims[1-i].b[IB] = sz->dims[1-i].n;
|
||||
local_size(my_pe, sz, IB, local_n_in, local_start_in);
|
||||
N = X(imax)(N, prod(rnk, local_n_in));
|
||||
sz->dims[i] = odims[i];
|
||||
sz->dims[1-i] = odims[1-i];
|
||||
break;
|
||||
}
|
||||
|
||||
/* dft-rank-geq2 */
|
||||
Nafter = howmany;
|
||||
for (i = 1; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
|
||||
N = X(imax)(N, (sz->dims[0].n
|
||||
* XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
|
||||
my_pe) + howmany - 1) / howmany);
|
||||
|
||||
/* dft-rank-geq2 with dimensions swapped */
|
||||
Nafter = howmany * sz->dims[0].n;
|
||||
for (i = 2; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
|
||||
N = X(imax)(N, (sz->dims[1].n
|
||||
* XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
|
||||
my_pe) + howmany - 1) / howmany);
|
||||
}
|
||||
else if (rnk == 1) {
|
||||
if (howmany >= n_pes && !MPI_FLAGS(flags)) { /* dft-rank1-bigvec */
|
||||
ptrdiff_t n[2], start[2];
|
||||
dtensor *sz2 = XM(mkdtensor)(2);
|
||||
sz2->dims[0] = sz->dims[0];
|
||||
sz2->dims[0].b[IB] = sz->dims[0].n;
|
||||
sz2->dims[1].n = sz2->dims[1].b[OB] = howmany;
|
||||
sz2->dims[1].b[IB] = XM(default_block)(howmany, n_pes);
|
||||
local_size(my_pe, sz2, IB, n, start);
|
||||
XM(dtensor_destroy)(sz2);
|
||||
N = X(imax)(N, (prod(2, n) + howmany - 1) / howmany);
|
||||
}
|
||||
else { /* dft-rank1 */
|
||||
INT r, m, rblock[2], mblock[2];
|
||||
|
||||
/* Since the 1d transforms are so different, we require
|
||||
the user to call local_size_1d for this case. Ugh. */
|
||||
CK(sign == FFTW_FORWARD || sign == FFTW_BACKWARD);
|
||||
|
||||
if ((r = XM(choose_radix)(sz->dims[0], n_pes, flags, sign,
|
||||
rblock, mblock))) {
|
||||
m = sz->dims[0].n / r;
|
||||
if (flags & FFTW_MPI_SCRAMBLED_IN)
|
||||
sz->dims[0].b[IB] = rblock[IB] * m;
|
||||
else { /* !SCRAMBLED_IN */
|
||||
sz->dims[0].b[IB] = r * mblock[IB];
|
||||
N = X(imax)(N, rblock[IB] * m);
|
||||
}
|
||||
if (flags & FFTW_MPI_SCRAMBLED_OUT)
|
||||
sz->dims[0].b[OB] = r * mblock[OB];
|
||||
else { /* !SCRAMBLED_OUT */
|
||||
N = X(imax)(N, r * mblock[OB]);
|
||||
sz->dims[0].b[OB] = rblock[OB] * m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
local_size(my_pe, sz, IB, local_n_in, local_start_in);
|
||||
local_size(my_pe, sz, OB, local_n_out, local_start_out);
|
||||
|
||||
/* at least, make sure we have enough space to store input & output */
|
||||
N = X(imax)(N, X(imax)(prod(rnk, local_n_in), prod(rnk, local_n_out)));
|
||||
|
||||
XM(dtensor_destroy)(sz);
|
||||
return N * howmany;
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_many_transposed)(int rnk, const ptrdiff_t *n,
|
||||
ptrdiff_t howmany,
|
||||
ptrdiff_t xblock, ptrdiff_t yblock,
|
||||
MPI_Comm comm,
|
||||
ptrdiff_t *local_nx,
|
||||
ptrdiff_t *local_x_start,
|
||||
ptrdiff_t *local_ny,
|
||||
ptrdiff_t *local_y_start)
|
||||
{
|
||||
ptrdiff_t N;
|
||||
XM(ddim) *dims;
|
||||
ptrdiff_t *local;
|
||||
|
||||
if (rnk == 0) {
|
||||
*local_nx = *local_ny = 1;
|
||||
*local_x_start = *local_y_start = 0;
|
||||
return howmany;
|
||||
}
|
||||
|
||||
dims = simple_dims(rnk, n);
|
||||
local = (ptrdiff_t *) MALLOC(sizeof(ptrdiff_t) * rnk * 4, TENSORS);
|
||||
|
||||
/* default 1d block distribution, with transposed output
|
||||
if yblock < n[1] */
|
||||
dims[0].ib = xblock;
|
||||
if (rnk > 1) {
|
||||
if (yblock < n[1])
|
||||
dims[1].ob = yblock;
|
||||
else
|
||||
dims[0].ob = xblock;
|
||||
}
|
||||
else
|
||||
dims[0].ob = xblock; /* FIXME: 1d not really supported here
|
||||
since we don't have flags/sign */
|
||||
|
||||
N = XM(local_size_guru)(rnk, dims, howmany, comm,
|
||||
local, local + rnk,
|
||||
local + 2*rnk, local + 3*rnk,
|
||||
0, 0);
|
||||
*local_nx = local[0];
|
||||
*local_x_start = local[rnk];
|
||||
if (rnk > 1) {
|
||||
*local_ny = local[2*rnk + 1];
|
||||
*local_y_start = local[3*rnk + 1];
|
||||
}
|
||||
else {
|
||||
*local_ny = *local_nx;
|
||||
*local_y_start = *local_x_start;
|
||||
}
|
||||
X(ifree)(local);
|
||||
X(ifree)(dims);
|
||||
return N;
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_many)(int rnk, const ptrdiff_t *n,
|
||||
ptrdiff_t howmany,
|
||||
ptrdiff_t xblock,
|
||||
MPI_Comm comm,
|
||||
ptrdiff_t *local_nx,
|
||||
ptrdiff_t *local_x_start)
|
||||
{
|
||||
ptrdiff_t local_ny, local_y_start;
|
||||
return XM(local_size_many_transposed)(rnk, n, howmany,
|
||||
xblock, rnk > 1
|
||||
? n[1] : FFTW_MPI_DEFAULT_BLOCK,
|
||||
comm,
|
||||
local_nx, local_x_start,
|
||||
&local_ny, &local_y_start);
|
||||
}
|
||||
|
||||
|
||||
ptrdiff_t XM(local_size_transposed)(int rnk, const ptrdiff_t *n,
|
||||
MPI_Comm comm,
|
||||
ptrdiff_t *local_nx,
|
||||
ptrdiff_t *local_x_start,
|
||||
ptrdiff_t *local_ny,
|
||||
ptrdiff_t *local_y_start)
|
||||
{
|
||||
return XM(local_size_many_transposed)(rnk, n, 1,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
comm,
|
||||
local_nx, local_x_start,
|
||||
local_ny, local_y_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size)(int rnk, const ptrdiff_t *n,
|
||||
MPI_Comm comm,
|
||||
ptrdiff_t *local_nx,
|
||||
ptrdiff_t *local_x_start)
|
||||
{
|
||||
return XM(local_size_many)(rnk, n, 1, FFTW_MPI_DEFAULT_BLOCK, comm,
|
||||
local_nx, local_x_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_many_1d)(ptrdiff_t nx, ptrdiff_t howmany,
|
||||
MPI_Comm comm, int sign, unsigned flags,
|
||||
ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
|
||||
ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
|
||||
{
|
||||
XM(ddim) d;
|
||||
d.n = nx;
|
||||
d.ib = d.ob = FFTW_MPI_DEFAULT_BLOCK;
|
||||
return XM(local_size_guru)(1, &d, howmany, comm,
|
||||
local_nx, local_x_start,
|
||||
local_ny, local_y_start, sign, flags);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_1d)(ptrdiff_t nx,
|
||||
MPI_Comm comm, int sign, unsigned flags,
|
||||
ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
|
||||
ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
|
||||
{
|
||||
return XM(local_size_many_1d)(nx, 1, comm, sign, flags,
|
||||
local_nx, local_x_start,
|
||||
local_ny, local_y_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_2d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
|
||||
MPI_Comm comm,
|
||||
ptrdiff_t *local_nx,
|
||||
ptrdiff_t *local_x_start,
|
||||
ptrdiff_t *local_ny,
|
||||
ptrdiff_t *local_y_start)
|
||||
{
|
||||
ptrdiff_t n[2];
|
||||
n[0] = nx; n[1] = ny;
|
||||
return XM(local_size_transposed)(2, n, comm,
|
||||
local_nx, local_x_start,
|
||||
local_ny, local_y_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_2d)(ptrdiff_t nx, ptrdiff_t ny, MPI_Comm comm,
|
||||
ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
|
||||
{
|
||||
ptrdiff_t n[2];
|
||||
n[0] = nx; n[1] = ny;
|
||||
return XM(local_size)(2, n, comm, local_nx, local_x_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_3d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
|
||||
ptrdiff_t nz,
|
||||
MPI_Comm comm,
|
||||
ptrdiff_t *local_nx,
|
||||
ptrdiff_t *local_x_start,
|
||||
ptrdiff_t *local_ny,
|
||||
ptrdiff_t *local_y_start)
|
||||
{
|
||||
ptrdiff_t n[3];
|
||||
n[0] = nx; n[1] = ny; n[2] = nz;
|
||||
return XM(local_size_transposed)(3, n, comm,
|
||||
local_nx, local_x_start,
|
||||
local_ny, local_y_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
|
||||
MPI_Comm comm,
|
||||
ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
|
||||
{
|
||||
ptrdiff_t n[3];
|
||||
n[0] = nx; n[1] = ny; n[2] = nz;
|
||||
return XM(local_size)(3, n, comm, local_nx, local_x_start);
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Transpose API */
|
||||
|
||||
X(plan) XM(plan_many_transpose)(ptrdiff_t nx, ptrdiff_t ny,
|
||||
ptrdiff_t howmany,
|
||||
ptrdiff_t xblock, ptrdiff_t yblock,
|
||||
R *in, R *out,
|
||||
MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
int n_pes;
|
||||
XM(init)();
|
||||
|
||||
if (howmany < 0 || xblock < 0 || yblock < 0 ||
|
||||
nx <= 0 || ny <= 0) return 0;
|
||||
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
if (!xblock) xblock = XM(default_block)(nx, n_pes);
|
||||
if (!yblock) yblock = XM(default_block)(ny, n_pes);
|
||||
if (n_pes < XM(num_blocks)(nx, xblock)
|
||||
|| n_pes < XM(num_blocks)(ny, yblock))
|
||||
return 0;
|
||||
|
||||
return
|
||||
X(mkapiplan)(FFTW_FORWARD, flags,
|
||||
XM(mkproblem_transpose)(nx, ny, howmany,
|
||||
in, out, xblock, yblock,
|
||||
comm, MPI_FLAGS(flags)));
|
||||
}
|
||||
|
||||
X(plan) XM(plan_transpose)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
|
||||
MPI_Comm comm, unsigned flags)
|
||||
|
||||
{
|
||||
return XM(plan_many_transpose)(nx, ny, 1,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
in, out, comm, flags);
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* Complex DFT API */
|
||||
|
||||
X(plan) XM(plan_guru_dft)(int rnk, const XM(ddim) *dims0,
|
||||
ptrdiff_t howmany,
|
||||
C *in, C *out,
|
||||
MPI_Comm comm, int sign, unsigned flags)
|
||||
{
|
||||
int n_pes, i;
|
||||
dtensor *sz;
|
||||
|
||||
XM(init)();
|
||||
|
||||
if (howmany < 0 || rnk < 1) return 0;
|
||||
for (i = 0; i < rnk; ++i)
|
||||
if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
|
||||
return 0;
|
||||
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
sz = default_sz(rnk, dims0, n_pes, 0);
|
||||
|
||||
if (XM(num_blocks_total)(sz, IB) > n_pes
|
||||
|| XM(num_blocks_total)(sz, OB) > n_pes) {
|
||||
XM(dtensor_destroy)(sz);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return
|
||||
X(mkapiplan)(sign, flags,
|
||||
XM(mkproblem_dft_d)(sz, howmany,
|
||||
(R *) in, (R *) out,
|
||||
comm, sign,
|
||||
MPI_FLAGS(flags)));
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_dft)(int rnk, const ptrdiff_t *n,
|
||||
ptrdiff_t howmany,
|
||||
ptrdiff_t iblock, ptrdiff_t oblock,
|
||||
C *in, C *out,
|
||||
MPI_Comm comm, int sign, unsigned flags)
|
||||
{
|
||||
XM(ddim) *dims = simple_dims(rnk, n);
|
||||
X(plan) pln;
|
||||
|
||||
if (rnk == 1) {
|
||||
dims[0].ib = iblock;
|
||||
dims[0].ob = oblock;
|
||||
}
|
||||
else if (rnk > 1) {
|
||||
dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
|
||||
dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
|
||||
}
|
||||
|
||||
pln = XM(plan_guru_dft)(rnk,dims,howmany, in,out, comm, sign, flags);
|
||||
X(ifree)(dims);
|
||||
return pln;
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft)(int rnk, const ptrdiff_t *n, C *in, C *out,
|
||||
MPI_Comm comm, int sign, unsigned flags)
|
||||
{
|
||||
return XM(plan_many_dft)(rnk, n, 1,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
in, out, comm, sign, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_1d)(ptrdiff_t nx, C *in, C *out,
|
||||
MPI_Comm comm, int sign, unsigned flags)
|
||||
{
|
||||
return XM(plan_dft)(1, &nx, in, out, comm, sign, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, C *out,
|
||||
MPI_Comm comm, int sign, unsigned flags)
|
||||
{
|
||||
ptrdiff_t n[2];
|
||||
n[0] = nx; n[1] = ny;
|
||||
return XM(plan_dft)(2, n, in, out, comm, sign, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
|
||||
C *in, C *out,
|
||||
MPI_Comm comm, int sign, unsigned flags)
|
||||
{
|
||||
ptrdiff_t n[3];
|
||||
n[0] = nx; n[1] = ny; n[2] = nz;
|
||||
return XM(plan_dft)(3, n, in, out, comm, sign, flags);
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* R2R API */
|
||||
|
||||
X(plan) XM(plan_guru_r2r)(int rnk, const XM(ddim) *dims0,
|
||||
ptrdiff_t howmany,
|
||||
R *in, R *out,
|
||||
MPI_Comm comm, const X(r2r_kind) *kind,
|
||||
unsigned flags)
|
||||
{
|
||||
int n_pes, i;
|
||||
dtensor *sz;
|
||||
rdft_kind *k;
|
||||
X(plan) pln;
|
||||
|
||||
XM(init)();
|
||||
|
||||
if (howmany < 0 || rnk < 1) return 0;
|
||||
for (i = 0; i < rnk; ++i)
|
||||
if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
|
||||
return 0;
|
||||
|
||||
k = X(map_r2r_kind)(rnk, kind);
|
||||
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
sz = default_sz(rnk, dims0, n_pes, 0);
|
||||
|
||||
if (XM(num_blocks_total)(sz, IB) > n_pes
|
||||
|| XM(num_blocks_total)(sz, OB) > n_pes) {
|
||||
XM(dtensor_destroy)(sz);
|
||||
return 0;
|
||||
}
|
||||
|
||||
pln = X(mkapiplan)(0, flags,
|
||||
XM(mkproblem_rdft_d)(sz, howmany,
|
||||
in, out,
|
||||
comm, k, MPI_FLAGS(flags)));
|
||||
X(ifree0)(k);
|
||||
return pln;
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_r2r)(int rnk, const ptrdiff_t *n,
|
||||
ptrdiff_t howmany,
|
||||
ptrdiff_t iblock, ptrdiff_t oblock,
|
||||
R *in, R *out,
|
||||
MPI_Comm comm, const X(r2r_kind) *kind,
|
||||
unsigned flags)
|
||||
{
|
||||
XM(ddim) *dims = simple_dims(rnk, n);
|
||||
X(plan) pln;
|
||||
|
||||
if (rnk == 1) {
|
||||
dims[0].ib = iblock;
|
||||
dims[0].ob = oblock;
|
||||
}
|
||||
else if (rnk > 1) {
|
||||
dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
|
||||
dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
|
||||
}
|
||||
|
||||
pln = XM(plan_guru_r2r)(rnk,dims,howmany, in,out, comm, kind, flags);
|
||||
X(ifree)(dims);
|
||||
return pln;
|
||||
}
|
||||
|
||||
X(plan) XM(plan_r2r)(int rnk, const ptrdiff_t *n, R *in, R *out,
|
||||
MPI_Comm comm,
|
||||
const X(r2r_kind) *kind,
|
||||
unsigned flags)
|
||||
{
|
||||
return XM(plan_many_r2r)(rnk, n, 1,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
in, out, comm, kind, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_r2r_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
|
||||
MPI_Comm comm,
|
||||
X(r2r_kind) kindx, X(r2r_kind) kindy,
|
||||
unsigned flags)
|
||||
{
|
||||
ptrdiff_t n[2];
|
||||
X(r2r_kind) kind[2];
|
||||
n[0] = nx; n[1] = ny;
|
||||
kind[0] = kindx; kind[1] = kindy;
|
||||
return XM(plan_r2r)(2, n, in, out, comm, kind, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_r2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
|
||||
R *in, R *out,
|
||||
MPI_Comm comm,
|
||||
X(r2r_kind) kindx, X(r2r_kind) kindy,
|
||||
X(r2r_kind) kindz,
|
||||
unsigned flags)
|
||||
{
|
||||
ptrdiff_t n[3];
|
||||
X(r2r_kind) kind[3];
|
||||
n[0] = nx; n[1] = ny; n[2] = nz;
|
||||
kind[0] = kindx; kind[1] = kindy; kind[2] = kindz;
|
||||
return XM(plan_r2r)(3, n, in, out, comm, kind, flags);
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* R2C/C2R API */
|
||||
|
||||
static X(plan) plan_guru_rdft2(int rnk, const XM(ddim) *dims0,
|
||||
ptrdiff_t howmany,
|
||||
R *r, C *c,
|
||||
MPI_Comm comm, rdft_kind kind, unsigned flags)
|
||||
{
|
||||
int n_pes, i;
|
||||
dtensor *sz;
|
||||
R *cr = (R *) c;
|
||||
|
||||
XM(init)();
|
||||
|
||||
if (howmany < 0 || rnk < 2) return 0;
|
||||
for (i = 0; i < rnk; ++i)
|
||||
if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
|
||||
return 0;
|
||||
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
sz = default_sz(rnk, dims0, n_pes, 1);
|
||||
|
||||
sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
|
||||
if (XM(num_blocks_total)(sz, IB) > n_pes
|
||||
|| XM(num_blocks_total)(sz, OB) > n_pes) {
|
||||
XM(dtensor_destroy)(sz);
|
||||
return 0;
|
||||
}
|
||||
sz->dims[rnk-1].n = dims0[rnk-1].n;
|
||||
|
||||
if (kind == R2HC)
|
||||
return X(mkapiplan)(0, flags,
|
||||
XM(mkproblem_rdft2_d)(sz, howmany,
|
||||
r, cr, comm, R2HC,
|
||||
MPI_FLAGS(flags)));
|
||||
else
|
||||
return X(mkapiplan)(0, flags,
|
||||
XM(mkproblem_rdft2_d)(sz, howmany,
|
||||
cr, r, comm, HC2R,
|
||||
MPI_FLAGS(flags)));
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_dft_r2c)(int rnk, const ptrdiff_t *n,
|
||||
ptrdiff_t howmany,
|
||||
ptrdiff_t iblock, ptrdiff_t oblock,
|
||||
R *in, C *out,
|
||||
MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
XM(ddim) *dims = simple_dims(rnk, n);
|
||||
X(plan) pln;
|
||||
|
||||
if (rnk == 1) {
|
||||
dims[0].ib = iblock;
|
||||
dims[0].ob = oblock;
|
||||
}
|
||||
else if (rnk > 1) {
|
||||
dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
|
||||
dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
|
||||
}
|
||||
|
||||
pln = plan_guru_rdft2(rnk,dims,howmany, in,out, comm, R2HC, flags);
|
||||
X(ifree)(dims);
|
||||
return pln;
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_dft_c2r)(int rnk, const ptrdiff_t *n,
|
||||
ptrdiff_t howmany,
|
||||
ptrdiff_t iblock, ptrdiff_t oblock,
|
||||
C *in, R *out,
|
||||
MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
XM(ddim) *dims = simple_dims(rnk, n);
|
||||
X(plan) pln;
|
||||
|
||||
if (rnk == 1) {
|
||||
dims[0].ib = iblock;
|
||||
dims[0].ob = oblock;
|
||||
}
|
||||
else if (rnk > 1) {
|
||||
dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
|
||||
dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
|
||||
}
|
||||
|
||||
pln = plan_guru_rdft2(rnk,dims,howmany, out,in, comm, HC2R, flags);
|
||||
X(ifree)(dims);
|
||||
return pln;
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_r2c)(int rnk, const ptrdiff_t *n, R *in, C *out,
|
||||
MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
return XM(plan_many_dft_r2c)(rnk, n, 1,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
in, out, comm, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_r2c_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, C *out,
|
||||
MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
ptrdiff_t n[2];
|
||||
n[0] = nx; n[1] = ny;
|
||||
return XM(plan_dft_r2c)(2, n, in, out, comm, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_r2c_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
|
||||
R *in, C *out, MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
ptrdiff_t n[3];
|
||||
n[0] = nx; n[1] = ny; n[2] = nz;
|
||||
return XM(plan_dft_r2c)(3, n, in, out, comm, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_c2r)(int rnk, const ptrdiff_t *n, C *in, R *out,
|
||||
MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
return XM(plan_many_dft_c2r)(rnk, n, 1,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
in, out, comm, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_c2r_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, R *out,
|
||||
MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
ptrdiff_t n[2];
|
||||
n[0] = nx; n[1] = ny;
|
||||
return XM(plan_dft_c2r)(2, n, in, out, comm, flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_c2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
|
||||
C *in, R *out, MPI_Comm comm, unsigned flags)
|
||||
{
|
||||
ptrdiff_t n[3];
|
||||
n[0] = nx; n[1] = ny; n[2] = nz;
|
||||
return XM(plan_dft_c2r)(3, n, in, out, comm, flags);
|
||||
}
|
||||
|
||||
/*************************************************************************/
|
||||
/* New-array execute functions */
|
||||
|
||||
void XM(execute_dft)(const X(plan) p, C *in, C *out) {
|
||||
/* internally, MPI plans are just rdft plans */
|
||||
X(execute_r2r)(p, (R*) in, (R*) out);
|
||||
}
|
||||
|
||||
void XM(execute_dft_r2c)(const X(plan) p, R *in, C *out) {
|
||||
/* internally, MPI plans are just rdft plans */
|
||||
X(execute_r2r)(p, in, (R*) out);
|
||||
}
|
||||
|
||||
void XM(execute_dft_c2r)(const X(plan) p, C *in, R *out) {
|
||||
/* internally, MPI plans are just rdft plans */
|
||||
X(execute_r2r)(p, (R*) in, out);
|
||||
}
|
||||
|
||||
void XM(execute_r2r)(const X(plan) p, R *in, R *out) {
|
||||
/* internally, MPI plans are just rdft plans */
|
||||
X(execute_r2r)(p, in, out);
|
||||
}
|
||||
131
fftw-3.3.10/mpi/block.c
Normal file
131
fftw-3.3.10/mpi/block.c
Normal file
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
INT XM(num_blocks)(INT n, INT block)
|
||||
{
|
||||
return (n + block - 1) / block;
|
||||
}
|
||||
|
||||
int XM(num_blocks_ok)(INT n, INT block, MPI_Comm comm)
|
||||
{
|
||||
int n_pes;
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
return n_pes >= XM(num_blocks)(n, block);
|
||||
}
|
||||
|
||||
/* Pick a default block size for dividing a problem of size n among
|
||||
n_pes processes. Divide as equally as possible, while minimizing
|
||||
the maximum block size among the processes as well as the number of
|
||||
processes with nonzero blocks. */
|
||||
INT XM(default_block)(INT n, int n_pes)
|
||||
{
|
||||
return ((n + n_pes - 1) / n_pes);
|
||||
}
|
||||
|
||||
/* For a given block size and dimension n, compute the block size
|
||||
on the given process. */
|
||||
INT XM(block)(INT n, INT block, int which_block)
|
||||
{
|
||||
INT d = n - which_block * block;
|
||||
return d <= 0 ? 0 : (d > block ? block : d);
|
||||
}
|
||||
|
||||
static INT num_blocks_kind(const ddim *dim, block_kind k)
|
||||
{
|
||||
return XM(num_blocks)(dim->n, dim->b[k]);
|
||||
}
|
||||
|
||||
INT XM(num_blocks_total)(const dtensor *sz, block_kind k)
|
||||
{
|
||||
if (FINITE_RNK(sz->rnk)) {
|
||||
int i;
|
||||
INT ntot = 1;
|
||||
for (i = 0; i < sz->rnk; ++i)
|
||||
ntot *= num_blocks_kind(sz->dims + i, k);
|
||||
return ntot;
|
||||
}
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
int XM(idle_process)(const dtensor *sz, block_kind k, int which_pe)
|
||||
{
|
||||
return (which_pe >= XM(num_blocks_total)(sz, k));
|
||||
}
|
||||
|
||||
/* Given a non-idle process which_pe, computes the coordinate
|
||||
vector coords[rnk] giving the coordinates of a block in the
|
||||
matrix of blocks. k specifies whether we are talking about
|
||||
the input or output data distribution. */
|
||||
void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe,
|
||||
INT *coords)
|
||||
{
|
||||
int i;
|
||||
A(!XM(idle_process)(sz, k, which_pe) && FINITE_RNK(sz->rnk));
|
||||
for (i = sz->rnk - 1; i >= 0; --i) {
|
||||
INT nb = num_blocks_kind(sz->dims + i, k);
|
||||
coords[i] = which_pe % nb;
|
||||
which_pe /= nb;
|
||||
}
|
||||
}
|
||||
|
||||
INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe)
|
||||
{
|
||||
if (XM(idle_process)(sz, k, which_pe))
|
||||
return 0;
|
||||
else {
|
||||
int i;
|
||||
INT N = 1, *coords;
|
||||
STACK_MALLOC(INT*, coords, sizeof(INT) * sz->rnk);
|
||||
XM(block_coords)(sz, k, which_pe, coords);
|
||||
for (i = 0; i < sz->rnk; ++i)
|
||||
N *= XM(block)(sz->dims[i].n, sz->dims[i].b[k], coords[i]);
|
||||
STACK_FREE(coords);
|
||||
return N;
|
||||
}
|
||||
}
|
||||
|
||||
/* returns whether sz is local for dims >= dim */
|
||||
int XM(is_local_after)(int dim, const dtensor *sz, block_kind k)
|
||||
{
|
||||
if (FINITE_RNK(sz->rnk))
|
||||
for (; dim < sz->rnk; ++dim)
|
||||
if (XM(num_blocks)(sz->dims[dim].n, sz->dims[dim].b[k]) > 1)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int XM(is_local)(const dtensor *sz, block_kind k)
|
||||
{
|
||||
return XM(is_local_after)(0, sz, k);
|
||||
}
|
||||
|
||||
/* Return whether sz is distributed for k according to a simple
|
||||
1d block distribution in the first or second dimensions */
|
||||
int XM(is_block1d)(const dtensor *sz, block_kind k)
|
||||
{
|
||||
int i;
|
||||
if (!FINITE_RNK(sz->rnk)) return 0;
|
||||
for (i = 0; i < sz->rnk && num_blocks_kind(sz->dims + i, k) == 1; ++i) ;
|
||||
return(i < sz->rnk && i < 2 && XM(is_local_after)(i + 1, sz, k));
|
||||
|
||||
}
|
||||
83
fftw-3.3.10/mpi/choose-radix.c
Normal file
83
fftw-3.3.10/mpi/choose-radix.c
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
/* Return the radix r for a 1d MPI transform of a distributed dimension d,
|
||||
with the given flags and transform size. That is, decomposes d.n
|
||||
as r * m, Cooley-Tukey style. Also computes the block sizes rblock
|
||||
and mblock. Returns 0 if such a decomposition is not feasible.
|
||||
This is unfortunately somewhat complicated.
|
||||
|
||||
A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c):
|
||||
|
||||
d.n is initially distributed as an m x r array with block size mblock[IB].
|
||||
Then it is internally transposed to an r x m array with block size
|
||||
rblock[IB]. Then it is internally transposed to m x r again with block
|
||||
size mblock[OB]. Finally, it is transposed to r x m with block size
|
||||
rblock[IB].
|
||||
|
||||
If flags & SCRAMBLED_IN, then the first transpose is skipped (the array
|
||||
starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose
|
||||
is skipped (the array ends up as m x r). To make sure the forward
|
||||
and backward transforms use the same "scrambling" format, we swap r
|
||||
and m when sign != FFT_SIGN.
|
||||
|
||||
There are some downsides to this, especially in the case where
|
||||
either m or r is not divisible by n_pes. For one thing, it means
|
||||
that in general we can't use the same block size for the input and
|
||||
output. For another thing, it means that we can't in general honor
|
||||
a user's "requested" block sizes in d.b[]. Therefore, for simplicity,
|
||||
we simply ignore d.b[] for now.
|
||||
*/
|
||||
INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
|
||||
INT rblock[2], INT mblock[2])
|
||||
{
|
||||
INT r, m;
|
||||
|
||||
UNUSED(flags); /* we would need this if we paid attention to d.b[*] */
|
||||
|
||||
/* If n_pes is a factor of d.n, then choose r to be d.n / n_pes.
|
||||
This not only ensures that the input (the m dimension) is
|
||||
equally distributed if possible, and at the r dimension is
|
||||
maximally equally distributed (if d.n/n_pes >= n_pes), it also
|
||||
makes one of the local transpositions in the algorithm
|
||||
trivial. */
|
||||
if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/
|
||||
&& d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */)
|
||||
r = d.n / n_pes;
|
||||
else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */
|
||||
for (r = X(isqrt)(d.n); d.n % r != 0; ++r)
|
||||
;
|
||||
}
|
||||
if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */
|
||||
|
||||
if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */
|
||||
m = r;
|
||||
r = d.n / m;
|
||||
}
|
||||
else
|
||||
m = d.n / r;
|
||||
|
||||
rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes);
|
||||
mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes);
|
||||
|
||||
return r;
|
||||
}
|
||||
50
fftw-3.3.10/mpi/conf.c
Normal file
50
fftw-3.3.10/mpi/conf.c
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "mpi-transpose.h"
|
||||
#include "mpi-dft.h"
|
||||
#include "mpi-rdft.h"
|
||||
#include "mpi-rdft2.h"
|
||||
|
||||
static const solvtab s =
|
||||
{
|
||||
SOLVTAB(XM(transpose_pairwise_register)),
|
||||
SOLVTAB(XM(transpose_alltoall_register)),
|
||||
SOLVTAB(XM(transpose_recurse_register)),
|
||||
SOLVTAB(XM(dft_rank_geq2_register)),
|
||||
SOLVTAB(XM(dft_rank_geq2_transposed_register)),
|
||||
SOLVTAB(XM(dft_serial_register)),
|
||||
SOLVTAB(XM(dft_rank1_bigvec_register)),
|
||||
SOLVTAB(XM(dft_rank1_register)),
|
||||
SOLVTAB(XM(rdft_rank_geq2_register)),
|
||||
SOLVTAB(XM(rdft_rank_geq2_transposed_register)),
|
||||
SOLVTAB(XM(rdft_serial_register)),
|
||||
SOLVTAB(XM(rdft_rank1_bigvec_register)),
|
||||
SOLVTAB(XM(rdft2_rank_geq2_register)),
|
||||
SOLVTAB(XM(rdft2_rank_geq2_transposed_register)),
|
||||
SOLVTAB(XM(rdft2_serial_register)),
|
||||
SOLVTAB_END
|
||||
};
|
||||
|
||||
void XM(conf_standard)(planner *p)
|
||||
{
|
||||
X(solvtab_exec)(s, p);
|
||||
}
|
||||
136
fftw-3.3.10/mpi/dft-problem.c
Normal file
136
fftw-3.3.10/mpi/dft-problem.c
Normal file
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mpi-dft.h"
|
||||
|
||||
static void destroy(problem *ego_)
|
||||
{
|
||||
problem_mpi_dft *ego = (problem_mpi_dft *) ego_;
|
||||
XM(dtensor_destroy)(ego->sz);
|
||||
MPI_Comm_free(&ego->comm);
|
||||
X(ifree)(ego_);
|
||||
}
|
||||
|
||||
static void hash(const problem *p_, md5 *m)
|
||||
{
|
||||
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
|
||||
int i;
|
||||
X(md5puts)(m, "mpi-dft");
|
||||
X(md5int)(m, p->I == p->O);
|
||||
/* don't include alignment -- may differ between processes
|
||||
X(md5int)(m, X(ialignment_of)(p->I));
|
||||
X(md5int)(m, X(ialignment_of)(p->O));
|
||||
... note that applicability of MPI plans does not depend
|
||||
on alignment (although optimality may, in principle). */
|
||||
XM(dtensor_md5)(m, p->sz);
|
||||
X(md5INT)(m, p->vn);
|
||||
X(md5int)(m, p->sign);
|
||||
X(md5int)(m, p->flags);
|
||||
MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
|
||||
A(XM(md5_equal)(*m, p->comm));
|
||||
}
|
||||
|
||||
static void print(const problem *ego_, printer *p)
|
||||
{
|
||||
const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
|
||||
int i;
|
||||
p->print(p, "(mpi-dft %d %d %d ",
|
||||
ego->I == ego->O,
|
||||
X(ialignment_of)(ego->I),
|
||||
X(ialignment_of)(ego->O));
|
||||
XM(dtensor_print)(ego->sz, p);
|
||||
p->print(p, " %D %d %d", ego->vn, ego->sign, ego->flags);
|
||||
MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
|
||||
}
|
||||
|
||||
static void zero(const problem *ego_)
|
||||
{
|
||||
const problem_mpi_dft *ego = (const problem_mpi_dft *) ego_;
|
||||
R *I = ego->I;
|
||||
INT i, N;
|
||||
int my_pe;
|
||||
|
||||
MPI_Comm_rank(ego->comm, &my_pe);
|
||||
N = 2 * ego->vn * XM(total_block)(ego->sz, IB, my_pe);
|
||||
for (i = 0; i < N; ++i) I[i] = K(0.0);
|
||||
}
|
||||
|
||||
static const problem_adt padt =
|
||||
{
|
||||
PROBLEM_MPI_DFT,
|
||||
hash,
|
||||
zero,
|
||||
print,
|
||||
destroy
|
||||
};
|
||||
|
||||
problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
|
||||
R *I, R *O,
|
||||
MPI_Comm comm,
|
||||
int sign,
|
||||
unsigned flags)
|
||||
{
|
||||
problem_mpi_dft *ego =
|
||||
(problem_mpi_dft *)X(mkproblem)(sizeof(problem_mpi_dft), &padt);
|
||||
int n_pes;
|
||||
|
||||
A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
A(n_pes >= XM(num_blocks_total)(sz, IB)
|
||||
&& n_pes >= XM(num_blocks_total)(sz, OB));
|
||||
A(vn >= 0);
|
||||
A(sign == -1 || sign == 1);
|
||||
|
||||
/* enforce pointer equality if untainted pointers are equal */
|
||||
if (UNTAINT(I) == UNTAINT(O))
|
||||
I = O = JOIN_TAINT(I, O);
|
||||
|
||||
ego->sz = XM(dtensor_canonical)(sz, 1);
|
||||
ego->vn = vn;
|
||||
ego->I = I;
|
||||
ego->O = O;
|
||||
ego->sign = sign;
|
||||
|
||||
/* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
|
||||
swapping the first two dimensions (for rnk > 1) */
|
||||
if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
|
||||
ddim dim0 = ego->sz->dims[0];
|
||||
ego->sz->dims[0] = ego->sz->dims[1];
|
||||
ego->sz->dims[1] = dim0;
|
||||
flags &= ~TRANSPOSED_IN;
|
||||
flags ^= TRANSPOSED_OUT;
|
||||
}
|
||||
ego->flags = flags;
|
||||
|
||||
MPI_Comm_dup(comm, &ego->comm);
|
||||
|
||||
return &(ego->super);
|
||||
}
|
||||
|
||||
problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
|
||||
R *I, R *O,
|
||||
MPI_Comm comm,
|
||||
int sign,
|
||||
unsigned flags)
|
||||
{
|
||||
problem *p = XM(mkproblem_dft)(sz, vn, I, O, comm, sign, flags);
|
||||
XM(dtensor_destroy)(sz);
|
||||
return p;
|
||||
}
|
||||
221
fftw-3.3.10/mpi/dft-rank-geq2-transposed.c
Normal file
221
fftw-3.3.10/mpi/dft-rank-geq2-transposed.c
Normal file
@@ -0,0 +1,221 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Complex DFTs of rank >= 2, for the case where we are distributed
|
||||
across the first dimension only, and the output is transposed both
|
||||
in data distribution and in ordering (for the first 2 dimensions).
|
||||
|
||||
(Note that we don't have to handle the case where the input is
|
||||
transposed, since this is equivalent to transposed output with the
|
||||
first two dimensions swapped, and is automatically canonicalized as
|
||||
such by dft-problem.c. */
|
||||
|
||||
#include "mpi-dft.h"
|
||||
#include "mpi-transpose.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_dft super;
|
||||
|
||||
plan *cld1, *cldt, *cld2;
|
||||
INT roff, ioff;
|
||||
int preserve_input;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_dft *cld1, *cld2;
|
||||
plan_rdft *cldt;
|
||||
INT roff = ego->roff, ioff = ego->ioff;
|
||||
|
||||
/* DFT local dimensions */
|
||||
cld1 = (plan_dft *) ego->cld1;
|
||||
if (ego->preserve_input) {
|
||||
cld1->apply(ego->cld1, I+roff, I+ioff, O+roff, O+ioff);
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
cld1->apply(ego->cld1, I+roff, I+ioff, I+roff, I+ioff);
|
||||
|
||||
/* global transpose */
|
||||
cldt = (plan_rdft *) ego->cldt;
|
||||
cldt->apply(ego->cldt, I, O);
|
||||
|
||||
/* DFT final local dimension */
|
||||
cld2 = (plan_dft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, O+roff, O+ioff, O+roff, O+ioff);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
|
||||
return (1
|
||||
&& p->sz->rnk > 1
|
||||
&& p->flags == TRANSPOSED_OUT
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
&& XM(is_local_after)(1, p->sz, IB)
|
||||
&& XM(is_local_after)(2, p->sz, OB)
|
||||
&& XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1
|
||||
&& (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
|
||||
|| !XM(dft_serial_applicable)(p))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cldt, wakefulness);
|
||||
X(plan_awake)(ego->cld2, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld2);
|
||||
X(plan_destroy_internal)(ego->cldt);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-dft-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))",
|
||||
ego->preserve_input==2 ?"/p":"",
|
||||
ego->cld1, ego->cldt, ego->cld2);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_dft *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cldt = 0, *cld2 = 0;
|
||||
R *ri, *ii, *ro, *io, *I, *O;
|
||||
tensor *sz;
|
||||
int i, my_pe, n_pes;
|
||||
INT nrest;
|
||||
static const plan_adt padt = {
|
||||
XM(dft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_dft *) p_;
|
||||
|
||||
X(extract_reim)(p->sign, I = p->I, &ri, &ii);
|
||||
X(extract_reim)(p->sign, O = p->O, &ro, &io);
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
|
||||
I = O;
|
||||
else {
|
||||
ro = ri;
|
||||
io = ii;
|
||||
}
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
|
||||
i = p->sz->rnk - 2; A(i >= 0);
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
|
||||
for (--i; i >= 0; --i) {
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
|
||||
}
|
||||
nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
|
||||
{
|
||||
INT is = sz->dims[0].n * sz->dims[0].is;
|
||||
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
|
||||
cld1 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(sz,
|
||||
X(mktensor_2d)(b, is, is,
|
||||
p->vn, 2, 2),
|
||||
ri, ii, ro, io));
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
nrest *= p->vn;
|
||||
cldt = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
p->sz->dims[0].n, p->sz->dims[1].n, nrest * 2,
|
||||
I, O,
|
||||
p->sz->dims[0].b[IB], p->sz->dims[1].b[OB],
|
||||
p->comm, 0));
|
||||
if (XM(any_true)(!cldt, p->comm)) goto nada;
|
||||
|
||||
X(extract_reim)(p->sign, O, &ro, &io);
|
||||
{
|
||||
INT is = p->sz->dims[0].n * nrest * 2;
|
||||
INT b = XM(block)(p->sz->dims[1].n, p->sz->dims[1].b[OB], my_pe);
|
||||
cld2 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(X(mktensor_1d)(
|
||||
p->sz->dims[0].n,
|
||||
nrest * 2, nrest * 2),
|
||||
X(mktensor_2d)(b, is, is,
|
||||
nrest, 2, 2),
|
||||
ro, io, ro, io));
|
||||
if (XM(any_true)(!cld2, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
pln = MKPLAN_MPI_DFT(P, &padt, apply);
|
||||
pln->cld1 = cld1;
|
||||
pln->cldt = cldt;
|
||||
pln->cld2 = cld2;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
pln->roff = ri - p->I;
|
||||
pln->ioff = ii - p->I;
|
||||
|
||||
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
|
||||
X(ops_add2)(&cldt->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cldt);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(dft_rank_geq2_transposed_register)(planner *p)
|
||||
{
|
||||
int preserve_input;
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input));
|
||||
}
|
||||
188
fftw-3.3.10/mpi/dft-rank-geq2.c
Normal file
188
fftw-3.3.10/mpi/dft-rank-geq2.c
Normal file
@@ -0,0 +1,188 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Complex DFTs of rank >= 2, for the case where we are distributed
|
||||
across the first dimension only, and the output is not transposed. */
|
||||
|
||||
#include "mpi-dft.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_dft super;
|
||||
|
||||
plan *cld1, *cld2;
|
||||
INT roff, ioff;
|
||||
int preserve_input;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_dft *cld1;
|
||||
plan_rdft *cld2;
|
||||
INT roff = ego->roff, ioff = ego->ioff;
|
||||
|
||||
/* DFT local dimensions */
|
||||
cld1 = (plan_dft *) ego->cld1;
|
||||
if (ego->preserve_input) {
|
||||
cld1->apply(ego->cld1, I+roff, I+ioff, O+roff, O+ioff);
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
cld1->apply(ego->cld1, I+roff, I+ioff, I+roff, I+ioff);
|
||||
|
||||
/* DFT non-local dimension (via dft-rank1-bigvec, usually): */
|
||||
cld2 = (plan_rdft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, I, O);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
|
||||
return (1
|
||||
&& p->sz->rnk > 1
|
||||
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
&& XM(is_local_after)(1, p->sz, IB)
|
||||
&& XM(is_local_after)(1, p->sz, OB)
|
||||
&& (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
|
||||
|| !XM(dft_serial_applicable)(p))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cld2, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld2);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-dft-rank-geq2%s%(%p%)%(%p%))",
|
||||
ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_dft *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cld2 = 0;
|
||||
R *ri, *ii, *ro, *io, *I, *O;
|
||||
tensor *sz;
|
||||
dtensor *sz2;
|
||||
int i, my_pe, n_pes;
|
||||
INT nrest;
|
||||
static const plan_adt padt = {
|
||||
XM(dft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_dft *) p_;
|
||||
|
||||
X(extract_reim)(p->sign, I = p->I, &ri, &ii);
|
||||
X(extract_reim)(p->sign, O = p->O, &ro, &io);
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
|
||||
I = O;
|
||||
else {
|
||||
ro = ri;
|
||||
io = ii;
|
||||
}
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
|
||||
i = p->sz->rnk - 2; A(i >= 0);
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
|
||||
for (--i; i >= 0; --i) {
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
|
||||
}
|
||||
nrest = X(tensor_sz)(sz);
|
||||
{
|
||||
INT is = sz->dims[0].n * sz->dims[0].is;
|
||||
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
|
||||
cld1 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(sz,
|
||||
X(mktensor_2d)(b, is, is,
|
||||
p->vn, 2, 2),
|
||||
ri, ii, ro, io));
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
|
||||
sz2->dims[0] = p->sz->dims[0];
|
||||
cld2 = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz2, nrest * p->vn,
|
||||
I, O, p->comm, p->sign,
|
||||
RANK1_BIGVEC_ONLY));
|
||||
if (XM(any_true)(!cld2, p->comm)) goto nada;
|
||||
|
||||
pln = MKPLAN_MPI_DFT(P, &padt, apply);
|
||||
pln->cld1 = cld1;
|
||||
pln->cld2 = cld2;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
pln->roff = ri - p->I;
|
||||
pln->ioff = ii - p->I;
|
||||
|
||||
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(dft_rank_geq2_register)(planner *p)
|
||||
{
|
||||
int preserve_input;
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input));
|
||||
}
|
||||
211
fftw-3.3.10/mpi/dft-rank1-bigvec.c
Normal file
211
fftw-3.3.10/mpi/dft-rank1-bigvec.c
Normal file
@@ -0,0 +1,211 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Complex DFTs of rank == 1 when the vector length vn is >= # processes.
|
||||
In this case, we don't need to use a six-step type algorithm, and can
|
||||
instead transpose the DFT dimension with the vector dimension to
|
||||
make the DFT local. */
|
||||
|
||||
#include "mpi-dft.h"
|
||||
#include "mpi-transpose.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
rearrangement rearrange;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_dft super;
|
||||
|
||||
plan *cldt_before, *cld, *cldt_after;
|
||||
INT roff, ioff;
|
||||
int preserve_input;
|
||||
rearrangement rearrange;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_dft *cld;
|
||||
plan_rdft *cldt_before, *cldt_after;
|
||||
INT roff = ego->roff, ioff = ego->ioff;
|
||||
|
||||
/* global transpose */
|
||||
cldt_before = (plan_rdft *) ego->cldt_before;
|
||||
cldt_before->apply(ego->cldt_before, I, O);
|
||||
|
||||
if (ego->preserve_input) I = O;
|
||||
|
||||
/* 1d DFT(s) */
|
||||
cld = (plan_dft *) ego->cld;
|
||||
cld->apply(ego->cld, O+roff, O+ioff, I+roff, I+ioff);
|
||||
|
||||
/* global transpose */
|
||||
cldt_after = (plan_rdft *) ego->cldt_after;
|
||||
cldt_after->apply(ego->cldt_after, I, O);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
|
||||
int n_pes;
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
&& !(p->flags & ~RANK1_BIGVEC_ONLY)
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
&& (p->vn >= n_pes /* TODO: relax this, using more memory? */
|
||||
|| (p->flags & RANK1_BIGVEC_ONLY))
|
||||
|
||||
&& XM(rearrange_applicable)(ego->rearrange,
|
||||
p->sz->dims[0], p->vn, n_pes)
|
||||
|
||||
&& (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
|
||||
|| !XM(dft_serial_applicable)(p))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cldt_before, wakefulness);
|
||||
X(plan_awake)(ego->cld, wakefulness);
|
||||
X(plan_awake)(ego->cldt_after, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cldt_after);
|
||||
X(plan_destroy_internal)(ego->cld);
|
||||
X(plan_destroy_internal)(ego->cldt_before);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
const char descrip[][16] = { "contig", "discontig", "square-after",
|
||||
"square-middle", "square-before" };
|
||||
p->print(p, "(mpi-dft-rank1-bigvec/%s%s %(%p%) %(%p%) %(%p%))",
|
||||
descrip[ego->rearrange], ego->preserve_input==2 ?"/p":"",
|
||||
ego->cldt_before, ego->cld, ego->cldt_after);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_dft *p;
|
||||
P *pln;
|
||||
plan *cld = 0, *cldt_before = 0, *cldt_after = 0;
|
||||
R *ri, *ii, *ro, *io, *I, *O;
|
||||
INT yblock, yb, nx, ny, vn;
|
||||
int my_pe, n_pes;
|
||||
static const plan_adt padt = {
|
||||
XM(dft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_dft *) p_;
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
nx = p->sz->dims[0].n;
|
||||
if (!(ny = XM(rearrange_ny)(ego->rearrange, p->sz->dims[0],p->vn,n_pes)))
|
||||
return (plan *) 0;
|
||||
vn = p->vn / ny;
|
||||
A(ny * vn == p->vn);
|
||||
|
||||
yblock = XM(default_block)(ny, n_pes);
|
||||
cldt_before = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
nx, ny, vn*2,
|
||||
I = p->I, O = p->O,
|
||||
p->sz->dims[0].b[IB], yblock,
|
||||
p->comm, 0));
|
||||
if (XM(any_true)(!cldt_before, p->comm)) goto nada;
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) { I = O; }
|
||||
|
||||
X(extract_reim)(p->sign, I, &ri, &ii);
|
||||
X(extract_reim)(p->sign, O, &ro, &io);
|
||||
|
||||
yb = XM(block)(ny, yblock, my_pe);
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(X(mktensor_1d)(nx, vn*2, vn*2),
|
||||
X(mktensor_2d)(yb, vn*2*nx, vn*2*nx,
|
||||
vn, 2, 2),
|
||||
ro, io, ri, ii));
|
||||
if (XM(any_true)(!cld, p->comm)) goto nada;
|
||||
|
||||
cldt_after = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
ny, nx, vn*2,
|
||||
I, O,
|
||||
yblock, p->sz->dims[0].b[OB],
|
||||
p->comm, 0));
|
||||
if (XM(any_true)(!cldt_after, p->comm)) goto nada;
|
||||
|
||||
pln = MKPLAN_MPI_DFT(P, &padt, apply);
|
||||
|
||||
pln->cldt_before = cldt_before;
|
||||
pln->cld = cld;
|
||||
pln->cldt_after = cldt_after;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
pln->roff = ro - p->O;
|
||||
pln->ioff = io - p->O;
|
||||
pln->rearrange = ego->rearrange;
|
||||
|
||||
X(ops_add)(&cldt_before->ops, &cld->ops, &pln->super.super.ops);
|
||||
X(ops_add2)(&cldt_after->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cldt_after);
|
||||
X(plan_destroy_internal)(cld);
|
||||
X(plan_destroy_internal)(cldt_before);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(rearrangement rearrange, int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->rearrange = rearrange;
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(dft_rank1_bigvec_register)(planner *p)
|
||||
{
|
||||
rearrangement rearrange;
|
||||
int preserve_input;
|
||||
FORALL_REARRANGE(rearrange)
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(rearrange, preserve_input));
|
||||
}
|
||||
352
fftw-3.3.10/mpi/dft-rank1.c
Normal file
352
fftw-3.3.10/mpi/dft-rank1.c
Normal file
@@ -0,0 +1,352 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Complex DFTs of rank == 1 via six-step algorithm. */
|
||||
|
||||
#include "mpi-dft.h"
|
||||
#include "mpi-transpose.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
rdftapply apply; /* apply_ddft_first or apply_ddft_last */
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_dft super;
|
||||
|
||||
triggen *t;
|
||||
plan *cldt, *cld_ddft, *cld_dft;
|
||||
INT roff, ioff;
|
||||
int preserve_input;
|
||||
INT vn, xmin, xmax, xs, m, r;
|
||||
} P;
|
||||
|
||||
static void do_twiddle(triggen *t, INT ir, INT m, INT vn, R *xr, R *xi)
|
||||
{
|
||||
void (*rotate)(triggen *, INT, R, R, R *) = t->rotate;
|
||||
INT im, iv;
|
||||
for (im = 0; im < m; ++im)
|
||||
for (iv = 0; iv < vn; ++iv) {
|
||||
/* TODO: modify/inline rotate function
|
||||
so that it can do whole vn vector at once? */
|
||||
R c[2];
|
||||
rotate(t, ir * im, *xr, *xi, c);
|
||||
*xr = c[0]; *xi = c[1];
|
||||
xr += 2; xi += 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* radix-r DFT of size r*m. This is equivalent to an m x r 2d DFT,
|
||||
plus twiddle factors between the size-m and size-r 1d DFTs, where
|
||||
the m dimension is initially distributed. The output is transposed
|
||||
to r x m where the r dimension is distributed.
|
||||
|
||||
This algorithm follows the general sequence:
|
||||
global transpose (m x r -> r x m)
|
||||
DFTs of size m
|
||||
multiply by twiddles + global transpose (r x m -> m x r)
|
||||
DFTs of size r
|
||||
global transpose (m x r -> r x m)
|
||||
where the multiplication by twiddles can come before or after
|
||||
the middle transpose. The first/last transposes are omitted
|
||||
for SCRAMBLED_IN/OUT formats, respectively.
|
||||
|
||||
However, we wish to exploit our dft-rank1-bigvec solver, which
|
||||
solves a vector of distributed DFTs via transpose+dft+transpose.
|
||||
Therefore, we can group *either* the DFTs of size m *or* the
|
||||
DFTs of size r with their surrounding transposes as a single
|
||||
distributed-DFT (ddft) plan. These two variations correspond to
|
||||
apply_ddft_first or apply_ddft_last, respectively.
|
||||
*/
|
||||
|
||||
static void apply_ddft_first(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_dft *cld_dft;
|
||||
plan_rdft *cldt, *cld_ddft;
|
||||
INT roff, ioff, im, mmax, ms, r, vn;
|
||||
triggen *t;
|
||||
R *dI, *dO;
|
||||
|
||||
/* distributed size-m DFTs, with output in m x r format */
|
||||
cld_ddft = (plan_rdft *) ego->cld_ddft;
|
||||
cld_ddft->apply(ego->cld_ddft, I, O);
|
||||
|
||||
cldt = (plan_rdft *) ego->cldt;
|
||||
if (ego->preserve_input || !cldt) I = O;
|
||||
|
||||
/* twiddle multiplications, followed by 1d DFTs of size-r */
|
||||
cld_dft = (plan_dft *) ego->cld_dft;
|
||||
roff = ego->roff; ioff = ego->ioff;
|
||||
mmax = ego->xmax; ms = ego->xs;
|
||||
t = ego->t; r = ego->r; vn = ego->vn;
|
||||
dI = O; dO = I;
|
||||
for (im = ego->xmin; im <= mmax; ++im) {
|
||||
do_twiddle(t, im, r, vn, dI+roff, dI+ioff);
|
||||
cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
|
||||
dI += ms; dO += ms;
|
||||
}
|
||||
|
||||
/* final global transpose (m x r -> r x m), if not SCRAMBLED_OUT */
|
||||
if (cldt)
|
||||
cldt->apply((plan *) cldt, I, O);
|
||||
}
|
||||
|
||||
static void apply_ddft_last(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_dft *cld_dft;
|
||||
plan_rdft *cldt, *cld_ddft;
|
||||
INT roff, ioff, ir, rmax, rs, m, vn;
|
||||
triggen *t;
|
||||
R *dI, *dO0, *dO;
|
||||
|
||||
/* initial global transpose (m x r -> r x m), if not SCRAMBLED_IN */
|
||||
cldt = (plan_rdft *) ego->cldt;
|
||||
if (cldt) {
|
||||
cldt->apply((plan *) cldt, I, O);
|
||||
dI = O;
|
||||
}
|
||||
else
|
||||
dI = I;
|
||||
if (ego->preserve_input) dO = O; else dO = I;
|
||||
dO0 = dO;
|
||||
|
||||
/* 1d DFTs of size m, followed by twiddle multiplications */
|
||||
cld_dft = (plan_dft *) ego->cld_dft;
|
||||
roff = ego->roff; ioff = ego->ioff;
|
||||
rmax = ego->xmax; rs = ego->xs;
|
||||
t = ego->t; m = ego->m; vn = ego->vn;
|
||||
for (ir = ego->xmin; ir <= rmax; ++ir) {
|
||||
cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
|
||||
do_twiddle(t, ir, m, vn, dO+roff, dO+ioff);
|
||||
dI += rs; dO += rs;
|
||||
}
|
||||
|
||||
/* distributed size-r DFTs, with output in r x m format */
|
||||
cld_ddft = (plan_rdft *) ego->cld_ddft;
|
||||
cld_ddft->apply(ego->cld_ddft, dO0, O);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr,
|
||||
INT *r, INT rblock[2], INT mblock[2])
|
||||
{
|
||||
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
|
||||
int n_pes;
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
|
||||
&& ONLY_SCRAMBLEDP(p->flags)
|
||||
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
|
||||
&& (!(p->flags & SCRAMBLED_IN) || ego->apply == apply_ddft_last)
|
||||
&& (!(p->flags & SCRAMBLED_OUT) || ego->apply == apply_ddft_first)
|
||||
|
||||
&& (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
|
||||
|| !XM(dft_serial_applicable)(p))
|
||||
|
||||
/* disallow if dft-rank1-bigvec is applicable since the
|
||||
data distribution may be slightly different (ugh!) */
|
||||
&& (p->vn < n_pes || p->flags)
|
||||
|
||||
&& (*r = XM(choose_radix)(p->sz->dims[0], n_pes,
|
||||
p->flags, p->sign,
|
||||
rblock, mblock))
|
||||
|
||||
/* ddft_first or last has substantial advantages in the
|
||||
bigvec transpositions for the common case where
|
||||
n_pes == n/r or r, respectively */
|
||||
&& (!NO_UGLYP(plnr)
|
||||
|| !(*r == n_pes && ego->apply == apply_ddft_first)
|
||||
|| !(p->sz->dims[0].n / *r == n_pes
|
||||
&& ego->apply == apply_ddft_last))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cldt, wakefulness);
|
||||
X(plan_awake)(ego->cld_dft, wakefulness);
|
||||
X(plan_awake)(ego->cld_ddft, wakefulness);
|
||||
|
||||
switch (wakefulness) {
|
||||
case SLEEPY:
|
||||
X(triggen_destroy)(ego->t); ego->t = 0;
|
||||
break;
|
||||
default:
|
||||
ego->t = X(mktriggen)(AWAKE_SQRTN_TABLE, ego->r * ego->m);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cldt);
|
||||
X(plan_destroy_internal)(ego->cld_dft);
|
||||
X(plan_destroy_internal)(ego->cld_ddft);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-dft-rank1/%D%s%s%(%p%)%(%p%)%(%p%))",
|
||||
ego->r,
|
||||
ego->super.apply == apply_ddft_first ? "/first" : "/last",
|
||||
ego->preserve_input==2 ?"/p":"",
|
||||
ego->cld_ddft, ego->cld_dft, ego->cldt);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_dft *p;
|
||||
P *pln;
|
||||
plan *cld_dft = 0, *cld_ddft = 0, *cldt = 0;
|
||||
R *ri, *ii, *ro, *io, *I, *O;
|
||||
INT r, rblock[2], m, mblock[2], rp, mp, mpblock[2], mpb;
|
||||
int my_pe, n_pes, preserve_input, ddft_first;
|
||||
dtensor *sz;
|
||||
static const plan_adt padt = {
|
||||
XM(dft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr, &r, rblock, mblock))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_dft *) p_;
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
m = p->sz->dims[0].n / r;
|
||||
|
||||
/* some hackery so that we can plan both ddft_first and ddft_last
|
||||
as if they were ddft_first */
|
||||
if ((ddft_first = (ego->apply == apply_ddft_first))) {
|
||||
rp = r; mp = m;
|
||||
mpblock[IB] = mblock[IB]; mpblock[OB] = mblock[OB];
|
||||
mpb = XM(block)(mp, mpblock[OB], my_pe);
|
||||
}
|
||||
else {
|
||||
rp = m; mp = r;
|
||||
mpblock[IB] = rblock[IB]; mpblock[OB] = rblock[OB];
|
||||
mpb = XM(block)(mp, mpblock[IB], my_pe);
|
||||
}
|
||||
|
||||
preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
|
||||
sz = XM(mkdtensor)(1);
|
||||
sz->dims[0].n = mp;
|
||||
sz->dims[0].b[IB] = mpblock[IB];
|
||||
sz->dims[0].b[OB] = mpblock[OB];
|
||||
I = (ddft_first || !preserve_input) ? p->I : p->O;
|
||||
O = p->O;
|
||||
cld_ddft = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz, rp * p->vn,
|
||||
I, O, p->comm, p->sign,
|
||||
RANK1_BIGVEC_ONLY));
|
||||
if (XM(any_true)(!cld_ddft, p->comm)) goto nada;
|
||||
|
||||
I = TAINT((ddft_first || !p->flags) ? p->O : p->I, rp * p->vn * 2);
|
||||
O = TAINT((preserve_input || (ddft_first && p->flags)) ? p->O : p->I,
|
||||
rp * p->vn * 2);
|
||||
X(extract_reim)(p->sign, I, &ri, &ii);
|
||||
X(extract_reim)(p->sign, O, &ro, &io);
|
||||
cld_dft = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(X(mktensor_1d)(rp, p->vn*2,p->vn*2),
|
||||
X(mktensor_1d)(p->vn, 2, 2),
|
||||
ri, ii, ro, io));
|
||||
if (XM(any_true)(!cld_dft, p->comm)) goto nada;
|
||||
|
||||
if (!p->flags) { /* !(SCRAMBLED_IN or SCRAMBLED_OUT) */
|
||||
I = (ddft_first && preserve_input) ? p->O : p->I;
|
||||
O = p->O;
|
||||
cldt = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
m, r, p->vn * 2,
|
||||
I, O,
|
||||
ddft_first ? mblock[OB] : mblock[IB],
|
||||
ddft_first ? rblock[OB] : rblock[IB],
|
||||
p->comm, 0));
|
||||
if (XM(any_true)(!cldt, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
pln = MKPLAN_MPI_DFT(P, &padt, ego->apply);
|
||||
|
||||
pln->cld_ddft = cld_ddft;
|
||||
pln->cld_dft = cld_dft;
|
||||
pln->cldt = cldt;
|
||||
pln->preserve_input = preserve_input;
|
||||
X(extract_reim)(p->sign, p->O, &ro, &io);
|
||||
pln->roff = ro - p->O;
|
||||
pln->ioff = io - p->O;
|
||||
pln->vn = p->vn;
|
||||
pln->m = m;
|
||||
pln->r = r;
|
||||
pln->xmin = (ddft_first ? mblock[OB] : rblock[IB]) * my_pe;
|
||||
pln->xmax = pln->xmin + mpb - 1;
|
||||
pln->xs = rp * p->vn * 2;
|
||||
pln->t = 0;
|
||||
|
||||
X(ops_add)(&cld_ddft->ops, &cld_dft->ops, &pln->super.super.ops);
|
||||
if (cldt) X(ops_add2)(&cldt->ops, &pln->super.super.ops);
|
||||
{
|
||||
double n0 = (1 + pln->xmax - pln->xmin) * (mp - 1) * pln->vn;
|
||||
pln->super.super.ops.mul += 8 * n0;
|
||||
pln->super.super.ops.add += 4 * n0;
|
||||
pln->super.super.ops.other += 8 * n0;
|
||||
}
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cldt);
|
||||
X(plan_destroy_internal)(cld_dft);
|
||||
X(plan_destroy_internal)(cld_ddft);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(rdftapply apply, int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->apply = apply;
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(dft_rank1_register)(planner *p)
|
||||
{
|
||||
rdftapply apply[] = { apply_ddft_first, apply_ddft_last };
|
||||
unsigned int iapply;
|
||||
int preserve_input;
|
||||
for (iapply = 0; iapply < sizeof(apply) / sizeof(apply[0]); ++iapply)
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(apply[iapply], preserve_input));
|
||||
}
|
||||
130
fftw-3.3.10/mpi/dft-serial.c
Normal file
130
fftw-3.3.10/mpi/dft-serial.c
Normal file
@@ -0,0 +1,130 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* "MPI" DFTs where all of the data is on one processor...just
|
||||
call through to serial API. */
|
||||
|
||||
#include "mpi-dft.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_dft super;
|
||||
plan *cld;
|
||||
INT roff, ioff;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_dft *cld;
|
||||
INT roff = ego->roff, ioff = ego->ioff;
|
||||
cld = (plan_dft *) ego->cld;
|
||||
cld->apply(ego->cld, I+roff, I+ioff, O+roff, O+ioff);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-dft-serial %(%p%))", ego->cld);
|
||||
}
|
||||
|
||||
int XM(dft_serial_applicable)(const problem_mpi_dft *p)
|
||||
{
|
||||
return (1
|
||||
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
|
||||
&& ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
|
||||
|| p->vn == 0));
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
|
||||
{
|
||||
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
|
||||
P *pln;
|
||||
plan *cld;
|
||||
int my_pe;
|
||||
R *ri, *ii, *ro, *io;
|
||||
static const plan_adt padt = {
|
||||
XM(dft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
/* check whether applicable: */
|
||||
if (!XM(dft_serial_applicable)(p))
|
||||
return (plan *) 0;
|
||||
|
||||
X(extract_reim)(p->sign, p->I, &ri, &ii);
|
||||
X(extract_reim)(p->sign, p->O, &ro, &io);
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
if (my_pe == 0 && p->vn > 0) {
|
||||
int i, rnk = p->sz->rnk;
|
||||
tensor *sz = X(mktensor)(p->sz->rnk);
|
||||
sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
|
||||
sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
|
||||
for (i = rnk - 1; i > 0; --i) {
|
||||
sz->dims[i - 1].is = sz->dims[i - 1].os =
|
||||
sz->dims[i].is * sz->dims[i].n;
|
||||
sz->dims[i - 1].n = p->sz->dims[i - 1].n;
|
||||
}
|
||||
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(sz,
|
||||
X(mktensor_1d)(p->vn, 2, 2),
|
||||
ri, ii, ro, io));
|
||||
}
|
||||
else { /* idle process: make nop plan */
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(X(mktensor_0d)(),
|
||||
X(mktensor_1d)(0,0,0),
|
||||
ri, ii, ro, io));
|
||||
}
|
||||
if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
|
||||
|
||||
pln = MKPLAN_MPI_DFT(P, &padt, apply);
|
||||
pln->cld = cld;
|
||||
pln->roff = ro - p->O;
|
||||
pln->ioff = io - p->O;
|
||||
X(ops_cpy)(&cld->ops, &pln->super.super.ops);
|
||||
return &(pln->super.super);
|
||||
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
|
||||
return MKSOLVER(solver, &sadt);
|
||||
}
|
||||
|
||||
void XM(dft_serial_register)(planner *p)
|
||||
{
|
||||
REGISTER_SOLVER(p, mksolver());
|
||||
}
|
||||
29
fftw-3.3.10/mpi/dft-solve.c
Normal file
29
fftw-3.3.10/mpi/dft-solve.c
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mpi-dft.h"
|
||||
|
||||
/* use the apply() operation for MPI_DFT problems */
|
||||
void XM(dft_solve)(const plan *ego_, const problem *p_)
|
||||
{
|
||||
const plan_mpi_dft *ego = (const plan_mpi_dft *) ego_;
|
||||
const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
|
||||
ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
|
||||
}
|
||||
146
fftw-3.3.10/mpi/dtensor.c
Normal file
146
fftw-3.3.10/mpi/dtensor.c
Normal file
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
dtensor *XM(mkdtensor)(int rnk)
|
||||
{
|
||||
dtensor *x;
|
||||
|
||||
A(rnk >= 0);
|
||||
|
||||
#if defined(STRUCT_HACK_KR)
|
||||
if (FINITE_RNK(rnk) && rnk > 1)
|
||||
x = (dtensor *)MALLOC(sizeof(dtensor) + (rnk - 1) * sizeof(ddim),
|
||||
TENSORS);
|
||||
else
|
||||
x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
|
||||
#elif defined(STRUCT_HACK_C99)
|
||||
if (FINITE_RNK(rnk))
|
||||
x = (dtensor *)MALLOC(sizeof(dtensor) + rnk * sizeof(ddim),
|
||||
TENSORS);
|
||||
else
|
||||
x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
|
||||
#else
|
||||
x = (dtensor *)MALLOC(sizeof(dtensor), TENSORS);
|
||||
if (FINITE_RNK(rnk) && rnk > 0)
|
||||
x->dims = (ddim *)MALLOC(sizeof(ddim) * rnk, TENSORS);
|
||||
else
|
||||
x->dims = 0;
|
||||
#endif
|
||||
|
||||
x->rnk = rnk;
|
||||
return x;
|
||||
}
|
||||
|
||||
void XM(dtensor_destroy)(dtensor *sz)
|
||||
{
|
||||
#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
|
||||
X(ifree0)(sz->dims);
|
||||
#endif
|
||||
X(ifree)(sz);
|
||||
}
|
||||
|
||||
void XM(dtensor_md5)(md5 *p, const dtensor *t)
|
||||
{
|
||||
int i;
|
||||
X(md5int)(p, t->rnk);
|
||||
if (FINITE_RNK(t->rnk)) {
|
||||
for (i = 0; i < t->rnk; ++i) {
|
||||
const ddim *q = t->dims + i;
|
||||
X(md5INT)(p, q->n);
|
||||
X(md5INT)(p, q->b[IB]);
|
||||
X(md5INT)(p, q->b[OB]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dtensor *XM(dtensor_copy)(const dtensor *sz)
|
||||
{
|
||||
dtensor *x = XM(mkdtensor)(sz->rnk);
|
||||
int i;
|
||||
if (FINITE_RNK(sz->rnk))
|
||||
for (i = 0; i < sz->rnk; ++i)
|
||||
x->dims[i] = sz->dims[i];
|
||||
return x;
|
||||
}
|
||||
|
||||
dtensor *XM(dtensor_canonical)(const dtensor *sz, int compress)
|
||||
{
|
||||
int i, rnk;
|
||||
dtensor *x;
|
||||
block_kind k;
|
||||
|
||||
if (!FINITE_RNK(sz->rnk))
|
||||
return XM(mkdtensor)(sz->rnk);
|
||||
for (i = rnk = 0; i < sz->rnk; ++i) {
|
||||
if (sz->dims[i].n <= 0)
|
||||
return XM(mkdtensor)(RNK_MINFTY);
|
||||
else if (!compress || sz->dims[i].n > 1)
|
||||
++rnk;
|
||||
}
|
||||
x = XM(mkdtensor)(rnk);
|
||||
for (i = rnk = 0; i < sz->rnk; ++i) {
|
||||
if (!compress || sz->dims[i].n > 1) {
|
||||
x->dims[rnk].n = sz->dims[i].n;
|
||||
FORALL_BLOCK_KIND(k) {
|
||||
if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]) == 1)
|
||||
x->dims[rnk].b[k] = sz->dims[i].n;
|
||||
else
|
||||
x->dims[rnk].b[k] = sz->dims[i].b[k];
|
||||
}
|
||||
++rnk;
|
||||
}
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
int XM(dtensor_validp)(const dtensor *sz)
|
||||
{
|
||||
int i;
|
||||
if (sz->rnk < 0) return 0;
|
||||
if (FINITE_RNK(sz->rnk))
|
||||
for (i = 0; i < sz->rnk; ++i)
|
||||
if (sz->dims[i].n < 0
|
||||
|| sz->dims[i].b[IB] <= 0
|
||||
|| sz->dims[i].b[OB] <= 0)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
void XM(dtensor_print)(const dtensor *t, printer *p)
|
||||
{
|
||||
if (FINITE_RNK(t->rnk)) {
|
||||
int i;
|
||||
int first = 1;
|
||||
p->print(p, "(");
|
||||
for (i = 0; i < t->rnk; ++i) {
|
||||
const ddim *d = t->dims + i;
|
||||
p->print(p, "%s(%D %D %D)",
|
||||
first ? "" : " ",
|
||||
d->n, d->b[IB], d->b[OB]);
|
||||
first = 0;
|
||||
}
|
||||
p->print(p, ")");
|
||||
} else {
|
||||
p->print(p, "rank-minfty");
|
||||
}
|
||||
|
||||
}
|
||||
284
fftw-3.3.10/mpi/f03-wrap.c
Normal file
284
fftw-3.3.10/mpi/f03-wrap.c
Normal file
@@ -0,0 +1,284 @@
|
||||
/* Generated automatically. DO NOT EDIT! */
|
||||
|
||||
#include "fftw3-mpi.h"
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_many_transposed_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_many_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_transposed_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_many_1d_f03)(ptrdiff_t n0, ptrdiff_t howmany, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_1d_f03)(ptrdiff_t n0, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_2d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start);
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_3d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start);
|
||||
FFTW_EXTERN X(plan) XM(plan_many_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, R * in, R * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_many_dft_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_1d_f03)(ptrdiff_t n0, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_many_r2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_r2r_f03)(int rnk, const ptrdiff_t * n, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_r2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_r2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_many_dft_r2c_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_r2c_f03)(int rnk, const ptrdiff_t * n, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_r2c_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_r2c_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_many_dft_c2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_c2r_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_c2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_c2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags);
|
||||
FFTW_EXTERN void XM(gather_wisdom_f03)(MPI_Fint f_comm_);
|
||||
FFTW_EXTERN void XM(broadcast_wisdom_f03)(MPI_Fint f_comm_);
|
||||
|
||||
ptrdiff_t XM(local_size_many_transposed_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_many_transposed)(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start,local_n1,local_1_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_many_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block0, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_many)(rnk,n,howmany,block0,comm,local_n0,local_0_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_transposed_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_transposed)(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_f03)(int rnk, const ptrdiff_t * n, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size)(rnk,n,comm,local_n0,local_0_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_many_1d_f03)(ptrdiff_t n0, ptrdiff_t howmany, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_many_1d)(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_1d_f03)(ptrdiff_t n0, MPI_Fint f_comm, int sign, unsigned flags, ptrdiff_t * local_ni, ptrdiff_t * local_i_start, ptrdiff_t * local_no, ptrdiff_t * local_o_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_1d)(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_2d)(n0,n1,comm,local_n0,local_0_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_2d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_2d_transposed)(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_3d)(n0,n1,n2,comm,local_n0,local_0_start);
|
||||
}
|
||||
|
||||
ptrdiff_t XM(local_size_3d_transposed_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Fint f_comm, ptrdiff_t * local_n0, ptrdiff_t * local_0_start, ptrdiff_t * local_n1, ptrdiff_t * local_1_start)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(local_size_3d_transposed)(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, R * in, R * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_many_transpose)(n0,n1,howmany,block0,block1,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_transpose_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_transpose)(n0,n1,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_dft_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t block, ptrdiff_t tblock, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_many_dft)(rnk,n,howmany,block,tblock,in,out,comm,sign,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft)(rnk,n,in,out,comm,sign,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_1d_f03)(ptrdiff_t n0, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_1d)(n0,in,out,comm,sign,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_2d)(n0,n1,in,out,comm,sign,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, X(complex) * out, MPI_Fint f_comm, int sign, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_3d)(n0,n1,n2,in,out,comm,sign,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_r2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_many_r2r)(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_r2r_f03)(int rnk, const ptrdiff_t * n, R * in, R * out, MPI_Fint f_comm, const X(r2r_kind) * kind, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_r2r)(rnk,n,in,out,comm,kind,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_r2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_r2r_2d)(n0,n1,in,out,comm,kind0,kind1,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_r2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, R * out, MPI_Fint f_comm, X(r2r_kind) kind0, X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_r2r_3d)(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_dft_r2c_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_many_dft_r2c)(rnk,n,howmany,iblock,oblock,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_r2c_f03)(int rnk, const ptrdiff_t * n, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_r2c)(rnk,n,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_r2c_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_r2c_2d)(n0,n1,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_r2c_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R * in, X(complex) * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_r2c_3d)(n0,n1,n2,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_many_dft_c2r_f03)(int rnk, const ptrdiff_t * n, ptrdiff_t howmany, ptrdiff_t iblock, ptrdiff_t oblock, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_many_dft_c2r)(rnk,n,howmany,iblock,oblock,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_c2r_f03)(int rnk, const ptrdiff_t * n, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_c2r)(rnk,n,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_c2r_2d_f03)(ptrdiff_t n0, ptrdiff_t n1, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_c2r_2d)(n0,n1,in,out,comm,flags);
|
||||
}
|
||||
|
||||
X(plan) XM(plan_dft_c2r_3d_f03)(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, X(complex) * in, R * out, MPI_Fint f_comm, unsigned flags)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
|
||||
comm = MPI_Comm_f2c(f_comm);
|
||||
return XM(plan_dft_c2r_3d)(n0,n1,n2,in,out,comm,flags);
|
||||
}
|
||||
|
||||
void XM(gather_wisdom_f03)(MPI_Fint f_comm_)
|
||||
{
|
||||
MPI_Comm comm_;
|
||||
|
||||
comm_ = MPI_Comm_f2c(f_comm_);
|
||||
XM(gather_wisdom)(comm_);
|
||||
}
|
||||
|
||||
void XM(broadcast_wisdom_f03)(MPI_Fint f_comm_)
|
||||
{
|
||||
MPI_Comm comm_;
|
||||
|
||||
comm_ = MPI_Comm_f2c(f_comm_);
|
||||
XM(broadcast_wisdom)(comm_);
|
||||
}
|
||||
22
fftw-3.3.10/mpi/f03-wrap.sh
Executable file
22
fftw-3.3.10/mpi/f03-wrap.sh
Executable file
@@ -0,0 +1,22 @@
|
||||
#! /bin/sh
|
||||
|
||||
# Script to generate Fortran 2003 wrappers for FFTW's MPI functions. This
|
||||
# is necessary because MPI provides no way to deal with C MPI_Comm handles
|
||||
# from Fortran (where MPI_Comm == integer), but does provide a way to
|
||||
# deal with Fortran MPI_Comm handles from C (via MPI_Comm_f2c). So,
|
||||
# every FFTW function that takes an MPI_Comm argument needs a wrapper
|
||||
# function that takes a Fortran integer and converts it to MPI_Comm.
|
||||
|
||||
echo "/* Generated automatically. DO NOT EDIT! */"
|
||||
echo
|
||||
|
||||
echo "#include \"fftw3-mpi.h\""
|
||||
echo "#include \"ifftw-mpi.h\""
|
||||
echo
|
||||
|
||||
# Declare prototypes using FFTW_EXTERN, important for Windows DLLs
|
||||
grep -v 'mpi.h' fftw3-mpi.h | gcc -E -I../api - |grep "fftw_mpi_init" |tr ';' '\n' | grep "MPI_Comm" | perl genf03-wrap.pl | grep "MPI_Fint" | sed 's/^/FFTW_EXTERN /;s/$/;/'
|
||||
|
||||
grep -v 'mpi.h' fftw3-mpi.h | gcc -E -I../api - |grep "fftw_mpi_init" |tr ';' '\n' | grep "MPI_Comm" | perl genf03-wrap.pl
|
||||
|
||||
|
||||
43
fftw-3.3.10/mpi/f03api.sh
Executable file
43
fftw-3.3.10/mpi/f03api.sh
Executable file
@@ -0,0 +1,43 @@
|
||||
#! /bin/sh
|
||||
|
||||
# Script to generate Fortran 2003 interface declarations for FFTW's MPI
|
||||
# interface from the fftw3-mpi.h header file.
|
||||
|
||||
# This is designed so that the Fortran caller can do:
|
||||
# use, intrinsic :: iso_c_binding
|
||||
# implicit none
|
||||
# include 'fftw3-mpi.f03'
|
||||
# and then call the C FFTW MPI functions directly, with type checking.
|
||||
#
|
||||
# One caveat: because there is no standard way to conver MPI_Comm objects
|
||||
# from Fortran (= integer) to C (= opaque type), the Fortran interface
|
||||
# technically calls C wrapper functions (also auto-generated) which
|
||||
# call MPI_Comm_f2c to convert the communicators as needed.
|
||||
|
||||
echo "! Generated automatically. DO NOT EDIT!"
|
||||
echo
|
||||
|
||||
echo " include 'fftw3.f03'"
|
||||
echo
|
||||
|
||||
# Extract constants
|
||||
perl -pe 's/#define +([A-Z0-9_]+) +\(([+-]?[0-9]+)U?\)/\n integer\(C_INTPTR_T\), parameter :: \1 = \2\n/g' < fftw3-mpi.h | grep 'integer(C_INTPTR_T)'
|
||||
perl -pe 'if (/#define +([A-Z0-9_]+) +\(([0-9]+)U? *<< *([0-9]+)\)/) { print "\n integer\(C_INT\), parameter :: $1 = ",$2 << $3,"\n"; }' < fftw3-mpi.h | grep 'integer(C_INT)'
|
||||
|
||||
# Extract function declarations
|
||||
for p in $*; do
|
||||
if test "$p" = "d"; then p=""; fi
|
||||
|
||||
echo
|
||||
cat <<EOF
|
||||
type, bind(C) :: fftw${p}_mpi_ddim
|
||||
integer(C_INTPTR_T) n, ib, ob
|
||||
end type fftw${p}_mpi_ddim
|
||||
EOF
|
||||
|
||||
echo
|
||||
echo " interface"
|
||||
grep -v 'mpi.h' fftw3-mpi.h | gcc -I../api -D__GNUC__=5 -D__i386__ -E - |grep "fftw${p}_mpi_init" |tr ';' '\n' | perl ../api/genf03.pl
|
||||
echo " end interface"
|
||||
|
||||
done
|
||||
810
fftw-3.3.10/mpi/fftw3-mpi.f03.in
Normal file
810
fftw-3.3.10/mpi/fftw3-mpi.f03.in
Normal file
@@ -0,0 +1,810 @@
|
||||
! Generated automatically. DO NOT EDIT!
|
||||
|
||||
include 'fftw3.f03'
|
||||
|
||||
integer(C_INTPTR_T), parameter :: FFTW_MPI_DEFAULT_BLOCK = 0
|
||||
integer(C_INT), parameter :: FFTW_MPI_SCRAMBLED_IN = 134217728
|
||||
integer(C_INT), parameter :: FFTW_MPI_SCRAMBLED_OUT = 268435456
|
||||
integer(C_INT), parameter :: FFTW_MPI_TRANSPOSED_IN = 536870912
|
||||
integer(C_INT), parameter :: FFTW_MPI_TRANSPOSED_OUT = 1073741824
|
||||
|
||||
type, bind(C) :: fftw_mpi_ddim
|
||||
integer(C_INTPTR_T) n, ib, ob
|
||||
end type fftw_mpi_ddim
|
||||
|
||||
interface
|
||||
subroutine fftw_mpi_init() bind(C, name='fftw_mpi_init')
|
||||
import
|
||||
end subroutine fftw_mpi_init
|
||||
|
||||
subroutine fftw_mpi_cleanup() bind(C, name='fftw_mpi_cleanup')
|
||||
import
|
||||
end subroutine fftw_mpi_cleanup
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
|
||||
local_n1,local_1_start) &
|
||||
bind(C, name='fftw_mpi_local_size_many_transposed_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_INTPTR_T), value :: block1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftw_mpi_local_size_many_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftw_mpi_local_size_many_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftw_mpi_local_size_many
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftw_mpi_local_size_transposed_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftw_mpi_local_size_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftw_mpi_local_size_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftw_mpi_local_size
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
|
||||
local_o_start) bind(C, name='fftw_mpi_local_size_many_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
integer(C_INTPTR_T), intent(out) :: local_ni
|
||||
integer(C_INTPTR_T), intent(out) :: local_i_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_no
|
||||
integer(C_INTPTR_T), intent(out) :: local_o_start
|
||||
end function fftw_mpi_local_size_many_1d
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
|
||||
bind(C, name='fftw_mpi_local_size_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
integer(C_INTPTR_T), intent(out) :: local_ni
|
||||
integer(C_INTPTR_T), intent(out) :: local_i_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_no
|
||||
integer(C_INTPTR_T), intent(out) :: local_o_start
|
||||
end function fftw_mpi_local_size_1d
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftw_mpi_local_size_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftw_mpi_local_size_2d
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftw_mpi_local_size_2d_transposed_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftw_mpi_local_size_2d_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftw_mpi_local_size_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftw_mpi_local_size_3d
|
||||
|
||||
integer(C_INTPTR_T) function fftw_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftw_mpi_local_size_3d_transposed_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftw_mpi_local_size_3d_transposed
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
|
||||
bind(C, name='fftw_mpi_plan_many_transpose_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_INTPTR_T), value :: block1
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_many_transpose
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_transpose_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_transpose
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
|
||||
bind(C, name='fftw_mpi_plan_many_dft_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block
|
||||
integer(C_INTPTR_T), value :: tblock
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_many_dft
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_1d
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_2d
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftw_mpi_plan_dft_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_3d
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
|
||||
bind(C, name='fftw_mpi_plan_many_r2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_many_r2r
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftw_mpi_plan_r2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_r2r
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftw_mpi_plan_r2r_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), value :: kind0
|
||||
integer(C_FFTW_R2R_KIND), value :: kind1
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_r2r_2d
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) bind(C, name='fftw_mpi_plan_r2r_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), value :: kind0
|
||||
integer(C_FFTW_R2R_KIND), value :: kind1
|
||||
integer(C_FFTW_R2R_KIND), value :: kind2
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_r2r_3d
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
|
||||
bind(C, name='fftw_mpi_plan_many_dft_r2c_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_many_dft_r2c
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_r2c
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_r2c_2d
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_r2c_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_r2c_3d
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
|
||||
bind(C, name='fftw_mpi_plan_many_dft_c2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_many_dft_c2r
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_c2r
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_c2r_2d
|
||||
|
||||
type(C_PTR) function fftw_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftw_mpi_plan_dft_c2r_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftw_mpi_plan_dft_c2r_3d
|
||||
|
||||
subroutine fftw_mpi_gather_wisdom(comm_) bind(C, name='fftw_mpi_gather_wisdom_f03')
|
||||
import
|
||||
integer(C_MPI_FINT), value :: comm_
|
||||
end subroutine fftw_mpi_gather_wisdom
|
||||
|
||||
subroutine fftw_mpi_broadcast_wisdom(comm_) bind(C, name='fftw_mpi_broadcast_wisdom_f03')
|
||||
import
|
||||
integer(C_MPI_FINT), value :: comm_
|
||||
end subroutine fftw_mpi_broadcast_wisdom
|
||||
|
||||
subroutine fftw_mpi_execute_dft(p,in,out) bind(C, name='fftw_mpi_execute_dft')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
end subroutine fftw_mpi_execute_dft
|
||||
|
||||
subroutine fftw_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftw_mpi_execute_dft_r2c')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
real(C_DOUBLE), dimension(*), intent(inout) :: in
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
end subroutine fftw_mpi_execute_dft_r2c
|
||||
|
||||
subroutine fftw_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftw_mpi_execute_dft_c2r')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
complex(C_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
end subroutine fftw_mpi_execute_dft_c2r
|
||||
|
||||
subroutine fftw_mpi_execute_r2r(p,in,out) bind(C, name='fftw_mpi_execute_r2r')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
real(C_DOUBLE), dimension(*), intent(inout) :: in
|
||||
real(C_DOUBLE), dimension(*), intent(out) :: out
|
||||
end subroutine fftw_mpi_execute_r2r
|
||||
|
||||
end interface
|
||||
|
||||
type, bind(C) :: fftwf_mpi_ddim
|
||||
integer(C_INTPTR_T) n, ib, ob
|
||||
end type fftwf_mpi_ddim
|
||||
|
||||
interface
|
||||
subroutine fftwf_mpi_init() bind(C, name='fftwf_mpi_init')
|
||||
import
|
||||
end subroutine fftwf_mpi_init
|
||||
|
||||
subroutine fftwf_mpi_cleanup() bind(C, name='fftwf_mpi_cleanup')
|
||||
import
|
||||
end subroutine fftwf_mpi_cleanup
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
|
||||
local_n1,local_1_start) &
|
||||
bind(C, name='fftwf_mpi_local_size_many_transposed_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_INTPTR_T), value :: block1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftwf_mpi_local_size_many_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftwf_mpi_local_size_many_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftwf_mpi_local_size_many
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftwf_mpi_local_size_transposed_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftwf_mpi_local_size_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftwf_mpi_local_size_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftwf_mpi_local_size
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
|
||||
local_o_start) bind(C, name='fftwf_mpi_local_size_many_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
integer(C_INTPTR_T), intent(out) :: local_ni
|
||||
integer(C_INTPTR_T), intent(out) :: local_i_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_no
|
||||
integer(C_INTPTR_T), intent(out) :: local_o_start
|
||||
end function fftwf_mpi_local_size_many_1d
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
|
||||
bind(C, name='fftwf_mpi_local_size_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
integer(C_INTPTR_T), intent(out) :: local_ni
|
||||
integer(C_INTPTR_T), intent(out) :: local_i_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_no
|
||||
integer(C_INTPTR_T), intent(out) :: local_o_start
|
||||
end function fftwf_mpi_local_size_1d
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftwf_mpi_local_size_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftwf_mpi_local_size_2d
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftwf_mpi_local_size_2d_transposed_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftwf_mpi_local_size_2d_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftwf_mpi_local_size_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftwf_mpi_local_size_3d
|
||||
|
||||
integer(C_INTPTR_T) function fftwf_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftwf_mpi_local_size_3d_transposed_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftwf_mpi_local_size_3d_transposed
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
|
||||
bind(C, name='fftwf_mpi_plan_many_transpose_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_INTPTR_T), value :: block1
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_many_transpose
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_transpose_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_transpose
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
|
||||
bind(C, name='fftwf_mpi_plan_many_dft_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block
|
||||
integer(C_INTPTR_T), value :: tblock
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_many_dft
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_1d
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_2d
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftwf_mpi_plan_dft_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_3d
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
|
||||
bind(C, name='fftwf_mpi_plan_many_r2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_many_r2r
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftwf_mpi_plan_r2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_r2r
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftwf_mpi_plan_r2r_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), value :: kind0
|
||||
integer(C_FFTW_R2R_KIND), value :: kind1
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_r2r_2d
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) &
|
||||
bind(C, name='fftwf_mpi_plan_r2r_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), value :: kind0
|
||||
integer(C_FFTW_R2R_KIND), value :: kind1
|
||||
integer(C_FFTW_R2R_KIND), value :: kind2
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_r2r_3d
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
|
||||
bind(C, name='fftwf_mpi_plan_many_dft_r2c_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_many_dft_r2c
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_r2c
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_r2c_2d
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_r2c_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
real(C_FLOAT), dimension(*), intent(out) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_r2c_3d
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
|
||||
bind(C, name='fftwf_mpi_plan_many_dft_c2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_many_dft_c2r
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_c2r
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_c2r_2d
|
||||
|
||||
type(C_PTR) function fftwf_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwf_mpi_plan_dft_c2r_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwf_mpi_plan_dft_c2r_3d
|
||||
|
||||
subroutine fftwf_mpi_gather_wisdom(comm_) bind(C, name='fftwf_mpi_gather_wisdom_f03')
|
||||
import
|
||||
integer(C_MPI_FINT), value :: comm_
|
||||
end subroutine fftwf_mpi_gather_wisdom
|
||||
|
||||
subroutine fftwf_mpi_broadcast_wisdom(comm_) bind(C, name='fftwf_mpi_broadcast_wisdom_f03')
|
||||
import
|
||||
integer(C_MPI_FINT), value :: comm_
|
||||
end subroutine fftwf_mpi_broadcast_wisdom
|
||||
|
||||
subroutine fftwf_mpi_execute_dft(p,in,out) bind(C, name='fftwf_mpi_execute_dft')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
end subroutine fftwf_mpi_execute_dft
|
||||
|
||||
subroutine fftwf_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftwf_mpi_execute_dft_r2c')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
real(C_FLOAT), dimension(*), intent(inout) :: in
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(out) :: out
|
||||
end subroutine fftwf_mpi_execute_dft_r2c
|
||||
|
||||
subroutine fftwf_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftwf_mpi_execute_dft_c2r')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
complex(C_FLOAT_COMPLEX), dimension(*), intent(inout) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
end subroutine fftwf_mpi_execute_dft_c2r
|
||||
|
||||
subroutine fftwf_mpi_execute_r2r(p,in,out) bind(C, name='fftwf_mpi_execute_r2r')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
real(C_FLOAT), dimension(*), intent(inout) :: in
|
||||
real(C_FLOAT), dimension(*), intent(out) :: out
|
||||
end subroutine fftwf_mpi_execute_r2r
|
||||
|
||||
end interface
|
||||
221
fftw-3.3.10/mpi/fftw3-mpi.h
Normal file
221
fftw-3.3.10/mpi/fftw3-mpi.h
Normal file
@@ -0,0 +1,221 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* The following statement of license applies *only* to this header file,
|
||||
* and *not* to the other files distributed with FFTW or derived therefrom:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/***************************** NOTE TO USERS *********************************
|
||||
*
|
||||
* THIS IS A HEADER FILE, NOT A MANUAL
|
||||
*
|
||||
* If you want to know how to use FFTW, please read the manual,
|
||||
* online at http://www.fftw.org/doc/ and also included with FFTW.
|
||||
* For a quick start, see the manual's tutorial section.
|
||||
*
|
||||
* (Reading header files to learn how to use a library is a habit
|
||||
* stemming from code lacking a proper manual. Arguably, it's a
|
||||
* *bad* habit in most cases, because header files can contain
|
||||
* interfaces that are not part of the public, stable API.)
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef FFTW3_MPI_H
|
||||
#define FFTW3_MPI_H
|
||||
|
||||
#include <fftw3.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif /* __cplusplus */
|
||||
|
||||
struct fftw_mpi_ddim_do_not_use_me {
|
||||
ptrdiff_t n; /* dimension size */
|
||||
ptrdiff_t ib; /* input block */
|
||||
ptrdiff_t ob; /* output block */
|
||||
};
|
||||
|
||||
/*
|
||||
huge second-order macro that defines prototypes for all API
|
||||
functions. We expand this macro for each supported precision
|
||||
|
||||
XM: name-mangling macro (MPI)
|
||||
X: name-mangling macro (serial)
|
||||
R: real data type
|
||||
C: complex data type
|
||||
*/
|
||||
|
||||
#define FFTW_MPI_DEFINE_API(XM, X, R, C) \
|
||||
\
|
||||
typedef struct fftw_mpi_ddim_do_not_use_me XM(ddim); \
|
||||
\
|
||||
FFTW_EXTERN void XM(init)(void); \
|
||||
FFTW_EXTERN void XM(cleanup)(void); \
|
||||
\
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_many_transposed) \
|
||||
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
|
||||
ptrdiff_t block0, ptrdiff_t block1, MPI_Comm comm, \
|
||||
ptrdiff_t *local_n0, ptrdiff_t *local_0_start, \
|
||||
ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_many) \
|
||||
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
|
||||
ptrdiff_t block0, MPI_Comm comm, \
|
||||
ptrdiff_t *local_n0, ptrdiff_t *local_0_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_transposed) \
|
||||
(int rnk, const ptrdiff_t *n, MPI_Comm comm, \
|
||||
ptrdiff_t *local_n0, ptrdiff_t *local_0_start, \
|
||||
ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size) \
|
||||
(int rnk, const ptrdiff_t *n, MPI_Comm comm, \
|
||||
ptrdiff_t *local_n0, ptrdiff_t *local_0_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_many_1d)( \
|
||||
ptrdiff_t n0, ptrdiff_t howmany, \
|
||||
MPI_Comm comm, int sign, unsigned flags, \
|
||||
ptrdiff_t *local_ni, ptrdiff_t *local_i_start, \
|
||||
ptrdiff_t *local_no, ptrdiff_t *local_o_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_1d)( \
|
||||
ptrdiff_t n0, MPI_Comm comm, int sign, unsigned flags, \
|
||||
ptrdiff_t *local_ni, ptrdiff_t *local_i_start, \
|
||||
ptrdiff_t *local_no, ptrdiff_t *local_o_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_2d)( \
|
||||
ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm, \
|
||||
ptrdiff_t *local_n0, ptrdiff_t *local_0_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_2d_transposed)( \
|
||||
ptrdiff_t n0, ptrdiff_t n1, MPI_Comm comm, \
|
||||
ptrdiff_t *local_n0, ptrdiff_t *local_0_start, \
|
||||
ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_3d)( \
|
||||
ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm, \
|
||||
ptrdiff_t *local_n0, ptrdiff_t *local_0_start); \
|
||||
FFTW_EXTERN ptrdiff_t XM(local_size_3d_transposed)( \
|
||||
ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, MPI_Comm comm, \
|
||||
ptrdiff_t *local_n0, ptrdiff_t *local_0_start, \
|
||||
ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
|
||||
\
|
||||
FFTW_EXTERN X(plan) XM(plan_many_transpose) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, \
|
||||
ptrdiff_t howmany, ptrdiff_t block0, ptrdiff_t block1, \
|
||||
R *in, R *out, MPI_Comm comm, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_transpose) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, \
|
||||
R *in, R *out, MPI_Comm comm, unsigned flags); \
|
||||
\
|
||||
FFTW_EXTERN X(plan) XM(plan_many_dft) \
|
||||
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
|
||||
ptrdiff_t block, ptrdiff_t tblock, C *in, C *out, \
|
||||
MPI_Comm comm, int sign, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft) \
|
||||
(int rnk, const ptrdiff_t *n, C *in, C *out, \
|
||||
MPI_Comm comm, int sign, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_1d) \
|
||||
(ptrdiff_t n0, C *in, C *out, \
|
||||
MPI_Comm comm, int sign, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_2d) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, C *in, C *out, \
|
||||
MPI_Comm comm, int sign, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_3d) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, C *in, C *out, \
|
||||
MPI_Comm comm, int sign, unsigned flags); \
|
||||
\
|
||||
FFTW_EXTERN X(plan) XM(plan_many_r2r) \
|
||||
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
|
||||
ptrdiff_t iblock, ptrdiff_t oblock, R *in, R *out, \
|
||||
MPI_Comm comm, const X(r2r_kind) *kind, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_r2r) \
|
||||
(int rnk, const ptrdiff_t *n, R *in, R *out, \
|
||||
MPI_Comm comm, const X(r2r_kind) *kind, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_r2r_2d) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, R *in, R *out, MPI_Comm comm, \
|
||||
X(r2r_kind) kind0, X(r2r_kind) kind1, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_r2r_3d) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, \
|
||||
R *in, R *out, MPI_Comm comm, X(r2r_kind) kind0, \
|
||||
X(r2r_kind) kind1, X(r2r_kind) kind2, unsigned flags); \
|
||||
\
|
||||
FFTW_EXTERN X(plan) XM(plan_many_dft_r2c) \
|
||||
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
|
||||
ptrdiff_t iblock, ptrdiff_t oblock, R *in, C *out, \
|
||||
MPI_Comm comm, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_r2c) \
|
||||
(int rnk, const ptrdiff_t *n, R *in, C *out, \
|
||||
MPI_Comm comm, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_r2c_2d) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, R *in, C *out, \
|
||||
MPI_Comm comm, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_r2c_3d) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, R *in, C *out, \
|
||||
MPI_Comm comm, unsigned flags); \
|
||||
\
|
||||
FFTW_EXTERN X(plan) XM(plan_many_dft_c2r) \
|
||||
(int rnk, const ptrdiff_t *n, ptrdiff_t howmany, \
|
||||
ptrdiff_t iblock, ptrdiff_t oblock, C *in, R *out, \
|
||||
MPI_Comm comm, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_c2r) \
|
||||
(int rnk, const ptrdiff_t *n, C *in, R *out, \
|
||||
MPI_Comm comm, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_c2r_2d) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, C *in, R *out, \
|
||||
MPI_Comm comm, unsigned flags); \
|
||||
FFTW_EXTERN X(plan) XM(plan_dft_c2r_3d) \
|
||||
(ptrdiff_t n0, ptrdiff_t n1, ptrdiff_t n2, C *in, R *out, \
|
||||
MPI_Comm comm, unsigned flags); \
|
||||
\
|
||||
FFTW_EXTERN void XM(gather_wisdom)(MPI_Comm comm_); \
|
||||
FFTW_EXTERN void XM(broadcast_wisdom)(MPI_Comm comm_); \
|
||||
\
|
||||
FFTW_EXTERN void XM(execute_dft)(X(plan) p, C *in, C *out); \
|
||||
FFTW_EXTERN void XM(execute_dft_r2c)(X(plan) p, R *in, C *out); \
|
||||
FFTW_EXTERN void XM(execute_dft_c2r)(X(plan) p, C *in, R *out); \
|
||||
FFTW_EXTERN void XM(execute_r2r)(X(plan) p, R *in, R *out);
|
||||
|
||||
|
||||
|
||||
/* end of FFTW_MPI_DEFINE_API macro */
|
||||
|
||||
#define FFTW_MPI_MANGLE_DOUBLE(name) FFTW_MANGLE_DOUBLE(FFTW_CONCAT(mpi_,name))
|
||||
#define FFTW_MPI_MANGLE_FLOAT(name) FFTW_MANGLE_FLOAT(FFTW_CONCAT(mpi_,name))
|
||||
#define FFTW_MPI_MANGLE_LONG_DOUBLE(name) FFTW_MANGLE_LONG_DOUBLE(FFTW_CONCAT(mpi_,name))
|
||||
|
||||
FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_DOUBLE, FFTW_MANGLE_DOUBLE, double, fftw_complex)
|
||||
FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_FLOAT, FFTW_MANGLE_FLOAT, float, fftwf_complex)
|
||||
FFTW_MPI_DEFINE_API(FFTW_MPI_MANGLE_LONG_DOUBLE, FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
|
||||
|
||||
#define FFTW_MPI_DEFAULT_BLOCK (0)
|
||||
|
||||
/* MPI-specific flags */
|
||||
#define FFTW_MPI_SCRAMBLED_IN (1U << 27)
|
||||
#define FFTW_MPI_SCRAMBLED_OUT (1U << 28)
|
||||
#define FFTW_MPI_TRANSPOSED_IN (1U << 29)
|
||||
#define FFTW_MPI_TRANSPOSED_OUT (1U << 30)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* FFTW3_MPI_H */
|
||||
405
fftw-3.3.10/mpi/fftw3l-mpi.f03.in
Normal file
405
fftw-3.3.10/mpi/fftw3l-mpi.f03.in
Normal file
@@ -0,0 +1,405 @@
|
||||
! Generated automatically. DO NOT EDIT!
|
||||
|
||||
include 'fftw3l.f03'
|
||||
|
||||
|
||||
type, bind(C) :: fftwl_mpi_ddim
|
||||
integer(C_INTPTR_T) n, ib, ob
|
||||
end type fftwl_mpi_ddim
|
||||
|
||||
interface
|
||||
subroutine fftwl_mpi_init() bind(C, name='fftwl_mpi_init')
|
||||
import
|
||||
end subroutine fftwl_mpi_init
|
||||
|
||||
subroutine fftwl_mpi_cleanup() bind(C, name='fftwl_mpi_cleanup')
|
||||
import
|
||||
end subroutine fftwl_mpi_cleanup
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_many_transposed(rnk,n,howmany,block0,block1,comm,local_n0,local_0_start, &
|
||||
local_n1,local_1_start) &
|
||||
bind(C, name='fftwl_mpi_local_size_many_transposed_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_INTPTR_T), value :: block1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftwl_mpi_local_size_many_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_many(rnk,n,howmany,block0,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftwl_mpi_local_size_many_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftwl_mpi_local_size_many
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_transposed(rnk,n,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftwl_mpi_local_size_transposed_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftwl_mpi_local_size_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size(rnk,n,comm,local_n0,local_0_start) bind(C, name='fftwl_mpi_local_size_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftwl_mpi_local_size
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_many_1d(n0,howmany,comm,sign,flags,local_ni,local_i_start,local_no, &
|
||||
local_o_start) bind(C, name='fftwl_mpi_local_size_many_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
integer(C_INTPTR_T), intent(out) :: local_ni
|
||||
integer(C_INTPTR_T), intent(out) :: local_i_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_no
|
||||
integer(C_INTPTR_T), intent(out) :: local_o_start
|
||||
end function fftwl_mpi_local_size_many_1d
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_1d(n0,comm,sign,flags,local_ni,local_i_start,local_no,local_o_start) &
|
||||
bind(C, name='fftwl_mpi_local_size_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
integer(C_INTPTR_T), intent(out) :: local_ni
|
||||
integer(C_INTPTR_T), intent(out) :: local_i_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_no
|
||||
integer(C_INTPTR_T), intent(out) :: local_o_start
|
||||
end function fftwl_mpi_local_size_1d
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_2d(n0,n1,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftwl_mpi_local_size_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftwl_mpi_local_size_2d
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_2d_transposed(n0,n1,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftwl_mpi_local_size_2d_transposed_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftwl_mpi_local_size_2d_transposed
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_3d(n0,n1,n2,comm,local_n0,local_0_start) &
|
||||
bind(C, name='fftwl_mpi_local_size_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
end function fftwl_mpi_local_size_3d
|
||||
|
||||
integer(C_INTPTR_T) function fftwl_mpi_local_size_3d_transposed(n0,n1,n2,comm,local_n0,local_0_start,local_n1,local_1_start) &
|
||||
bind(C, name='fftwl_mpi_local_size_3d_transposed_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INTPTR_T), intent(out) :: local_n0
|
||||
integer(C_INTPTR_T), intent(out) :: local_0_start
|
||||
integer(C_INTPTR_T), intent(out) :: local_n1
|
||||
integer(C_INTPTR_T), intent(out) :: local_1_start
|
||||
end function fftwl_mpi_local_size_3d_transposed
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_many_transpose(n0,n1,howmany,block0,block1,in,out,comm,flags) &
|
||||
bind(C, name='fftwl_mpi_plan_many_transpose_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block0
|
||||
integer(C_INTPTR_T), value :: block1
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_many_transpose
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_transpose(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_transpose_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_transpose
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_many_dft(rnk,n,howmany,block,tblock,in,out,comm,sign,flags) &
|
||||
bind(C, name='fftwl_mpi_plan_many_dft_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: block
|
||||
integer(C_INTPTR_T), value :: tblock
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_many_dft
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft(rnk,n,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_1d(n0,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_1d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_1d
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_2d(n0,n1,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_2d
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_3d(n0,n1,n2,in,out,comm,sign,flags) bind(C, name='fftwl_mpi_plan_dft_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: sign
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_3d
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_many_r2r(rnk,n,howmany,iblock,oblock,in,out,comm,kind,flags) &
|
||||
bind(C, name='fftwl_mpi_plan_many_r2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_many_r2r
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_r2r(rnk,n,in,out,comm,kind,flags) bind(C, name='fftwl_mpi_plan_r2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), dimension(*), intent(in) :: kind
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_r2r
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_r2r_2d(n0,n1,in,out,comm,kind0,kind1,flags) bind(C, name='fftwl_mpi_plan_r2r_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), value :: kind0
|
||||
integer(C_FFTW_R2R_KIND), value :: kind1
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_r2r_2d
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_r2r_3d(n0,n1,n2,in,out,comm,kind0,kind1,kind2,flags) &
|
||||
bind(C, name='fftwl_mpi_plan_r2r_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_FFTW_R2R_KIND), value :: kind0
|
||||
integer(C_FFTW_R2R_KIND), value :: kind1
|
||||
integer(C_FFTW_R2R_KIND), value :: kind2
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_r2r_3d
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_many_dft_r2c(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
|
||||
bind(C, name='fftwl_mpi_plan_many_dft_r2c_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_many_dft_r2c
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_r2c(rnk,n,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_r2c
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_r2c_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_r2c_2d
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_r2c_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_r2c_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_r2c_3d
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_many_dft_c2r(rnk,n,howmany,iblock,oblock,in,out,comm,flags) &
|
||||
bind(C, name='fftwl_mpi_plan_many_dft_c2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
integer(C_INTPTR_T), value :: howmany
|
||||
integer(C_INTPTR_T), value :: iblock
|
||||
integer(C_INTPTR_T), value :: oblock
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_many_dft_c2r
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_c2r(rnk,n,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_f03')
|
||||
import
|
||||
integer(C_INT), value :: rnk
|
||||
integer(C_INTPTR_T), dimension(*), intent(in) :: n
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_c2r
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_c2r_2d(n0,n1,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_2d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_c2r_2d
|
||||
|
||||
type(C_PTR) function fftwl_mpi_plan_dft_c2r_3d(n0,n1,n2,in,out,comm,flags) bind(C, name='fftwl_mpi_plan_dft_c2r_3d_f03')
|
||||
import
|
||||
integer(C_INTPTR_T), value :: n0
|
||||
integer(C_INTPTR_T), value :: n1
|
||||
integer(C_INTPTR_T), value :: n2
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
integer(C_MPI_FINT), value :: comm
|
||||
integer(C_INT), value :: flags
|
||||
end function fftwl_mpi_plan_dft_c2r_3d
|
||||
|
||||
subroutine fftwl_mpi_gather_wisdom(comm_) bind(C, name='fftwl_mpi_gather_wisdom_f03')
|
||||
import
|
||||
integer(C_MPI_FINT), value :: comm_
|
||||
end subroutine fftwl_mpi_gather_wisdom
|
||||
|
||||
subroutine fftwl_mpi_broadcast_wisdom(comm_) bind(C, name='fftwl_mpi_broadcast_wisdom_f03')
|
||||
import
|
||||
integer(C_MPI_FINT), value :: comm_
|
||||
end subroutine fftwl_mpi_broadcast_wisdom
|
||||
|
||||
subroutine fftwl_mpi_execute_dft(p,in,out) bind(C, name='fftwl_mpi_execute_dft')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
end subroutine fftwl_mpi_execute_dft
|
||||
|
||||
subroutine fftwl_mpi_execute_dft_r2c(p,in,out) bind(C, name='fftwl_mpi_execute_dft_r2c')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(out) :: out
|
||||
end subroutine fftwl_mpi_execute_dft_r2c
|
||||
|
||||
subroutine fftwl_mpi_execute_dft_c2r(p,in,out) bind(C, name='fftwl_mpi_execute_dft_c2r')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
complex(C_LONG_DOUBLE_COMPLEX), dimension(*), intent(inout) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
end subroutine fftwl_mpi_execute_dft_c2r
|
||||
|
||||
subroutine fftwl_mpi_execute_r2r(p,in,out) bind(C, name='fftwl_mpi_execute_r2r')
|
||||
import
|
||||
type(C_PTR), value :: p
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(inout) :: in
|
||||
real(C_LONG_DOUBLE), dimension(*), intent(out) :: out
|
||||
end subroutine fftwl_mpi_execute_r2r
|
||||
|
||||
end interface
|
||||
78
fftw-3.3.10/mpi/genf03-wrap.pl
Executable file
78
fftw-3.3.10/mpi/genf03-wrap.pl
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/perl -w
|
||||
# Generate Fortran 2003 wrappers (which translate MPI_Comm from f2c) from
|
||||
# function declarations of the form (one per line):
|
||||
# extern <type> fftw_mpi_<name>(...args...)
|
||||
# extern <type> fftw_mpi_<name>(...args...)
|
||||
# ...
|
||||
# with no line breaks within a given function. (It's too much work to
|
||||
# write a general parser, since we just have to handle FFTW's header files.)
|
||||
# Each declaration has at least one MPI_Comm argument.
|
||||
|
||||
sub canonicalize_type {
|
||||
my($type);
|
||||
($type) = @_;
|
||||
$type =~ s/ +/ /g;
|
||||
$type =~ s/^ //;
|
||||
$type =~ s/ $//;
|
||||
$type =~ s/([^\* ])\*/$1 \*/g;
|
||||
$type =~ s/double/R/;
|
||||
$type =~ s/fftw_([A-Za-z0-9_]+)/X(\1)/;
|
||||
return $type;
|
||||
}
|
||||
|
||||
while (<>) {
|
||||
next if /^ *$/;
|
||||
if (/^ *extern +([a-zA-Z_0-9 ]+[ \*]) *fftw_mpi_([a-zA-Z_0-9]+) *\((.*)\) *$/) {
|
||||
$ret = &canonicalize_type($1);
|
||||
$name = $2;
|
||||
|
||||
$args = $3;
|
||||
|
||||
|
||||
print "\n$ret XM(${name}_f03)(";
|
||||
|
||||
$comma = "";
|
||||
foreach $arg (split(/ *, */, $args)) {
|
||||
$arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
|
||||
$argtype = &canonicalize_type($1);
|
||||
$argname = $2;
|
||||
print $comma;
|
||||
if ($argtype eq "MPI_Comm") {
|
||||
print "MPI_Fint f_$argname";
|
||||
}
|
||||
else {
|
||||
print "$argtype $argname";
|
||||
}
|
||||
$comma = ", ";
|
||||
}
|
||||
print ")\n{\n";
|
||||
|
||||
print " MPI_Comm ";
|
||||
$comma = "";
|
||||
foreach $arg (split(/ *, */, $args)) {
|
||||
$arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
|
||||
$argtype = &canonicalize_type($1);
|
||||
$argname = $2;
|
||||
if ($argtype eq "MPI_Comm") {
|
||||
print "$comma$argname";
|
||||
$comma = ", ";
|
||||
}
|
||||
}
|
||||
print ";\n\n";
|
||||
|
||||
foreach $arg (split(/ *, */, $args)) {
|
||||
$arg =~ /^([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) *$/;
|
||||
$argtype = &canonicalize_type($1);
|
||||
$argname = $2;
|
||||
if ($argtype eq "MPI_Comm") {
|
||||
print " $argname = MPI_Comm_f2c(f_$argname);\n";
|
||||
}
|
||||
}
|
||||
|
||||
$argnames = $args;
|
||||
$argnames =~ s/([a-zA-Z_0-9 ]+[ \*]) *([a-zA-Z_0-9]+) */$2/g;
|
||||
print " ";
|
||||
print "return " if ($ret ne "void");
|
||||
print "XM($name)($argnames);\n}\n";
|
||||
}
|
||||
}
|
||||
151
fftw-3.3.10/mpi/ifftw-mpi.h
Normal file
151
fftw-3.3.10/mpi/ifftw-mpi.h
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* FFTW-MPI internal header file */
|
||||
#ifndef __IFFTW_MPI_H__
|
||||
#define __IFFTW_MPI_H__
|
||||
|
||||
#include "kernel/ifftw.h"
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
/* mpi problem flags: problem-dependent meaning, but in general
|
||||
SCRAMBLED means some reordering *within* the dimensions, while
|
||||
TRANSPOSED means some reordering *of* the dimensions */
|
||||
#define SCRAMBLED_IN (1 << 0)
|
||||
#define SCRAMBLED_OUT (1 << 1)
|
||||
#define TRANSPOSED_IN (1 << 2)
|
||||
#define TRANSPOSED_OUT (1 << 3)
|
||||
#define RANK1_BIGVEC_ONLY (1 << 4) /* for rank=1, allow only bigvec solver */
|
||||
|
||||
#define ONLY_SCRAMBLEDP(flags) (!((flags) & ~(SCRAMBLED_IN|SCRAMBLED_OUT)))
|
||||
#define ONLY_TRANSPOSEDP(flags) (!((flags) & ~(TRANSPOSED_IN|TRANSPOSED_OUT)))
|
||||
|
||||
#if defined(FFTW_SINGLE)
|
||||
# define FFTW_MPI_TYPE MPI_FLOAT
|
||||
#elif defined(FFTW_LDOUBLE)
|
||||
# define FFTW_MPI_TYPE MPI_LONG_DOUBLE
|
||||
#elif defined(FFTW_QUAD)
|
||||
# error MPI quad-precision type is unknown
|
||||
#else
|
||||
# define FFTW_MPI_TYPE MPI_DOUBLE
|
||||
#endif
|
||||
|
||||
/* all fftw-mpi identifiers start with fftw_mpi (or fftwf_mpi etc.) */
|
||||
#define XM(name) X(CONCAT(mpi_, name))
|
||||
|
||||
/***********************************************************************/
|
||||
/* block distributions */
|
||||
|
||||
/* a distributed dimension of length n with input and output block
|
||||
sizes ib and ob, respectively. */
|
||||
typedef enum { IB = 0, OB } block_kind;
|
||||
typedef struct {
|
||||
INT n;
|
||||
INT b[2]; /* b[IB], b[OB] */
|
||||
} ddim;
|
||||
|
||||
/* Loop over k in {IB, OB}. Note: need explicit casts for C++. */
|
||||
#define FORALL_BLOCK_KIND(k) for (k = IB; k <= OB; k = (block_kind) (((int) k) + 1))
|
||||
|
||||
/* unlike tensors in the serial FFTW, the ordering of the dtensor
|
||||
dimensions matters - both the array and the block layout are
|
||||
row-major order. */
|
||||
typedef struct {
|
||||
int rnk;
|
||||
#if defined(STRUCT_HACK_KR)
|
||||
ddim dims[1];
|
||||
#elif defined(STRUCT_HACK_C99)
|
||||
ddim dims[];
|
||||
#else
|
||||
ddim *dims;
|
||||
#endif
|
||||
} dtensor;
|
||||
|
||||
|
||||
/* dtensor.c: */
|
||||
dtensor *XM(mkdtensor)(int rnk);
|
||||
void XM(dtensor_destroy)(dtensor *sz);
|
||||
dtensor *XM(dtensor_copy)(const dtensor *sz);
|
||||
dtensor *XM(dtensor_canonical)(const dtensor *sz, int compress);
|
||||
int XM(dtensor_validp)(const dtensor *sz);
|
||||
void XM(dtensor_md5)(md5 *p, const dtensor *t);
|
||||
void XM(dtensor_print)(const dtensor *t, printer *p);
|
||||
|
||||
/* block.c: */
|
||||
|
||||
/* for a single distributed dimension: */
|
||||
INT XM(num_blocks)(INT n, INT block);
|
||||
int XM(num_blocks_ok)(INT n, INT block, MPI_Comm comm);
|
||||
INT XM(default_block)(INT n, int n_pes);
|
||||
INT XM(block)(INT n, INT block, int which_block);
|
||||
|
||||
/* for multiple distributed dimensions: */
|
||||
INT XM(num_blocks_total)(const dtensor *sz, block_kind k);
|
||||
int XM(idle_process)(const dtensor *sz, block_kind k, int which_pe);
|
||||
void XM(block_coords)(const dtensor *sz, block_kind k, int which_pe,
|
||||
INT *coords);
|
||||
INT XM(total_block)(const dtensor *sz, block_kind k, int which_pe);
|
||||
int XM(is_local_after)(int dim, const dtensor *sz, block_kind k);
|
||||
int XM(is_local)(const dtensor *sz, block_kind k);
|
||||
int XM(is_block1d)(const dtensor *sz, block_kind k);
|
||||
|
||||
/* choose-radix.c */
|
||||
INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
|
||||
INT rblock[2], INT mblock[2]);
|
||||
|
||||
/***********************************************************************/
|
||||
/* any_true.c */
|
||||
int XM(any_true)(int condition, MPI_Comm comm);
|
||||
int XM(md5_equal)(md5 m, MPI_Comm comm);
|
||||
|
||||
/* conf.c */
|
||||
void XM(conf_standard)(planner *p);
|
||||
|
||||
/***********************************************************************/
|
||||
/* rearrange.c */
|
||||
|
||||
/* Different ways to rearrange the vector dimension vn during transposition,
|
||||
reflecting different tradeoffs between ease of transposition and
|
||||
contiguity during the subsequent DFTs.
|
||||
|
||||
TODO: can we pare this down to CONTIG and DISCONTIG, at least
|
||||
in MEASURE mode? SQUARE_MIDDLE is also used for 1d destroy-input DFTs. */
|
||||
typedef enum {
|
||||
CONTIG = 0, /* vn x 1: make subsequent DFTs contiguous */
|
||||
DISCONTIG, /* P x (vn/P) for P processes */
|
||||
SQUARE_BEFORE, /* try to get square transpose at beginning */
|
||||
SQUARE_MIDDLE, /* try to get square transpose in the middle */
|
||||
SQUARE_AFTER /* try to get square transpose at end */
|
||||
} rearrangement;
|
||||
|
||||
/* skipping SQUARE_AFTER since it doesn't seem to offer any advantage
|
||||
over SQUARE_BEFORE */
|
||||
#define FORALL_REARRANGE(rearrange) for (rearrange = CONTIG; rearrange <= SQUARE_MIDDLE; rearrange = (rearrangement) (((int) rearrange) + 1))
|
||||
|
||||
int XM(rearrange_applicable)(rearrangement rearrange,
|
||||
ddim dim0, INT vn, int n_pes);
|
||||
INT XM(rearrange_ny)(rearrangement rearrange, ddim dim0, INT vn, int n_pes);
|
||||
|
||||
/***********************************************************************/
|
||||
|
||||
#endif /* __IFFTW_MPI_H__ */
|
||||
|
||||
844
fftw-3.3.10/mpi/mpi-bench.c
Normal file
844
fftw-3.3.10/mpi/mpi-bench.c
Normal file
@@ -0,0 +1,844 @@
|
||||
/**************************************************************************/
|
||||
/* NOTE to users: this is the FFTW-MPI self-test and benchmark program.
|
||||
It is probably NOT a good place to learn FFTW usage, since it has a
|
||||
lot of added complexity in order to exercise and test the full API,
|
||||
etcetera. We suggest reading the manual. */
|
||||
/**************************************************************************/
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "fftw3-mpi.h"
|
||||
#include "tests/fftw-bench.h"
|
||||
|
||||
#if defined(BENCHFFT_SINGLE)
|
||||
# define BENCH_MPI_TYPE MPI_FLOAT
|
||||
#elif defined(BENCHFFT_LDOUBLE)
|
||||
# define BENCH_MPI_TYPE MPI_LONG_DOUBLE
|
||||
#elif defined(BENCHFFT_QUAD)
|
||||
# error MPI quad-precision type is unknown
|
||||
#else
|
||||
# define BENCH_MPI_TYPE MPI_DOUBLE
|
||||
#endif
|
||||
|
||||
#if SIZEOF_PTRDIFF_T == SIZEOF_INT
|
||||
# define FFTW_MPI_PTRDIFF_T MPI_INT
|
||||
#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG
|
||||
# define FFTW_MPI_PTRDIFF_T MPI_LONG
|
||||
#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG
|
||||
# define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG
|
||||
#else
|
||||
# error MPI type for ptrdiff_t is unknown
|
||||
# define FFTW_MPI_PTRDIFF_T MPI_LONG
|
||||
#endif
|
||||
|
||||
static const char *mkversion(void) { return FFTW(version); }
|
||||
static const char *mkcc(void) { return FFTW(cc); }
|
||||
static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
|
||||
static const char *mknproc(void) {
|
||||
static char buf[32];
|
||||
int ncpus;
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
|
||||
#ifdef HAVE_SNPRINTF
|
||||
snprintf(buf, 32, "%d", ncpus);
|
||||
#else
|
||||
sprintf(buf, "%d", ncpus);
|
||||
#endif
|
||||
return buf;
|
||||
}
|
||||
|
||||
BEGIN_BENCH_DOC
|
||||
BENCH_DOC("name", "fftw3_mpi")
|
||||
BENCH_DOCF("version", mkversion)
|
||||
BENCH_DOCF("cc", mkcc)
|
||||
BENCH_DOCF("codelet-optim", mkcodelet_optim)
|
||||
BENCH_DOCF("nproc", mknproc)
|
||||
END_BENCH_DOC
|
||||
|
||||
static int n_pes = 1, my_pe = 0;
|
||||
|
||||
/* global variables describing the shape of the data and its distribution */
|
||||
static int rnk;
|
||||
static ptrdiff_t vn, iNtot, oNtot;
|
||||
static ptrdiff_t *local_ni=0, *local_starti=0;
|
||||
static ptrdiff_t *local_no=0, *local_starto=0;
|
||||
static ptrdiff_t *all_local_ni=0, *all_local_starti=0; /* n_pes x rnk arrays */
|
||||
static ptrdiff_t *all_local_no=0, *all_local_starto=0; /* n_pes x rnk arrays */
|
||||
static ptrdiff_t *istrides = 0, *ostrides = 0;
|
||||
static ptrdiff_t *total_ni=0, *total_no=0;
|
||||
static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */
|
||||
static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */
|
||||
|
||||
static bench_real *local_in = 0, *local_out = 0;
|
||||
static bench_real *all_local_in = 0, *all_local_out = 0;
|
||||
static int all_local_in_alloc = 0, all_local_out_alloc = 0;
|
||||
static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0;
|
||||
|
||||
static void alloc_rnk(int rnk_) {
|
||||
rnk = rnk_;
|
||||
bench_free(local_ni);
|
||||
if (rnk == 0)
|
||||
local_ni = 0;
|
||||
else
|
||||
local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk
|
||||
* (8 + n_pes * 4));
|
||||
|
||||
local_starti = local_ni + rnk;
|
||||
local_no = local_ni + 2 * rnk;
|
||||
local_starto = local_ni + 3 * rnk;
|
||||
istrides = local_ni + 4 * rnk;
|
||||
ostrides = local_ni + 5 * rnk;
|
||||
total_ni = local_ni + 6 * rnk;
|
||||
total_no = local_ni + 7 * rnk;
|
||||
all_local_ni = local_ni + 8 * rnk;
|
||||
all_local_starti = local_ni + (8 + n_pes) * rnk;
|
||||
all_local_no = local_ni + (8 + 2 * n_pes) * rnk;
|
||||
all_local_starto = local_ni + (8 + 3 * n_pes) * rnk;
|
||||
}
|
||||
|
||||
static void setup_gather_scatter(void)
|
||||
{
|
||||
int i, j;
|
||||
ptrdiff_t off;
|
||||
|
||||
MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T,
|
||||
all_local_ni, rnk, FFTW_MPI_PTRDIFF_T,
|
||||
0, MPI_COMM_WORLD);
|
||||
MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
|
||||
MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T,
|
||||
all_local_starti, rnk, FFTW_MPI_PTRDIFF_T,
|
||||
0, MPI_COMM_WORLD);
|
||||
MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T,
|
||||
all_local_no, rnk, FFTW_MPI_PTRDIFF_T,
|
||||
0, MPI_COMM_WORLD);
|
||||
MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
|
||||
MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T,
|
||||
all_local_starto, rnk, FFTW_MPI_PTRDIFF_T,
|
||||
0, MPI_COMM_WORLD);
|
||||
MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
|
||||
|
||||
off = 0;
|
||||
for (i = 0; i < n_pes; ++i) {
|
||||
ptrdiff_t N = vn;
|
||||
for (j = 0; j < rnk; ++j)
|
||||
N *= all_local_ni[i * rnk + j];
|
||||
isend_cnt[i] = N;
|
||||
isend_off[i] = off;
|
||||
off += N;
|
||||
}
|
||||
iNtot = off;
|
||||
all_local_in_alloc = 1;
|
||||
|
||||
istrides[rnk - 1] = vn;
|
||||
for (j = rnk - 2; j >= 0; --j)
|
||||
istrides[j] = total_ni[j + 1] * istrides[j + 1];
|
||||
|
||||
off = 0;
|
||||
for (i = 0; i < n_pes; ++i) {
|
||||
ptrdiff_t N = vn;
|
||||
for (j = 0; j < rnk; ++j)
|
||||
N *= all_local_no[i * rnk + j];
|
||||
orecv_cnt[i] = N;
|
||||
orecv_off[i] = off;
|
||||
off += N;
|
||||
}
|
||||
oNtot = off;
|
||||
all_local_out_alloc = 1;
|
||||
|
||||
ostrides[rnk - 1] = vn;
|
||||
for (j = rnk - 2; j >= 0; --j)
|
||||
ostrides[j] = total_no[j + 1] * ostrides[j + 1];
|
||||
}
|
||||
|
||||
static void copy_block_out(const bench_real *in,
|
||||
int rnk, ptrdiff_t *n, ptrdiff_t *start,
|
||||
ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
|
||||
bench_real *out)
|
||||
{
|
||||
ptrdiff_t i;
|
||||
if (rnk == 0) {
|
||||
for (i = 0; i < vn; ++i)
|
||||
out[i] = in[i];
|
||||
}
|
||||
else if (rnk == 1) { /* this case is just an optimization */
|
||||
ptrdiff_t j;
|
||||
out += start[0] * os[0];
|
||||
for (j = 0; j < n[0]; ++j) {
|
||||
for (i = 0; i < vn; ++i)
|
||||
out[i] = in[i];
|
||||
in += is;
|
||||
out += os[0];
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* we should do n[0] for locality, but this way is simpler to code */
|
||||
for (i = 0; i < n[rnk - 1]; ++i)
|
||||
copy_block_out(in + i * is,
|
||||
rnk - 1, n, start, is * n[rnk - 1], os, vn,
|
||||
out + (start[rnk - 1] + i) * os[rnk - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
static void copy_block_in(bench_real *in,
|
||||
int rnk, ptrdiff_t *n, ptrdiff_t *start,
|
||||
ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
|
||||
const bench_real *out)
|
||||
{
|
||||
ptrdiff_t i;
|
||||
if (rnk == 0) {
|
||||
for (i = 0; i < vn; ++i)
|
||||
in[i] = out[i];
|
||||
}
|
||||
else if (rnk == 1) { /* this case is just an optimization */
|
||||
ptrdiff_t j;
|
||||
out += start[0] * os[0];
|
||||
for (j = 0; j < n[0]; ++j) {
|
||||
for (i = 0; i < vn; ++i)
|
||||
in[i] = out[i];
|
||||
in += is;
|
||||
out += os[0];
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* we should do n[0] for locality, but this way is simpler to code */
|
||||
for (i = 0; i < n[rnk - 1]; ++i)
|
||||
copy_block_in(in + i * is,
|
||||
rnk - 1, n, start, is * n[rnk - 1], os, vn,
|
||||
out + (start[rnk - 1] + i) * os[rnk - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
static void do_scatter_in(bench_real *in)
|
||||
{
|
||||
bench_real *ali;
|
||||
int i;
|
||||
if (all_local_in_alloc) {
|
||||
bench_free(all_local_in);
|
||||
all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real));
|
||||
all_local_in_alloc = 0;
|
||||
}
|
||||
ali = all_local_in;
|
||||
for (i = 0; i < n_pes; ++i) {
|
||||
copy_block_in(ali,
|
||||
rnk, all_local_ni + i * rnk,
|
||||
all_local_starti + i * rnk,
|
||||
vn, istrides, vn,
|
||||
in);
|
||||
ali += isend_cnt[i];
|
||||
}
|
||||
MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
|
||||
local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
|
||||
0, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
static void do_gather_out(bench_real *out)
|
||||
{
|
||||
bench_real *alo;
|
||||
int i;
|
||||
|
||||
if (all_local_out_alloc) {
|
||||
bench_free(all_local_out);
|
||||
all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real));
|
||||
all_local_out_alloc = 0;
|
||||
}
|
||||
MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
|
||||
all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
|
||||
0, MPI_COMM_WORLD);
|
||||
MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);
|
||||
alo = all_local_out;
|
||||
for (i = 0; i < n_pes; ++i) {
|
||||
copy_block_out(alo,
|
||||
rnk, all_local_no + i * rnk,
|
||||
all_local_starto + i * rnk,
|
||||
vn, ostrides, vn,
|
||||
out);
|
||||
alo += orecv_cnt[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void alloc_local(ptrdiff_t nreal, int inplace)
|
||||
{
|
||||
bench_free(local_in);
|
||||
if (local_out != local_in) bench_free(local_out);
|
||||
local_in = local_out = 0;
|
||||
if (nreal > 0) {
|
||||
ptrdiff_t i;
|
||||
local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
|
||||
if (inplace)
|
||||
local_out = local_in;
|
||||
else
|
||||
local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
|
||||
for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
|
||||
{
|
||||
UNUSED(p);
|
||||
do_scatter_in(ri);
|
||||
if (plan_scramble_in) FFTW(execute)(plan_scramble_in);
|
||||
}
|
||||
|
||||
void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
|
||||
{
|
||||
UNUSED(p);
|
||||
if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out);
|
||||
do_gather_out(ro);
|
||||
}
|
||||
|
||||
void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
|
||||
{
|
||||
UNUSED(ii);
|
||||
after_problem_rcopy_from(p, ri);
|
||||
}
|
||||
|
||||
void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
|
||||
{
|
||||
UNUSED(io);
|
||||
after_problem_rcopy_to(p, ro);
|
||||
}
|
||||
|
||||
void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
|
||||
{
|
||||
UNUSED(ii);
|
||||
after_problem_rcopy_from(p, ri);
|
||||
}
|
||||
|
||||
void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
|
||||
{
|
||||
UNUSED(io);
|
||||
after_problem_rcopy_to(p, ro);
|
||||
}
|
||||
|
||||
static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny,
|
||||
ptrdiff_t vn,
|
||||
bench_real *in, bench_real *out)
|
||||
{
|
||||
FFTW(iodim64) hdims[3];
|
||||
FFTW(r2r_kind) k[3];
|
||||
FFTW(plan) pln;
|
||||
|
||||
hdims[0].n = nx;
|
||||
hdims[0].is = ny * vn;
|
||||
hdims[0].os = vn;
|
||||
hdims[1].n = ny;
|
||||
hdims[1].is = vn;
|
||||
hdims[1].os = nx * vn;
|
||||
hdims[2].n = vn;
|
||||
hdims[2].is = 1;
|
||||
hdims[2].os = 1;
|
||||
k[0] = k[1] = k[2] = FFTW_R2HC;
|
||||
pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE);
|
||||
BENCH_ASSERT(pln != 0);
|
||||
return pln;
|
||||
}
|
||||
|
||||
static int tensor_rowmajor_transposedp(bench_tensor *t)
|
||||
{
|
||||
bench_iodim *d;
|
||||
int i;
|
||||
|
||||
BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
|
||||
if (t->rnk < 2)
|
||||
return 0;
|
||||
|
||||
d = t->dims;
|
||||
if (d[0].is != d[1].is * d[1].n
|
||||
|| d[0].os != d[1].is
|
||||
|| d[1].os != d[0].os * d[0].n)
|
||||
return 0;
|
||||
if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
|
||||
return 0;
|
||||
for (i = 2; i + 1 < t->rnk; ++i) {
|
||||
d = t->dims + i;
|
||||
if (d[0].is != d[1].is * d[1].n
|
||||
|| d[0].os != d[1].os * d[1].n)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tensor_contiguousp(bench_tensor *t, int s)
|
||||
{
|
||||
return (t->dims[t->rnk-1].is == s
|
||||
&& ((tensor_rowmajorp(t) &&
|
||||
t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)
|
||||
|| tensor_rowmajor_transposedp(t)));
|
||||
}
|
||||
|
||||
static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
|
||||
{
|
||||
FFTW(plan) pln = 0;
|
||||
int i;
|
||||
ptrdiff_t ntot;
|
||||
|
||||
vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
|
||||
|
||||
if (p->sz->rnk < 1
|
||||
|| p->split
|
||||
|| !tensor_contiguousp(p->sz, vn)
|
||||
|| tensor_rowmajor_transposedp(p->sz)
|
||||
|| p->vecsz->rnk > 1
|
||||
|| (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|
||||
|| p->vecsz->dims[0].os != 1)))
|
||||
return 0;
|
||||
|
||||
alloc_rnk(p->sz->rnk);
|
||||
for (i = 0; i < rnk; ++i) {
|
||||
total_ni[i] = total_no[i] = p->sz->dims[i].n;
|
||||
local_ni[i] = local_no[i] = total_ni[i];
|
||||
local_starti[i] = local_starto[i] = 0;
|
||||
}
|
||||
if (rnk > 1) {
|
||||
ptrdiff_t n, start, nT, startT;
|
||||
ntot = FFTW(mpi_local_size_many_transposed)
|
||||
(p->sz->rnk, total_ni, vn,
|
||||
FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
|
||||
MPI_COMM_WORLD,
|
||||
&n, &start, &nT, &startT);
|
||||
if (flags & FFTW_MPI_TRANSPOSED_IN) {
|
||||
local_ni[1] = nT;
|
||||
local_starti[1] = startT;
|
||||
}
|
||||
else {
|
||||
local_ni[0] = n;
|
||||
local_starti[0] = start;
|
||||
}
|
||||
if (flags & FFTW_MPI_TRANSPOSED_OUT) {
|
||||
local_no[1] = nT;
|
||||
local_starto[1] = startT;
|
||||
}
|
||||
else {
|
||||
local_no[0] = n;
|
||||
local_starto[0] = start;
|
||||
}
|
||||
}
|
||||
else if (rnk == 1) {
|
||||
ntot = FFTW(mpi_local_size_many_1d)
|
||||
(total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
|
||||
local_ni, local_starti, local_no, local_starto);
|
||||
}
|
||||
alloc_local(ntot * 2, p->in == p->out);
|
||||
|
||||
pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
(FFTW(complex) *) local_in,
|
||||
(FFTW(complex) *) local_out,
|
||||
MPI_COMM_WORLD, p->sign, flags);
|
||||
|
||||
vn *= 2;
|
||||
|
||||
if (rnk > 1) {
|
||||
ptrdiff_t nrest = 1;
|
||||
for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
|
||||
if (flags & FFTW_MPI_TRANSPOSED_IN)
|
||||
plan_scramble_in = mkplan_transpose_local(
|
||||
p->sz->dims[0].n, local_ni[1], vn * nrest,
|
||||
local_in, local_in);
|
||||
if (flags & FFTW_MPI_TRANSPOSED_OUT)
|
||||
plan_unscramble_out = mkplan_transpose_local(
|
||||
local_no[1], p->sz->dims[0].n, vn * nrest,
|
||||
local_out, local_out);
|
||||
}
|
||||
|
||||
return pln;
|
||||
}
|
||||
|
||||
static int tensor_real_contiguousp(bench_tensor *t, int sign, int s)
|
||||
{
|
||||
return (t->dims[t->rnk-1].is == s
|
||||
&& ((tensor_real_rowmajorp(t, sign, 1) &&
|
||||
t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)));
|
||||
}
|
||||
|
||||
static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
|
||||
{
|
||||
FFTW(plan) pln = 0;
|
||||
int i;
|
||||
ptrdiff_t ntot;
|
||||
|
||||
vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
|
||||
|
||||
if (p->sz->rnk < 2
|
||||
|| p->split
|
||||
|| !tensor_real_contiguousp(p->sz, p->sign, vn)
|
||||
|| tensor_rowmajor_transposedp(p->sz)
|
||||
|| p->vecsz->rnk > 1
|
||||
|| (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|
||||
|| p->vecsz->dims[0].os != 1)))
|
||||
return 0;
|
||||
|
||||
alloc_rnk(p->sz->rnk);
|
||||
for (i = 0; i < rnk; ++i) {
|
||||
total_ni[i] = total_no[i] = p->sz->dims[i].n;
|
||||
local_ni[i] = local_no[i] = total_ni[i];
|
||||
local_starti[i] = local_starto[i] = 0;
|
||||
}
|
||||
local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1]
|
||||
= p->sz->dims[rnk-1].n / 2 + 1;
|
||||
{
|
||||
ptrdiff_t n, start, nT, startT;
|
||||
ntot = FFTW(mpi_local_size_many_transposed)
|
||||
(p->sz->rnk, total_ni, vn,
|
||||
FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
|
||||
MPI_COMM_WORLD,
|
||||
&n, &start, &nT, &startT);
|
||||
if (flags & FFTW_MPI_TRANSPOSED_IN) {
|
||||
local_ni[1] = nT;
|
||||
local_starti[1] = startT;
|
||||
}
|
||||
else {
|
||||
local_ni[0] = n;
|
||||
local_starti[0] = start;
|
||||
}
|
||||
if (flags & FFTW_MPI_TRANSPOSED_OUT) {
|
||||
local_no[1] = nT;
|
||||
local_starto[1] = startT;
|
||||
}
|
||||
else {
|
||||
local_no[0] = n;
|
||||
local_starto[0] = start;
|
||||
}
|
||||
}
|
||||
alloc_local(ntot * 2, p->in == p->out);
|
||||
|
||||
total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
|
||||
if (p->sign < 0)
|
||||
pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
local_in,
|
||||
(FFTW(complex) *) local_out,
|
||||
MPI_COMM_WORLD, flags);
|
||||
else
|
||||
pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
(FFTW(complex) *) local_in,
|
||||
local_out,
|
||||
MPI_COMM_WORLD, flags);
|
||||
|
||||
total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1;
|
||||
vn *= 2;
|
||||
|
||||
{
|
||||
ptrdiff_t nrest = 1;
|
||||
for (i = 2; i < rnk; ++i) nrest *= total_ni[i];
|
||||
if (flags & FFTW_MPI_TRANSPOSED_IN)
|
||||
plan_scramble_in = mkplan_transpose_local(
|
||||
total_ni[0], local_ni[1], vn * nrest,
|
||||
local_in, local_in);
|
||||
if (flags & FFTW_MPI_TRANSPOSED_OUT)
|
||||
plan_unscramble_out = mkplan_transpose_local(
|
||||
local_no[1], total_ni[0], vn * nrest,
|
||||
local_out, local_out);
|
||||
}
|
||||
|
||||
return pln;
|
||||
}
|
||||
|
||||
static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags)
|
||||
{
|
||||
ptrdiff_t ntot, nx, ny;
|
||||
int ix=0, iy=1, i;
|
||||
const bench_iodim *d = p->vecsz->dims;
|
||||
FFTW(plan) pln;
|
||||
|
||||
if (p->vecsz->rnk == 3) {
|
||||
for (i = 0; i < 3; ++i)
|
||||
if (d[i].is == 1 && d[i].os == 1) {
|
||||
vn = d[i].n;
|
||||
ix = (i + 1) % 3;
|
||||
iy = (i + 2) % 3;
|
||||
break;
|
||||
}
|
||||
if (i == 3) return 0;
|
||||
}
|
||||
else {
|
||||
vn = 1;
|
||||
ix = 0;
|
||||
iy = 1;
|
||||
}
|
||||
|
||||
if (d[ix].is == d[iy].n * vn && d[ix].os == vn
|
||||
&& d[iy].os == d[ix].n * vn && d[iy].is == vn) {
|
||||
nx = d[ix].n;
|
||||
ny = d[iy].n;
|
||||
}
|
||||
else if (d[iy].is == d[ix].n * vn && d[iy].os == vn
|
||||
&& d[ix].os == d[iy].n * vn && d[ix].is == vn) {
|
||||
nx = d[iy].n;
|
||||
ny = d[ix].n;
|
||||
}
|
||||
else
|
||||
return 0;
|
||||
|
||||
alloc_rnk(2);
|
||||
ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD,
|
||||
&local_ni[0],
|
||||
&local_starti[0],
|
||||
&local_no[0],
|
||||
&local_starto[0]);
|
||||
local_ni[1] = ny;
|
||||
local_starti[1] = 0;
|
||||
local_no[1] = nx;
|
||||
local_starto[1] = 0;
|
||||
total_ni[0] = nx; total_ni[1] = ny;
|
||||
total_no[1] = nx; total_no[0] = ny;
|
||||
alloc_local(ntot, p->in == p->out);
|
||||
|
||||
pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
local_in, local_out,
|
||||
MPI_COMM_WORLD, flags);
|
||||
|
||||
if (flags & FFTW_MPI_TRANSPOSED_IN)
|
||||
plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn,
|
||||
local_in, local_in);
|
||||
if (flags & FFTW_MPI_TRANSPOSED_OUT)
|
||||
plan_unscramble_out = mkplan_transpose_local
|
||||
(nx, local_no[0], vn, local_out, local_out);
|
||||
|
||||
#if 0
|
||||
if (pln && vn == 1) {
|
||||
int i, j;
|
||||
bench_real *ri = (bench_real *) p->in;
|
||||
bench_real *ro = (bench_real *) p->out;
|
||||
if (!ri || !ro) return pln;
|
||||
setup_gather_scatter();
|
||||
for (i = 0; i < nx * ny; ++i)
|
||||
ri[i] = i;
|
||||
after_problem_rcopy_from(p, ri);
|
||||
FFTW(execute)(pln);
|
||||
after_problem_rcopy_to(p, ro);
|
||||
if (my_pe == 0) {
|
||||
for (i = 0; i < nx; ++i) {
|
||||
for (j = 0; j < ny; ++j)
|
||||
printf(" %3g", ro[j * nx + i]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return pln;
|
||||
}
|
||||
|
||||
static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
|
||||
{
|
||||
FFTW(plan) pln = 0;
|
||||
int i;
|
||||
ptrdiff_t ntot;
|
||||
FFTW(r2r_kind) *k;
|
||||
|
||||
if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1))
|
||||
&& p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3)
|
||||
return mkplan_transpose(p, flags);
|
||||
|
||||
vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
|
||||
|
||||
if (p->sz->rnk < 1
|
||||
|| p->split
|
||||
|| !tensor_contiguousp(p->sz, vn)
|
||||
|| tensor_rowmajor_transposedp(p->sz)
|
||||
|| p->vecsz->rnk > 1
|
||||
|| (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|
||||
|| p->vecsz->dims[0].os != 1)))
|
||||
return 0;
|
||||
|
||||
alloc_rnk(p->sz->rnk);
|
||||
for (i = 0; i < rnk; ++i) {
|
||||
total_ni[i] = total_no[i] = p->sz->dims[i].n;
|
||||
local_ni[i] = local_no[i] = total_ni[i];
|
||||
local_starti[i] = local_starto[i] = 0;
|
||||
}
|
||||
if (rnk > 1) {
|
||||
ptrdiff_t n, start, nT, startT;
|
||||
ntot = FFTW(mpi_local_size_many_transposed)
|
||||
(p->sz->rnk, total_ni, vn,
|
||||
FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
|
||||
MPI_COMM_WORLD,
|
||||
&n, &start, &nT, &startT);
|
||||
if (flags & FFTW_MPI_TRANSPOSED_IN) {
|
||||
local_ni[1] = nT;
|
||||
local_starti[1] = startT;
|
||||
}
|
||||
else {
|
||||
local_ni[0] = n;
|
||||
local_starti[0] = start;
|
||||
}
|
||||
if (flags & FFTW_MPI_TRANSPOSED_OUT) {
|
||||
local_no[1] = nT;
|
||||
local_starto[1] = startT;
|
||||
}
|
||||
else {
|
||||
local_no[0] = n;
|
||||
local_starto[0] = start;
|
||||
}
|
||||
}
|
||||
else if (rnk == 1) {
|
||||
ntot = FFTW(mpi_local_size_many_1d)
|
||||
(total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
|
||||
local_ni, local_starti, local_no, local_starto);
|
||||
}
|
||||
alloc_local(ntot, p->in == p->out);
|
||||
|
||||
k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk);
|
||||
for (i = 0; i < p->sz->rnk; ++i)
|
||||
switch (p->k[i]) {
|
||||
case R2R_R2HC: k[i] = FFTW_R2HC; break;
|
||||
case R2R_HC2R: k[i] = FFTW_HC2R; break;
|
||||
case R2R_DHT: k[i] = FFTW_DHT; break;
|
||||
case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
|
||||
case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
|
||||
case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
|
||||
case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
|
||||
case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
|
||||
case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
|
||||
case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
|
||||
case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
|
||||
default: BENCH_ASSERT(0);
|
||||
}
|
||||
|
||||
pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
FFTW_MPI_DEFAULT_BLOCK,
|
||||
local_in, local_out,
|
||||
MPI_COMM_WORLD, k, flags);
|
||||
bench_free(k);
|
||||
|
||||
if (rnk > 1) {
|
||||
ptrdiff_t nrest = 1;
|
||||
for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
|
||||
if (flags & FFTW_MPI_TRANSPOSED_IN)
|
||||
plan_scramble_in = mkplan_transpose_local(
|
||||
p->sz->dims[0].n, local_ni[1], vn * nrest,
|
||||
local_in, local_in);
|
||||
if (flags & FFTW_MPI_TRANSPOSED_OUT)
|
||||
plan_unscramble_out = mkplan_transpose_local(
|
||||
local_no[1], p->sz->dims[0].n, vn * nrest,
|
||||
local_out, local_out);
|
||||
}
|
||||
|
||||
return pln;
|
||||
}
|
||||
|
||||
FFTW(plan) mkplan(bench_problem *p, unsigned flags)
|
||||
{
|
||||
FFTW(plan) pln = 0;
|
||||
FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
|
||||
FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
|
||||
if (p->scrambled_in) {
|
||||
if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
|
||||
flags |= FFTW_MPI_SCRAMBLED_IN;
|
||||
else
|
||||
flags |= FFTW_MPI_TRANSPOSED_IN;
|
||||
}
|
||||
if (p->scrambled_out) {
|
||||
if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
|
||||
flags |= FFTW_MPI_SCRAMBLED_OUT;
|
||||
else
|
||||
flags |= FFTW_MPI_TRANSPOSED_OUT;
|
||||
}
|
||||
switch (p->kind) {
|
||||
case PROBLEM_COMPLEX:
|
||||
pln =mkplan_complex(p, flags);
|
||||
break;
|
||||
case PROBLEM_REAL:
|
||||
pln = mkplan_real(p, flags);
|
||||
break;
|
||||
case PROBLEM_R2R:
|
||||
pln = mkplan_r2r(p, flags);
|
||||
break;
|
||||
default: BENCH_ASSERT(0);
|
||||
}
|
||||
if (pln) setup_gather_scatter();
|
||||
return pln;
|
||||
}
|
||||
|
||||
void main_init(int *argc, char ***argv)
|
||||
{
|
||||
#ifdef HAVE_SMP
|
||||
# if MPI_VERSION >= 2 /* for MPI_Init_thread */
|
||||
int provided;
|
||||
MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided);
|
||||
threads_ok = provided >= MPI_THREAD_FUNNELED;
|
||||
# else
|
||||
MPI_Init(argc, argv);
|
||||
threads_ok = 0;
|
||||
# endif
|
||||
#else
|
||||
MPI_Init(argc, argv);
|
||||
#endif
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &n_pes);
|
||||
if (my_pe != 0) verbose = -999;
|
||||
no_speed_allocation = 1; /* so we can benchmark transforms > memory */
|
||||
always_pad_real = 1; /* out-of-place real transforms are padded */
|
||||
isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
|
||||
isend_off = (int *) bench_malloc(sizeof(int) * n_pes);
|
||||
orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
|
||||
orecv_off = (int *) bench_malloc(sizeof(int) * n_pes);
|
||||
|
||||
/* init_threads must be called before any other FFTW function,
|
||||
including mpi_init, because it has to register the threads hooks
|
||||
before the planner is initalized */
|
||||
#ifdef HAVE_SMP
|
||||
if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); }
|
||||
#endif
|
||||
FFTW(mpi_init)();
|
||||
}
|
||||
|
||||
void initial_cleanup(void)
|
||||
{
|
||||
alloc_rnk(0);
|
||||
alloc_local(0, 0);
|
||||
bench_free(all_local_in); all_local_in = 0;
|
||||
bench_free(all_local_out); all_local_out = 0;
|
||||
bench_free(isend_off); isend_off = 0;
|
||||
bench_free(isend_cnt); isend_cnt = 0;
|
||||
bench_free(orecv_off); orecv_off = 0;
|
||||
bench_free(orecv_cnt); orecv_cnt = 0;
|
||||
FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
|
||||
FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
|
||||
}
|
||||
|
||||
void final_cleanup(void)
|
||||
{
|
||||
MPI_Finalize();
|
||||
}
|
||||
|
||||
void bench_exit(int status)
|
||||
{
|
||||
MPI_Abort(MPI_COMM_WORLD, status);
|
||||
}
|
||||
|
||||
double bench_cost_postprocess(double cost)
|
||||
{
|
||||
double cost_max;
|
||||
MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
||||
return cost_max;
|
||||
}
|
||||
|
||||
|
||||
int import_wisdom(FILE *f)
|
||||
{
|
||||
int success = 1, sall;
|
||||
if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f);
|
||||
FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
|
||||
return sall;
|
||||
}
|
||||
|
||||
void export_wisdom(FILE *f)
|
||||
{
|
||||
FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD);
|
||||
if (my_pe == 0) FFTW(export_wisdom_to_file)(f);
|
||||
}
|
||||
59
fftw-3.3.10/mpi/mpi-dft.h
Normal file
59
fftw-3.3.10/mpi/mpi-dft.h
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
/* problem.c: */
|
||||
typedef struct {
|
||||
problem super;
|
||||
dtensor *sz;
|
||||
INT vn; /* vector length (vector stride 1) */
|
||||
R *I, *O; /* contiguous interleaved arrays */
|
||||
|
||||
int sign; /* FFTW_FORWARD / FFTW_BACKWARD */
|
||||
unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
|
||||
SCRAMBLED_IN/OUT meaningful for 1d transforms only */
|
||||
|
||||
MPI_Comm comm;
|
||||
} problem_mpi_dft;
|
||||
|
||||
problem *XM(mkproblem_dft)(const dtensor *sz, INT vn,
|
||||
R *I, R *O, MPI_Comm comm,
|
||||
int sign, unsigned flags);
|
||||
problem *XM(mkproblem_dft_d)(dtensor *sz, INT vn,
|
||||
R *I, R *O, MPI_Comm comm,
|
||||
int sign, unsigned flags);
|
||||
|
||||
/* solve.c: */
|
||||
void XM(dft_solve)(const plan *ego_, const problem *p_);
|
||||
|
||||
/* plans have same operands as rdft plans, so just re-use */
|
||||
typedef plan_rdft plan_mpi_dft;
|
||||
#define MKPLAN_MPI_DFT(type, adt, apply) \
|
||||
(type *)X(mkplan_rdft)(sizeof(type), adt, apply)
|
||||
|
||||
int XM(dft_serial_applicable)(const problem_mpi_dft *p);
|
||||
|
||||
/* various solvers */
|
||||
void XM(dft_rank_geq2_register)(planner *p);
|
||||
void XM(dft_rank_geq2_transposed_register)(planner *p);
|
||||
void XM(dft_serial_register)(planner *p);
|
||||
void XM(dft_rank1_bigvec_register)(planner *p);
|
||||
void XM(dft_rank1_register)(planner *p);
|
||||
66
fftw-3.3.10/mpi/mpi-rdft.h
Normal file
66
fftw-3.3.10/mpi/mpi-rdft.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
/* problem.c: */
|
||||
typedef struct {
|
||||
problem super;
|
||||
dtensor *sz;
|
||||
INT vn; /* vector length (vector stride 1) */
|
||||
R *I, *O; /* contiguous interleaved arrays */
|
||||
|
||||
|
||||
unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
|
||||
SCRAMBLED_IN/OUT meaningful for 1d transforms only */
|
||||
|
||||
MPI_Comm comm;
|
||||
|
||||
#if defined(STRUCT_HACK_KR)
|
||||
rdft_kind kind[1];
|
||||
#elif defined(STRUCT_HACK_C99)
|
||||
rdft_kind kind[];
|
||||
#else
|
||||
rdft_kind *kind;
|
||||
#endif
|
||||
} problem_mpi_rdft;
|
||||
|
||||
problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
|
||||
R *I, R *O, MPI_Comm comm,
|
||||
const rdft_kind *kind, unsigned flags);
|
||||
problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
|
||||
R *I, R *O, MPI_Comm comm,
|
||||
const rdft_kind *kind, unsigned flags);
|
||||
|
||||
/* solve.c: */
|
||||
void XM(rdft_solve)(const plan *ego_, const problem *p_);
|
||||
|
||||
/* plans have same operands as rdft plans, so just re-use */
|
||||
typedef plan_rdft plan_mpi_rdft;
|
||||
#define MKPLAN_MPI_RDFT(type, adt, apply) \
|
||||
(type *)X(mkplan_rdft)(sizeof(type), adt, apply)
|
||||
|
||||
int XM(rdft_serial_applicable)(const problem_mpi_rdft *p);
|
||||
|
||||
/* various solvers */
|
||||
void XM(rdft_rank_geq2_register)(planner *p);
|
||||
void XM(rdft_rank_geq2_transposed_register)(planner *p);
|
||||
void XM(rdft_serial_register)(planner *p);
|
||||
void XM(rdft_rank1_bigvec_register)(planner *p);
|
||||
64
fftw-3.3.10/mpi/mpi-rdft2.h
Normal file
64
fftw-3.3.10/mpi/mpi-rdft2.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
/* r2c and c2r transforms. The sz dtensor, as usual, gives the size
|
||||
of the "logical" complex array. For the last dimension N, however,
|
||||
only N/2+1 complex numbers are stored for the complex data. Moreover,
|
||||
for the real data, the last dimension is *always* padded to a size
|
||||
2*(N/2+1). (Contrast this with the serial API, where there is only
|
||||
padding for in-place plans.) */
|
||||
|
||||
/* problem.c: */
|
||||
typedef struct {
|
||||
problem super;
|
||||
dtensor *sz;
|
||||
INT vn; /* vector length (vector stride 1) */
|
||||
R *I, *O; /* contiguous interleaved arrays */
|
||||
|
||||
rdft_kind kind; /* assert(kind < DHT) */
|
||||
unsigned flags; /* TRANSPOSED_IN/OUT meaningful for rnk>1 only
|
||||
SCRAMBLED_IN/OUT meaningful for 1d transforms only */
|
||||
|
||||
MPI_Comm comm;
|
||||
} problem_mpi_rdft2;
|
||||
|
||||
problem *XM(mkproblem_rdft2)(const dtensor *sz, INT vn,
|
||||
R *I, R *O, MPI_Comm comm,
|
||||
rdft_kind kind, unsigned flags);
|
||||
problem *XM(mkproblem_rdft2_d)(dtensor *sz, INT vn,
|
||||
R *I, R *O, MPI_Comm comm,
|
||||
rdft_kind kind, unsigned flags);
|
||||
|
||||
/* solve.c: */
|
||||
void XM(rdft2_solve)(const plan *ego_, const problem *p_);
|
||||
|
||||
/* plans have same operands as rdft plans, so just re-use */
|
||||
typedef plan_rdft plan_mpi_rdft2;
|
||||
#define MKPLAN_MPI_RDFT2(type, adt, apply) \
|
||||
(type *)X(mkplan_rdft)(sizeof(type), adt, apply)
|
||||
|
||||
int XM(rdft2_serial_applicable)(const problem_mpi_rdft2 *p);
|
||||
|
||||
/* various solvers */
|
||||
void XM(rdft2_rank_geq2_register)(planner *p);
|
||||
void XM(rdft2_rank_geq2_transposed_register)(planner *p);
|
||||
void XM(rdft2_serial_register)(planner *p);
|
||||
61
fftw-3.3.10/mpi/mpi-transpose.h
Normal file
61
fftw-3.3.10/mpi/mpi-transpose.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
/* tproblem.c: */
|
||||
typedef struct {
|
||||
problem super;
|
||||
INT vn; /* vector length (vector stride 1) */
|
||||
INT nx, ny; /* nx x ny transposed to ny x nx */
|
||||
R *I, *O; /* contiguous real arrays (both same size!) */
|
||||
|
||||
unsigned flags; /* TRANSPOSED_IN: input is *locally* transposed
|
||||
TRANSPOSED_OUT: output is *locally* transposed */
|
||||
|
||||
INT block, tblock; /* block size, slab decomposition;
|
||||
tblock is for transposed blocks on output */
|
||||
|
||||
MPI_Comm comm;
|
||||
} problem_mpi_transpose;
|
||||
|
||||
problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
|
||||
R *I, R *O,
|
||||
INT block, INT tblock,
|
||||
MPI_Comm comm,
|
||||
unsigned flags);
|
||||
|
||||
/* tsolve.c: */
|
||||
void XM(transpose_solve)(const plan *ego_, const problem *p_);
|
||||
|
||||
/* plans have same operands as rdft plans, so just re-use */
|
||||
typedef plan_rdft plan_mpi_transpose;
|
||||
#define MKPLAN_MPI_TRANSPOSE(type, adt, apply) \
|
||||
(type *)X(mkplan_rdft)(sizeof(type), adt, apply)
|
||||
|
||||
/* transpose-pairwise.c: */
|
||||
int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
|
||||
R *I, R *O, int my_pe,
|
||||
plan **cld2, plan **cld2rest, plan **cld3,
|
||||
INT *rest_Ioff, INT *rest_Ooff);
|
||||
/* various solvers */
|
||||
void XM(transpose_pairwise_register)(planner *p);
|
||||
void XM(transpose_alltoall_register)(planner *p);
|
||||
void XM(transpose_recurse_register)(planner *p);
|
||||
155
fftw-3.3.10/mpi/rdft-problem.c
Normal file
155
fftw-3.3.10/mpi/rdft-problem.c
Normal file
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mpi-rdft.h"
|
||||
|
||||
static void destroy(problem *ego_)
|
||||
{
|
||||
problem_mpi_rdft *ego = (problem_mpi_rdft *) ego_;
|
||||
XM(dtensor_destroy)(ego->sz);
|
||||
MPI_Comm_free(&ego->comm);
|
||||
#if !defined(STRUCT_HACK_C99) && !defined(STRUCT_HACK_KR)
|
||||
X(ifree0)(ego->kind);
|
||||
#endif
|
||||
X(ifree)(ego_);
|
||||
}
|
||||
|
||||
static void hash(const problem *p_, md5 *m)
|
||||
{
|
||||
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
|
||||
int i;
|
||||
X(md5puts)(m, "mpi-dft");
|
||||
X(md5int)(m, p->I == p->O);
|
||||
/* don't include alignment -- may differ between processes
|
||||
X(md5int)(m, X(ialignment_of)(p->I));
|
||||
X(md5int)(m, X(ialignment_of)(p->O));
|
||||
... note that applicability of MPI plans does not depend
|
||||
on alignment (although optimality may, in principle). */
|
||||
XM(dtensor_md5)(m, p->sz);
|
||||
X(md5INT)(m, p->vn);
|
||||
for (i = 0; i < p->sz->rnk; ++i)
|
||||
X(md5int)(m, p->kind[i]);
|
||||
X(md5int)(m, p->flags);
|
||||
MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
|
||||
A(XM(md5_equal)(*m, p->comm));
|
||||
}
|
||||
|
||||
static void print(const problem *ego_, printer *p)
|
||||
{
|
||||
const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
|
||||
int i;
|
||||
p->print(p, "(mpi-rdft %d %d %d ",
|
||||
ego->I == ego->O,
|
||||
X(ialignment_of)(ego->I),
|
||||
X(ialignment_of)(ego->O));
|
||||
XM(dtensor_print)(ego->sz, p);
|
||||
for (i = 0; i < ego->sz->rnk; ++i)
|
||||
p->print(p, " %d", (int)ego->kind[i]);
|
||||
p->print(p, " %D %d", ego->vn, ego->flags);
|
||||
MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
|
||||
}
|
||||
|
||||
static void zero(const problem *ego_)
|
||||
{
|
||||
const problem_mpi_rdft *ego = (const problem_mpi_rdft *) ego_;
|
||||
R *I = ego->I;
|
||||
INT i, N;
|
||||
int my_pe;
|
||||
|
||||
MPI_Comm_rank(ego->comm, &my_pe);
|
||||
N = ego->vn * XM(total_block)(ego->sz, IB, my_pe);
|
||||
for (i = 0; i < N; ++i) I[i] = K(0.0);
|
||||
}
|
||||
|
||||
static const problem_adt padt =
|
||||
{
|
||||
PROBLEM_MPI_RDFT,
|
||||
hash,
|
||||
zero,
|
||||
print,
|
||||
destroy
|
||||
};
|
||||
|
||||
problem *XM(mkproblem_rdft)(const dtensor *sz, INT vn,
|
||||
R *I, R *O,
|
||||
MPI_Comm comm,
|
||||
const rdft_kind *kind, unsigned flags)
|
||||
{
|
||||
problem_mpi_rdft *ego;
|
||||
int i, rnk = sz->rnk;
|
||||
int n_pes;
|
||||
|
||||
A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk));
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
A(n_pes >= XM(num_blocks_total)(sz, IB)
|
||||
&& n_pes >= XM(num_blocks_total)(sz, OB));
|
||||
A(vn >= 0);
|
||||
|
||||
#if defined(STRUCT_HACK_KR)
|
||||
ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
|
||||
+ sizeof(rdft_kind)
|
||||
* (rnk > 0 ? rnk - 1 : 0), &padt);
|
||||
#elif defined(STRUCT_HACK_C99)
|
||||
ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft)
|
||||
+ sizeof(rdft_kind) * rnk, &padt);
|
||||
#else
|
||||
ego = (problem_mpi_rdft *) X(mkproblem)(sizeof(problem_mpi_rdft), &padt);
|
||||
ego->kind = (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
|
||||
#endif
|
||||
|
||||
/* enforce pointer equality if untainted pointers are equal */
|
||||
if (UNTAINT(I) == UNTAINT(O))
|
||||
I = O = JOIN_TAINT(I, O);
|
||||
|
||||
ego->sz = XM(dtensor_canonical)(sz, 0);
|
||||
ego->vn = vn;
|
||||
ego->I = I;
|
||||
ego->O = O;
|
||||
for (i = 0; i< ego->sz->rnk; ++i)
|
||||
ego->kind[i] = kind[i];
|
||||
|
||||
/* canonicalize: replace TRANSPOSED_IN with TRANSPOSED_OUT by
|
||||
swapping the first two dimensions (for rnk > 1) */
|
||||
if ((flags & TRANSPOSED_IN) && ego->sz->rnk > 1) {
|
||||
rdft_kind k = ego->kind[0];
|
||||
ddim dim0 = ego->sz->dims[0];
|
||||
ego->sz->dims[0] = ego->sz->dims[1];
|
||||
ego->sz->dims[1] = dim0;
|
||||
ego->kind[0] = ego->kind[1];
|
||||
ego->kind[1] = k;
|
||||
flags &= ~TRANSPOSED_IN;
|
||||
flags ^= TRANSPOSED_OUT;
|
||||
}
|
||||
ego->flags = flags;
|
||||
|
||||
MPI_Comm_dup(comm, &ego->comm);
|
||||
|
||||
return &(ego->super);
|
||||
}
|
||||
|
||||
problem *XM(mkproblem_rdft_d)(dtensor *sz, INT vn,
|
||||
R *I, R *O,
|
||||
MPI_Comm comm,
|
||||
const rdft_kind *kind, unsigned flags)
|
||||
{
|
||||
problem *p = XM(mkproblem_rdft)(sz, vn, I, O, comm, kind, flags);
|
||||
XM(dtensor_destroy)(sz);
|
||||
return p;
|
||||
}
|
||||
211
fftw-3.3.10/mpi/rdft-rank-geq2-transposed.c
Normal file
211
fftw-3.3.10/mpi/rdft-rank-geq2-transposed.c
Normal file
@@ -0,0 +1,211 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Complex RDFTs of rank >= 2, for the case where we are distributed
|
||||
across the first dimension only, and the output is transposed both
|
||||
in data distribution and in ordering (for the first 2 dimensions).
|
||||
|
||||
(Note that we don't have to handle the case where the input is
|
||||
transposed, since this is equivalent to transposed output with the
|
||||
first two dimensions swapped, and is automatically canonicalized as
|
||||
such by rdft-problem.c. */
|
||||
|
||||
#include "mpi-rdft.h"
|
||||
#include "mpi-transpose.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_rdft super;
|
||||
|
||||
plan *cld1, *cldt, *cld2;
|
||||
INT roff, ioff;
|
||||
int preserve_input;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld1, *cld2, *cldt;
|
||||
|
||||
/* RDFT local dimensions */
|
||||
cld1 = (plan_rdft *) ego->cld1;
|
||||
if (ego->preserve_input) {
|
||||
cld1->apply(ego->cld1, I, O);
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
cld1->apply(ego->cld1, I, I);
|
||||
|
||||
/* global transpose */
|
||||
cldt = (plan_rdft *) ego->cldt;
|
||||
cldt->apply(ego->cldt, I, O);
|
||||
|
||||
/* RDFT final local dimension */
|
||||
cld2 = (plan_rdft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, O, O);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
|
||||
return (1
|
||||
&& p->sz->rnk > 1
|
||||
&& p->flags == TRANSPOSED_OUT
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
&& XM(is_local_after)(1, p->sz, IB)
|
||||
&& XM(is_local_after)(2, p->sz, OB)
|
||||
&& XM(num_blocks)(p->sz->dims[0].n, p->sz->dims[0].b[OB]) == 1
|
||||
&& (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
|
||||
|| !XM(rdft_serial_applicable)(p))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cldt, wakefulness);
|
||||
X(plan_awake)(ego->cld2, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld2);
|
||||
X(plan_destroy_internal)(ego->cldt);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-rdft-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))",
|
||||
ego->preserve_input==2 ?"/p":"",
|
||||
ego->cld1, ego->cldt, ego->cld2);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_rdft *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cldt = 0, *cld2 = 0;
|
||||
R *I, *O, *I2;
|
||||
tensor *sz;
|
||||
int i, my_pe, n_pes;
|
||||
INT nrest;
|
||||
static const plan_adt padt = {
|
||||
XM(rdft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_rdft *) p_;
|
||||
|
||||
I2 = I = p->I;
|
||||
O = p->O;
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
|
||||
I = O;
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
|
||||
i = p->sz->rnk - 2; A(i >= 0);
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = p->vn;
|
||||
for (--i; i >= 0; --i) {
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
|
||||
}
|
||||
nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
|
||||
{
|
||||
INT is = sz->dims[0].n * sz->dims[0].is;
|
||||
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
|
||||
cld1 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft_d)(sz,
|
||||
X(mktensor_2d)(b, is, is,
|
||||
p->vn, 1, 1),
|
||||
I2, I, p->kind + 1));
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
nrest *= p->vn;
|
||||
cldt = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
p->sz->dims[0].n, p->sz->dims[1].n, nrest,
|
||||
I, O,
|
||||
p->sz->dims[0].b[IB], p->sz->dims[1].b[OB],
|
||||
p->comm, 0));
|
||||
if (XM(any_true)(!cldt, p->comm)) goto nada;
|
||||
|
||||
{
|
||||
INT is = p->sz->dims[0].n * nrest;
|
||||
INT b = XM(block)(p->sz->dims[1].n, p->sz->dims[1].b[OB], my_pe);
|
||||
cld2 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft_1_d)(X(mktensor_1d)(
|
||||
p->sz->dims[0].n,
|
||||
nrest, nrest),
|
||||
X(mktensor_2d)(b, is, is,
|
||||
nrest, 1, 1),
|
||||
O, O, p->kind[0]));
|
||||
if (XM(any_true)(!cld2, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
pln = MKPLAN_MPI_RDFT(P, &padt, apply);
|
||||
pln->cld1 = cld1;
|
||||
pln->cldt = cldt;
|
||||
pln->cld2 = cld2;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
|
||||
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
|
||||
X(ops_add2)(&cldt->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cldt);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(rdft_rank_geq2_transposed_register)(planner *p)
|
||||
{
|
||||
int preserve_input;
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input));
|
||||
}
|
||||
179
fftw-3.3.10/mpi/rdft-rank-geq2.c
Normal file
179
fftw-3.3.10/mpi/rdft-rank-geq2.c
Normal file
@@ -0,0 +1,179 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Complex RDFTs of rank >= 2, for the case where we are distributed
|
||||
across the first dimension only, and the output is not transposed. */
|
||||
|
||||
#include "mpi-rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_rdft super;
|
||||
|
||||
plan *cld1, *cld2;
|
||||
int preserve_input;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld1, *cld2;
|
||||
|
||||
/* RDFT local dimensions */
|
||||
cld1 = (plan_rdft *) ego->cld1;
|
||||
if (ego->preserve_input) {
|
||||
cld1->apply(ego->cld1, I, O);
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
cld1->apply(ego->cld1, I, I);
|
||||
|
||||
/* RDFT non-local dimension (via rdft-rank1-bigvec, usually): */
|
||||
cld2 = (plan_rdft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, I, O);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
|
||||
return (1
|
||||
&& p->sz->rnk > 1
|
||||
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
&& XM(is_local_after)(1, p->sz, IB)
|
||||
&& XM(is_local_after)(1, p->sz, OB)
|
||||
&& (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
|
||||
|| !XM(rdft_serial_applicable)(p))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cld2, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld2);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-rdft-rank-geq2%s%(%p%)%(%p%))",
|
||||
ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_rdft *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cld2 = 0;
|
||||
R *I, *O, *I2;
|
||||
tensor *sz;
|
||||
dtensor *sz2;
|
||||
int i, my_pe, n_pes;
|
||||
INT nrest;
|
||||
static const plan_adt padt = {
|
||||
XM(rdft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_rdft *) p_;
|
||||
|
||||
I2 = I = p->I;
|
||||
O = p->O;
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr))
|
||||
I = O;
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
|
||||
i = p->sz->rnk - 2; A(i >= 0);
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = p->vn;
|
||||
for (--i; i >= 0; --i) {
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
|
||||
}
|
||||
nrest = X(tensor_sz)(sz);
|
||||
{
|
||||
INT is = sz->dims[0].n * sz->dims[0].is;
|
||||
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
|
||||
cld1 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft_d)(sz,
|
||||
X(mktensor_2d)(b, is, is,
|
||||
p->vn, 1, 1),
|
||||
I2, I, p->kind + 1));
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
|
||||
sz2->dims[0] = p->sz->dims[0];
|
||||
cld2 = X(mkplan_d)(plnr, XM(mkproblem_rdft_d)(sz2, nrest * p->vn,
|
||||
I, O,
|
||||
p->comm, p->kind,
|
||||
RANK1_BIGVEC_ONLY));
|
||||
if (XM(any_true)(!cld2, p->comm)) goto nada;
|
||||
|
||||
pln = MKPLAN_MPI_RDFT(P, &padt, apply);
|
||||
pln->cld1 = cld1;
|
||||
pln->cld2 = cld2;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
|
||||
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(rdft_rank_geq2_register)(planner *p)
|
||||
{
|
||||
int preserve_input;
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input));
|
||||
}
|
||||
205
fftw-3.3.10/mpi/rdft-rank1-bigvec.c
Normal file
205
fftw-3.3.10/mpi/rdft-rank1-bigvec.c
Normal file
@@ -0,0 +1,205 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Complex RDFTs of rank == 1 when the vector length vn is >= # processes.
|
||||
In this case, we don't need to use a six-step type algorithm, and can
|
||||
instead transpose the RDFT dimension with the vector dimension to
|
||||
make the RDFT local. */
|
||||
|
||||
#include "mpi-rdft.h"
|
||||
#include "mpi-transpose.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
rearrangement rearrange;
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_rdft super;
|
||||
|
||||
plan *cldt_before, *cld, *cldt_after;
|
||||
int preserve_input;
|
||||
rearrangement rearrange;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld, *cldt_before, *cldt_after;
|
||||
|
||||
/* global transpose */
|
||||
cldt_before = (plan_rdft *) ego->cldt_before;
|
||||
cldt_before->apply(ego->cldt_before, I, O);
|
||||
|
||||
if (ego->preserve_input) I = O;
|
||||
|
||||
/* 1d RDFT(s) */
|
||||
cld = (plan_rdft *) ego->cld;
|
||||
cld->apply(ego->cld, O, I);
|
||||
|
||||
/* global transpose */
|
||||
cldt_after = (plan_rdft *) ego->cldt_after;
|
||||
cldt_after->apply(ego->cldt_after, I, O);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
|
||||
int n_pes;
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
return (1
|
||||
&& p->sz->rnk == 1
|
||||
&& !(p->flags & ~RANK1_BIGVEC_ONLY)
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
|
||||
#if 0 /* don't need this check since no other rank-1 rdft solver */
|
||||
&& (p->vn >= n_pes /* TODO: relax this, using more memory? */
|
||||
|| (p->flags & RANK1_BIGVEC_ONLY))
|
||||
#endif
|
||||
|
||||
&& XM(rearrange_applicable)(ego->rearrange,
|
||||
p->sz->dims[0], p->vn, n_pes)
|
||||
|
||||
&& (!NO_SLOWP(plnr) /* slow if rdft-serial is applicable */
|
||||
|| !XM(rdft_serial_applicable)(p))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cldt_before, wakefulness);
|
||||
X(plan_awake)(ego->cld, wakefulness);
|
||||
X(plan_awake)(ego->cldt_after, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cldt_after);
|
||||
X(plan_destroy_internal)(ego->cld);
|
||||
X(plan_destroy_internal)(ego->cldt_before);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
const char descrip[][16] = { "contig", "discontig", "square-after",
|
||||
"square-middle", "square-before" };
|
||||
p->print(p, "(mpi-rdft-rank1-bigvec/%s%s %(%p%) %(%p%) %(%p%))",
|
||||
descrip[ego->rearrange], ego->preserve_input==2 ?"/p":"",
|
||||
ego->cldt_before, ego->cld, ego->cldt_after);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_rdft *p;
|
||||
P *pln;
|
||||
plan *cld = 0, *cldt_before = 0, *cldt_after = 0;
|
||||
R *I, *O;
|
||||
INT yblock, yb, nx, ny, vn;
|
||||
int my_pe, n_pes;
|
||||
static const plan_adt padt = {
|
||||
XM(rdft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_rdft *) p_;
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
nx = p->sz->dims[0].n;
|
||||
if (!(ny = XM(rearrange_ny)(ego->rearrange, p->sz->dims[0],p->vn,n_pes)))
|
||||
return (plan *) 0;
|
||||
vn = p->vn / ny;
|
||||
A(ny * vn == p->vn);
|
||||
|
||||
yblock = XM(default_block)(ny, n_pes);
|
||||
cldt_before = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
nx, ny, vn,
|
||||
I = p->I, O = p->O,
|
||||
p->sz->dims[0].b[IB], yblock,
|
||||
p->comm, 0));
|
||||
if (XM(any_true)(!cldt_before, p->comm)) goto nada;
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) { I = O; }
|
||||
|
||||
yb = XM(block)(ny, yblock, my_pe);
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft_1_d)(X(mktensor_1d)(nx, vn, vn),
|
||||
X(mktensor_2d)(yb, vn*nx, vn*nx,
|
||||
vn, 1, 1),
|
||||
O, I, p->kind[0]));
|
||||
if (XM(any_true)(!cld, p->comm)) goto nada;
|
||||
|
||||
cldt_after = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
ny, nx, vn,
|
||||
I, O,
|
||||
yblock, p->sz->dims[0].b[OB],
|
||||
p->comm, 0));
|
||||
if (XM(any_true)(!cldt_after, p->comm)) goto nada;
|
||||
|
||||
pln = MKPLAN_MPI_RDFT(P, &padt, apply);
|
||||
|
||||
pln->cldt_before = cldt_before;
|
||||
pln->cld = cld;
|
||||
pln->cldt_after = cldt_after;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
pln->rearrange = ego->rearrange;
|
||||
|
||||
X(ops_add)(&cldt_before->ops, &cld->ops, &pln->super.super.ops);
|
||||
X(ops_add2)(&cldt_after->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cldt_after);
|
||||
X(plan_destroy_internal)(cld);
|
||||
X(plan_destroy_internal)(cldt_before);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(rearrangement rearrange, int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->rearrange = rearrange;
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(rdft_rank1_bigvec_register)(planner *p)
|
||||
{
|
||||
rearrangement rearrange;
|
||||
int preserve_input;
|
||||
FORALL_REARRANGE(rearrange)
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(rearrange, preserve_input));
|
||||
}
|
||||
124
fftw-3.3.10/mpi/rdft-serial.c
Normal file
124
fftw-3.3.10/mpi/rdft-serial.c
Normal file
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* "MPI" RDFTs where all of the data is on one processor...just
|
||||
call through to serial API. */
|
||||
|
||||
#include "mpi-rdft.h"
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_rdft super;
|
||||
plan *cld;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld = (plan_rdft *) ego->cld;
|
||||
cld->apply(ego->cld, I, O);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-rdft-serial %(%p%))", ego->cld);
|
||||
}
|
||||
|
||||
int XM(rdft_serial_applicable)(const problem_mpi_rdft *p)
|
||||
{
|
||||
return (1
|
||||
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
|
||||
&& ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
|
||||
|| p->vn == 0));
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
|
||||
{
|
||||
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
|
||||
P *pln;
|
||||
plan *cld;
|
||||
int my_pe;
|
||||
static const plan_adt padt = {
|
||||
XM(rdft_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
/* check whether applicable: */
|
||||
if (!XM(rdft_serial_applicable)(p))
|
||||
return (plan *) 0;
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
if (my_pe == 0 && p->vn > 0) {
|
||||
int i, rnk = p->sz->rnk;
|
||||
tensor *sz = X(mktensor)(rnk);
|
||||
rdft_kind *kind
|
||||
= (rdft_kind *) MALLOC(sizeof(rdft_kind) * rnk, PROBLEMS);
|
||||
sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = p->vn;
|
||||
sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
|
||||
for (i = rnk - 1; i > 0; --i) {
|
||||
sz->dims[i - 1].is = sz->dims[i - 1].os =
|
||||
sz->dims[i].is * sz->dims[i].n;
|
||||
sz->dims[i - 1].n = p->sz->dims[i - 1].n;
|
||||
}
|
||||
for (i = 0; i < rnk; ++i)
|
||||
kind[i] = p->kind[i];
|
||||
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft_d)(sz,
|
||||
X(mktensor_1d)(p->vn, 1, 1),
|
||||
p->I, p->O, kind));
|
||||
X(ifree0)(kind);
|
||||
}
|
||||
else { /* idle process: make nop plan */
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_1d)(0,0,0),
|
||||
p->I, p->O));
|
||||
}
|
||||
if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
|
||||
|
||||
pln = MKPLAN_MPI_RDFT(P, &padt, apply);
|
||||
pln->cld = cld;
|
||||
X(ops_cpy)(&cld->ops, &pln->super.super.ops);
|
||||
return &(pln->super.super);
|
||||
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_RDFT, mkplan, 0 };
|
||||
return MKSOLVER(solver, &sadt);
|
||||
}
|
||||
|
||||
void XM(rdft_serial_register)(planner *p)
|
||||
{
|
||||
REGISTER_SOLVER(p, mksolver());
|
||||
}
|
||||
29
fftw-3.3.10/mpi/rdft-solve.c
Normal file
29
fftw-3.3.10/mpi/rdft-solve.c
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mpi-rdft.h"
|
||||
|
||||
/* use the apply() operation for MPI_RDFT problems */
|
||||
void XM(rdft_solve)(const plan *ego_, const problem *p_)
|
||||
{
|
||||
const plan_mpi_rdft *ego = (const plan_mpi_rdft *) ego_;
|
||||
const problem_mpi_rdft *p = (const problem_mpi_rdft *) p_;
|
||||
ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
|
||||
}
|
||||
139
fftw-3.3.10/mpi/rdft2-problem.c
Normal file
139
fftw-3.3.10/mpi/rdft2-problem.c
Normal file
@@ -0,0 +1,139 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mpi-rdft2.h"
|
||||
|
||||
static void destroy(problem *ego_)
|
||||
{
|
||||
problem_mpi_rdft2 *ego = (problem_mpi_rdft2 *) ego_;
|
||||
XM(dtensor_destroy)(ego->sz);
|
||||
MPI_Comm_free(&ego->comm);
|
||||
X(ifree)(ego_);
|
||||
}
|
||||
|
||||
static void hash(const problem *p_, md5 *m)
|
||||
{
|
||||
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
|
||||
int i;
|
||||
X(md5puts)(m, "mpi-rdft2");
|
||||
X(md5int)(m, p->I == p->O);
|
||||
/* don't include alignment -- may differ between processes
|
||||
X(md5int)(m, X(ialignment_of)(p->I));
|
||||
X(md5int)(m, X(ialignment_of)(p->O));
|
||||
... note that applicability of MPI plans does not depend
|
||||
on alignment (although optimality may, in principle). */
|
||||
XM(dtensor_md5)(m, p->sz);
|
||||
X(md5INT)(m, p->vn);
|
||||
X(md5int)(m, p->kind);
|
||||
X(md5int)(m, p->flags);
|
||||
MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
|
||||
A(XM(md5_equal)(*m, p->comm));
|
||||
}
|
||||
|
||||
static void print(const problem *ego_, printer *p)
|
||||
{
|
||||
const problem_mpi_rdft2 *ego = (const problem_mpi_rdft2 *) ego_;
|
||||
int i;
|
||||
p->print(p, "(mpi-rdft2 %d %d %d ",
|
||||
ego->I == ego->O,
|
||||
X(ialignment_of)(ego->I),
|
||||
X(ialignment_of)(ego->O));
|
||||
XM(dtensor_print)(ego->sz, p);
|
||||
p->print(p, " %D %d %d", ego->vn, (int) ego->kind, ego->flags);
|
||||
MPI_Comm_size(ego->comm, &i); p->print(p, " %d)", i);
|
||||
}
|
||||
|
||||
static void zero(const problem *ego_)
|
||||
{
|
||||
const problem_mpi_rdft2 *ego = (const problem_mpi_rdft2 *) ego_;
|
||||
R *I = ego->I;
|
||||
dtensor *sz;
|
||||
INT i, N;
|
||||
int my_pe;
|
||||
|
||||
sz = XM(dtensor_copy)(ego->sz);
|
||||
sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n / 2 + 1;
|
||||
MPI_Comm_rank(ego->comm, &my_pe);
|
||||
N = 2 * ego->vn * XM(total_block)(sz, IB, my_pe);
|
||||
XM(dtensor_destroy)(sz);
|
||||
for (i = 0; i < N; ++i) I[i] = K(0.0);
|
||||
}
|
||||
|
||||
static const problem_adt padt =
|
||||
{
|
||||
PROBLEM_MPI_RDFT2,
|
||||
hash,
|
||||
zero,
|
||||
print,
|
||||
destroy
|
||||
};
|
||||
|
||||
problem *XM(mkproblem_rdft2)(const dtensor *sz, INT vn,
|
||||
R *I, R *O,
|
||||
MPI_Comm comm,
|
||||
rdft_kind kind,
|
||||
unsigned flags)
|
||||
{
|
||||
problem_mpi_rdft2 *ego =
|
||||
(problem_mpi_rdft2 *)X(mkproblem)(sizeof(problem_mpi_rdft2), &padt);
|
||||
int n_pes;
|
||||
|
||||
A(XM(dtensor_validp)(sz) && FINITE_RNK(sz->rnk) && sz->rnk > 1);
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
A(vn >= 0);
|
||||
A(kind == R2HC || kind == HC2R);
|
||||
|
||||
/* enforce pointer equality if untainted pointers are equal */
|
||||
if (UNTAINT(I) == UNTAINT(O))
|
||||
I = O = JOIN_TAINT(I, O);
|
||||
|
||||
ego->sz = XM(dtensor_canonical)(sz, 0);
|
||||
#ifdef FFTW_DEBUG
|
||||
ego->sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n / 2 + 1;
|
||||
A(n_pes >= XM(num_blocks_total)(ego->sz, IB)
|
||||
&& n_pes >= XM(num_blocks_total)(ego->sz, OB));
|
||||
ego->sz->dims[sz->rnk - 1].n = sz->dims[sz->rnk - 1].n;
|
||||
#endif
|
||||
|
||||
ego->vn = vn;
|
||||
ego->I = I;
|
||||
ego->O = O;
|
||||
ego->kind = kind;
|
||||
|
||||
/* We only support TRANSPOSED_OUT for r2c and TRANSPOSED_IN for
|
||||
c2r transforms. */
|
||||
|
||||
ego->flags = flags;
|
||||
|
||||
MPI_Comm_dup(comm, &ego->comm);
|
||||
|
||||
return &(ego->super);
|
||||
}
|
||||
|
||||
problem *XM(mkproblem_rdft2_d)(dtensor *sz, INT vn,
|
||||
R *I, R *O,
|
||||
MPI_Comm comm,
|
||||
rdft_kind kind,
|
||||
unsigned flags)
|
||||
{
|
||||
problem *p = XM(mkproblem_rdft2)(sz, vn, I, O, comm, kind, flags);
|
||||
XM(dtensor_destroy)(sz);
|
||||
return p;
|
||||
}
|
||||
287
fftw-3.3.10/mpi/rdft2-rank-geq2-transposed.c
Normal file
287
fftw-3.3.10/mpi/rdft2-rank-geq2-transposed.c
Normal file
@@ -0,0 +1,287 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Real-input (r2c) DFTs of rank >= 2, for the case where we are distributed
|
||||
across the first dimension only, and the output is transposed both
|
||||
in data distribution and in ordering (for the first 2 dimensions).
|
||||
|
||||
Conversely, real-output (c2r) DFTs where the input is transposed.
|
||||
|
||||
We don't currently support transposed-input r2c or transposed-output
|
||||
c2r transforms. */
|
||||
|
||||
#include "mpi-rdft2.h"
|
||||
#include "mpi-transpose.h"
|
||||
#include "rdft/rdft.h"
|
||||
#include "dft/dft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_rdft2 super;
|
||||
|
||||
plan *cld1, *cldt, *cld2;
|
||||
INT vn;
|
||||
int preserve_input;
|
||||
} P;
|
||||
|
||||
static void apply_r2c(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft2 *cld1;
|
||||
plan_dft *cld2;
|
||||
plan_rdft *cldt;
|
||||
|
||||
/* RDFT2 local dimensions */
|
||||
cld1 = (plan_rdft2 *) ego->cld1;
|
||||
if (ego->preserve_input) {
|
||||
cld1->apply(ego->cld1, I, I+ego->vn, O, O+1);
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
cld1->apply(ego->cld1, I, I+ego->vn, I, I+1);
|
||||
|
||||
/* global transpose */
|
||||
cldt = (plan_rdft *) ego->cldt;
|
||||
cldt->apply(ego->cldt, I, O);
|
||||
|
||||
/* DFT final local dimension */
|
||||
cld2 = (plan_dft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, O, O+1, O, O+1);
|
||||
}
|
||||
|
||||
static void apply_c2r(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft2 *cld1;
|
||||
plan_dft *cld2;
|
||||
plan_rdft *cldt;
|
||||
|
||||
/* IDFT local dimensions */
|
||||
cld2 = (plan_dft *) ego->cld2;
|
||||
if (ego->preserve_input) {
|
||||
cld2->apply(ego->cld2, I+1, I, O+1, O);
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
cld2->apply(ego->cld2, I+1, I, I+1, I);
|
||||
|
||||
/* global transpose */
|
||||
cldt = (plan_rdft *) ego->cldt;
|
||||
cldt->apply(ego->cldt, I, O);
|
||||
|
||||
/* RDFT2 final local dimension */
|
||||
cld1 = (plan_rdft2 *) ego->cld1;
|
||||
cld1->apply(ego->cld1, O, O+ego->vn, O, O+1);
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
|
||||
return (1
|
||||
&& p->sz->rnk > 1
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
&& ((p->flags == TRANSPOSED_OUT && p->kind == R2HC
|
||||
&& XM(is_local_after)(1, p->sz, IB)
|
||||
&& XM(is_local_after)(2, p->sz, OB)
|
||||
&& XM(num_blocks)(p->sz->dims[0].n,
|
||||
p->sz->dims[0].b[OB]) == 1)
|
||||
||
|
||||
(p->flags == TRANSPOSED_IN && p->kind == HC2R
|
||||
&& XM(is_local_after)(1, p->sz, OB)
|
||||
&& XM(is_local_after)(2, p->sz, IB)
|
||||
&& XM(num_blocks)(p->sz->dims[0].n,
|
||||
p->sz->dims[0].b[IB]) == 1))
|
||||
&& (!NO_SLOWP(plnr) /* slow if rdft2-serial is applicable */
|
||||
|| !XM(rdft2_serial_applicable)(p))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cldt, wakefulness);
|
||||
X(plan_awake)(ego->cld2, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld2);
|
||||
X(plan_destroy_internal)(ego->cldt);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-rdft2-rank-geq2-transposed%s%(%p%)%(%p%)%(%p%))",
|
||||
ego->preserve_input==2 ?"/p":"",
|
||||
ego->cld1, ego->cldt, ego->cld2);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_rdft2 *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cldt = 0, *cld2 = 0;
|
||||
R *r0, *r1, *cr, *ci, *ri, *ii, *ro, *io, *I, *O;
|
||||
tensor *sz;
|
||||
int i, my_pe, n_pes;
|
||||
INT nrest, n1, b1;
|
||||
static const plan_adt padt = {
|
||||
XM(rdft2_solve), awake, print, destroy
|
||||
};
|
||||
block_kind k1, k2;
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_rdft2 *) p_;
|
||||
|
||||
I = p->I; O = p->O;
|
||||
if (p->kind == R2HC) {
|
||||
k1 = IB; k2 = OB;
|
||||
r1 = (r0 = I) + p->vn;
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
|
||||
ci = (cr = O) + 1;
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
ci = (cr = I) + 1;
|
||||
io = ii = (ro = ri = O) + 1;
|
||||
}
|
||||
else {
|
||||
k1 = OB; k2 = IB;
|
||||
r1 = (r0 = O) + p->vn;
|
||||
ci = (cr = O) + 1;
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
|
||||
ri = (ii = I) + 1;
|
||||
ro = (io = O) + 1;
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
ro = ri = (io = ii = I) + 1;
|
||||
}
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
|
||||
i = p->sz->rnk - 2; A(i >= 0);
|
||||
sz->dims[i].n = p->sz->dims[i+1].n / 2 + 1;
|
||||
sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
|
||||
for (--i; i >= 0; --i) {
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
|
||||
}
|
||||
nrest = 1; for (i = 1; i < sz->rnk; ++i) nrest *= sz->dims[i].n;
|
||||
{
|
||||
INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
|
||||
INT is = sz->dims[0].n * sz->dims[0].is;
|
||||
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[k1], my_pe);
|
||||
sz->dims[p->sz->rnk - 2].n = p->sz->dims[p->sz->rnk - 1].n;
|
||||
cld1 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft2_d)(sz,
|
||||
X(mktensor_2d)(b, is, is,
|
||||
p->vn,ivs,ovs),
|
||||
r0, r1, cr, ci, p->kind));
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
nrest *= p->vn;
|
||||
n1 = p->sz->dims[1].n;
|
||||
b1 = p->sz->dims[1].b[k2];
|
||||
if (p->sz->rnk == 2) { /* n1 dimension is cut in ~half */
|
||||
n1 = n1 / 2 + 1;
|
||||
b1 = b1 == p->sz->dims[1].n ? n1 : b1;
|
||||
}
|
||||
|
||||
if (p->kind == R2HC)
|
||||
cldt = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
p->sz->dims[0].n, n1, nrest * 2,
|
||||
I, O,
|
||||
p->sz->dims[0].b[IB], b1,
|
||||
p->comm, 0));
|
||||
else
|
||||
cldt = X(mkplan_d)(plnr,
|
||||
XM(mkproblem_transpose)(
|
||||
n1, p->sz->dims[0].n, nrest * 2,
|
||||
I, O,
|
||||
b1, p->sz->dims[0].b[OB],
|
||||
p->comm, 0));
|
||||
if (XM(any_true)(!cldt, p->comm)) goto nada;
|
||||
|
||||
{
|
||||
INT is = p->sz->dims[0].n * nrest * 2;
|
||||
INT b = XM(block)(n1, b1, my_pe);
|
||||
cld2 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_dft_d)(X(mktensor_1d)(
|
||||
p->sz->dims[0].n,
|
||||
nrest * 2, nrest * 2),
|
||||
X(mktensor_2d)(b, is, is,
|
||||
nrest, 2, 2),
|
||||
ri, ii, ro, io));
|
||||
if (XM(any_true)(!cld2, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
|
||||
pln->cld1 = cld1;
|
||||
pln->cldt = cldt;
|
||||
pln->cld2 = cld2;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
pln->vn = p->vn;
|
||||
|
||||
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
|
||||
X(ops_add2)(&cldt->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cldt);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(rdft2_rank_geq2_transposed_register)(planner *p)
|
||||
{
|
||||
int preserve_input;
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input));
|
||||
}
|
||||
215
fftw-3.3.10/mpi/rdft2-rank-geq2.c
Normal file
215
fftw-3.3.10/mpi/rdft2-rank-geq2.c
Normal file
@@ -0,0 +1,215 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Complex RDFT2s of rank >= 2, for the case where we are distributed
|
||||
across the first dimension only, and the output is not transposed. */
|
||||
|
||||
#include "mpi-dft.h"
|
||||
#include "mpi-rdft2.h"
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_rdft2 super;
|
||||
|
||||
plan *cld1, *cld2;
|
||||
INT vn;
|
||||
int preserve_input;
|
||||
} P;
|
||||
|
||||
static void apply_r2c(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft2 *cld1;
|
||||
plan_rdft *cld2;
|
||||
|
||||
/* RDFT2 local dimensions */
|
||||
cld1 = (plan_rdft2 *) ego->cld1;
|
||||
if (ego->preserve_input) {
|
||||
cld1->apply(ego->cld1, I, I+ego->vn, O, O+1);
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
cld1->apply(ego->cld1, I, I+ego->vn, I, I+1);
|
||||
|
||||
/* DFT non-local dimension (via dft-rank1-bigvec, usually): */
|
||||
cld2 = (plan_rdft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, I, O);
|
||||
}
|
||||
|
||||
static void apply_c2r(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft2 *cld1;
|
||||
plan_rdft *cld2;
|
||||
|
||||
/* DFT non-local dimension (via dft-rank1-bigvec, usually): */
|
||||
cld2 = (plan_rdft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, I, O);
|
||||
|
||||
/* RDFT2 local dimensions */
|
||||
cld1 = (plan_rdft2 *) ego->cld1;
|
||||
cld1->apply(ego->cld1, O, O+ego->vn, O, O+1);
|
||||
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
|
||||
return (1
|
||||
&& p->sz->rnk > 1
|
||||
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O
|
||||
&& p->kind == R2HC))
|
||||
&& XM(is_local_after)(1, p->sz, IB)
|
||||
&& XM(is_local_after)(1, p->sz, OB)
|
||||
&& (!NO_SLOWP(plnr) /* slow if rdft2-serial is applicable */
|
||||
|| !XM(rdft2_serial_applicable)(p))
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cld2, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld2);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-rdft2-rank-geq2%s%(%p%)%(%p%))",
|
||||
ego->preserve_input==2 ?"/p":"", ego->cld1, ego->cld2);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_rdft2 *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cld2 = 0;
|
||||
R *r0, *r1, *cr, *ci, *I, *O;
|
||||
tensor *sz;
|
||||
dtensor *sz2;
|
||||
int i, my_pe, n_pes;
|
||||
INT nrest;
|
||||
static const plan_adt padt = {
|
||||
XM(rdft2_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_rdft2 *) p_;
|
||||
|
||||
I = p->I; O = p->O;
|
||||
if (p->kind == R2HC) {
|
||||
r1 = (r0 = p->I) + p->vn;
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) {
|
||||
ci = (cr = p->O) + 1;
|
||||
I = O;
|
||||
}
|
||||
else
|
||||
ci = (cr = p->I) + 1;
|
||||
}
|
||||
else {
|
||||
r1 = (r0 = p->O) + p->vn;
|
||||
ci = (cr = p->O) + 1;
|
||||
}
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
sz = X(mktensor)(p->sz->rnk - 1); /* tensor of last rnk-1 dimensions */
|
||||
i = p->sz->rnk - 2; A(i >= 0);
|
||||
sz->dims[i].is = sz->dims[i].os = 2 * p->vn;
|
||||
sz->dims[i].n = p->sz->dims[i+1].n / 2 + 1;
|
||||
for (--i; i >= 0; --i) {
|
||||
sz->dims[i].n = p->sz->dims[i+1].n;
|
||||
sz->dims[i].is = sz->dims[i].os = sz->dims[i+1].n * sz->dims[i+1].is;
|
||||
}
|
||||
nrest = X(tensor_sz)(sz);
|
||||
{
|
||||
INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
|
||||
INT is = sz->dims[0].n * sz->dims[0].is;
|
||||
INT b = XM(block)(p->sz->dims[0].n, p->sz->dims[0].b[IB], my_pe);
|
||||
sz->dims[p->sz->rnk - 2].n = p->sz->dims[p->sz->rnk - 1].n;
|
||||
cld1 = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft2_d)(sz,
|
||||
X(mktensor_2d)(b, is, is,
|
||||
p->vn,ivs,ovs),
|
||||
r0, r1, cr, ci, p->kind));
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
sz2 = XM(mkdtensor)(1); /* tensor for first (distributed) dimension */
|
||||
sz2->dims[0] = p->sz->dims[0];
|
||||
cld2 = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz2, nrest * p->vn,
|
||||
I, O, p->comm,
|
||||
p->kind == R2HC ?
|
||||
FFT_SIGN : -FFT_SIGN,
|
||||
RANK1_BIGVEC_ONLY));
|
||||
if (XM(any_true)(!cld2, p->comm)) goto nada;
|
||||
|
||||
pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
|
||||
pln->cld1 = cld1;
|
||||
pln->cld2 = cld2;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
pln->vn = p->vn;
|
||||
|
||||
X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(rdft2_rank_geq2_register)(planner *p)
|
||||
{
|
||||
int preserve_input;
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input));
|
||||
}
|
||||
144
fftw-3.3.10/mpi/rdft2-serial.c
Normal file
144
fftw-3.3.10/mpi/rdft2-serial.c
Normal file
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* "MPI" DFTs where all of the data is on one processor...just
|
||||
call through to serial API. */
|
||||
|
||||
#include "mpi-rdft2.h"
|
||||
#include "rdft/rdft.h"
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_rdft2 super;
|
||||
plan *cld;
|
||||
INT vn;
|
||||
} P;
|
||||
|
||||
static void apply_r2c(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft2 *cld;
|
||||
cld = (plan_rdft2 *) ego->cld;
|
||||
cld->apply(ego->cld, I, I+ego->vn, O, O+1);
|
||||
}
|
||||
|
||||
static void apply_c2r(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft2 *cld;
|
||||
cld = (plan_rdft2 *) ego->cld;
|
||||
cld->apply(ego->cld, O, O+ego->vn, I, I+1);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cld);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-rdft2-serial %(%p%))", ego->cld);
|
||||
}
|
||||
|
||||
int XM(rdft2_serial_applicable)(const problem_mpi_rdft2 *p)
|
||||
{
|
||||
return (1
|
||||
&& p->flags == 0 /* TRANSPOSED/SCRAMBLED_IN/OUT not supported */
|
||||
&& ((XM(is_local)(p->sz, IB) && XM(is_local)(p->sz, OB))
|
||||
|| p->vn == 0));
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
|
||||
{
|
||||
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
|
||||
P *pln;
|
||||
plan *cld;
|
||||
int my_pe;
|
||||
R *r0, *r1, *cr, *ci;
|
||||
static const plan_adt padt = {
|
||||
XM(rdft2_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
/* check whether applicable: */
|
||||
if (!XM(rdft2_serial_applicable)(p))
|
||||
return (plan *) 0;
|
||||
|
||||
if (p->kind == R2HC) {
|
||||
r1 = (r0 = p->I) + p->vn;
|
||||
ci = (cr = p->O) + 1;
|
||||
}
|
||||
else {
|
||||
r1 = (r0 = p->O) + p->vn;
|
||||
ci = (cr = p->I) + 1;
|
||||
}
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
if (my_pe == 0 && p->vn > 0) {
|
||||
INT ivs = 1 + (p->kind == HC2R), ovs = 1 + (p->kind == R2HC);
|
||||
int i, rnk = p->sz->rnk;
|
||||
tensor *sz = X(mktensor)(p->sz->rnk);
|
||||
sz->dims[rnk - 1].is = sz->dims[rnk - 1].os = 2 * p->vn;
|
||||
sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n / 2 + 1;
|
||||
for (i = rnk - 1; i > 0; --i) {
|
||||
sz->dims[i - 1].is = sz->dims[i - 1].os =
|
||||
sz->dims[i].is * sz->dims[i].n;
|
||||
sz->dims[i - 1].n = p->sz->dims[i - 1].n;
|
||||
}
|
||||
sz->dims[rnk - 1].n = p->sz->dims[rnk - 1].n;
|
||||
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft2_d)(sz,
|
||||
X(mktensor_1d)(p->vn,ivs,ovs),
|
||||
r0, r1, cr, ci, p->kind));
|
||||
}
|
||||
else { /* idle process: make nop plan */
|
||||
cld = X(mkplan_d)(plnr,
|
||||
X(mkproblem_rdft2_d)(X(mktensor_0d)(),
|
||||
X(mktensor_1d)(0,0,0),
|
||||
cr, ci, cr, ci, HC2R));
|
||||
}
|
||||
if (XM(any_true)(!cld, p->comm)) return (plan *) 0;
|
||||
|
||||
pln = MKPLAN_MPI_RDFT2(P, &padt, p->kind == R2HC ? apply_r2c : apply_c2r);
|
||||
pln->cld = cld;
|
||||
pln->vn = p->vn;
|
||||
X(ops_cpy)(&cld->ops, &pln->super.super.ops);
|
||||
return &(pln->super.super);
|
||||
}
|
||||
|
||||
static solver *mksolver(void)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_RDFT2, mkplan, 0 };
|
||||
return MKSOLVER(solver, &sadt);
|
||||
}
|
||||
|
||||
void XM(rdft2_serial_register)(planner *p)
|
||||
{
|
||||
REGISTER_SOLVER(p, mksolver());
|
||||
}
|
||||
29
fftw-3.3.10/mpi/rdft2-solve.c
Normal file
29
fftw-3.3.10/mpi/rdft2-solve.c
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mpi-rdft2.h"
|
||||
|
||||
/* use the apply() operation for MPI_RDFT2 problems */
|
||||
void XM(rdft2_solve)(const plan *ego_, const problem *p_)
|
||||
{
|
||||
const plan_mpi_rdft2 *ego = (const plan_mpi_rdft2 *) ego_;
|
||||
const problem_mpi_rdft2 *p = (const problem_mpi_rdft2 *) p_;
|
||||
ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
|
||||
}
|
||||
65
fftw-3.3.10/mpi/rearrange.c
Normal file
65
fftw-3.3.10/mpi/rearrange.c
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "ifftw-mpi.h"
|
||||
|
||||
/* common functions for rearrangements of the data for the *-rank1-bigvec
|
||||
solvers */
|
||||
|
||||
static int div_mult(INT b, INT a) {
|
||||
return (a > b && a % b == 0);
|
||||
}
|
||||
static int div_mult2(INT b, INT a, INT n) {
|
||||
return (div_mult(b, a) && div_mult(n, b));
|
||||
}
|
||||
|
||||
int XM(rearrange_applicable)(rearrangement rearrange,
|
||||
ddim dim0, INT vn, int n_pes)
|
||||
{
|
||||
/* note: it is important that cases other than CONTIG be
|
||||
applicable only when the resulting transpose dimension
|
||||
is divisible by n_pes; otherwise, the allocation size
|
||||
returned by the API will be incorrect */
|
||||
return ((rearrange != DISCONTIG || div_mult(n_pes, vn))
|
||||
&& (rearrange != SQUARE_BEFORE
|
||||
|| div_mult2(dim0.b[IB], vn, n_pes))
|
||||
&& (rearrange != SQUARE_AFTER
|
||||
|| (dim0.b[IB] != dim0.b[OB]
|
||||
&& div_mult2(dim0.b[OB], vn, n_pes)))
|
||||
&& (rearrange != SQUARE_MIDDLE
|
||||
|| div_mult(dim0.n * n_pes, vn)));
|
||||
}
|
||||
|
||||
INT XM(rearrange_ny)(rearrangement rearrange, ddim dim0, INT vn, int n_pes)
|
||||
{
|
||||
switch (rearrange) {
|
||||
case CONTIG:
|
||||
return vn;
|
||||
case DISCONTIG:
|
||||
return n_pes;
|
||||
case SQUARE_BEFORE:
|
||||
return dim0.b[IB];
|
||||
case SQUARE_AFTER:
|
||||
return dim0.b[OB];
|
||||
case SQUARE_MIDDLE:
|
||||
return dim0.n * n_pes;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
552
fftw-3.3.10/mpi/testsched.c
Normal file
552
fftw-3.3.10/mpi/testsched.c
Normal file
@@ -0,0 +1,552 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/**********************************************************************/
|
||||
/* This is a modified and combined version of the sched.c and
|
||||
test_sched.c files shipped with FFTW 2, written to implement and
|
||||
test various all-to-all communications scheduling patterns.
|
||||
|
||||
It is not used in FFTW 3, but I keep it around in case we ever want
|
||||
to play with this again or to change algorithms. In particular, I
|
||||
used it to implement and test the fill1_comm_sched routine in
|
||||
transpose-pairwise.c, which allows us to create a schedule for one
|
||||
process at a time and is much more compact than the FFTW 2 code.
|
||||
|
||||
Note that the scheduling algorithm is somewhat modified from that
|
||||
of FFTW 2. Originally, I thought that one "stall" in the schedule
|
||||
was unavoidable for odd numbers of processes, since this is the
|
||||
case for the soccer-timetabling problem. However, because of the
|
||||
self-communication step, we can use the self-communication to fill
|
||||
in the stalls. (Thanks to Ralf Wildenhues for pointing this out.)
|
||||
This greatly simplifies the process re-sorting algorithm. */
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* This file contains routines to compute communications schedules for
|
||||
all-to-all communications (complete exchanges) that are performed
|
||||
in-place. (That is, the block that processor x sends to processor
|
||||
y gets replaced on processor x by a block received from processor y.)
|
||||
|
||||
A schedule, int **sched, is a two-dimensional array where
|
||||
sched[pe][i] is the processor that pe expects to exchange a message
|
||||
with on the i-th step of the exchange. sched[pe][i] == -1 for the
|
||||
i after the last exchange scheduled on pe.
|
||||
|
||||
Here, processors (pe's, for processing elements), are numbered from
|
||||
0 to npes-1.
|
||||
|
||||
There are a couple of constraints that a schedule should satisfy
|
||||
(besides the obvious one that every processor has to communicate
|
||||
with every other processor exactly once).
|
||||
|
||||
* First, and most importantly, there must be no deadlocks.
|
||||
|
||||
* Second, we would like to overlap communications as much as possible,
|
||||
so that all exchanges occur in parallel. It turns out that perfect
|
||||
overlap is possible for all number of processes (npes).
|
||||
|
||||
It turns out that this scheduling problem is actually well-studied,
|
||||
and good solutions are known. The problem is known as a
|
||||
"time-tabling" problem, and is specifically the problem of
|
||||
scheduling a sports competition (where n teams must compete exactly
|
||||
once with every other team). The problem is discussed and
|
||||
algorithms are presented in:
|
||||
|
||||
[1] J. A. M. Schreuder, "Constructing Timetables for Sport
|
||||
Competitions," Mathematical Programming Study 13, pp. 58-67 (1980).
|
||||
|
||||
[2] A. Schaerf, "Scheduling Sport Tournaments using Constraint
|
||||
Logic Programming," Proc. of 12th Europ. Conf. on
|
||||
Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996).
|
||||
http://hermes.dis.uniromal.it/~aschaerf/publications.html
|
||||
|
||||
(These people actually impose a lot of additional constraints that
|
||||
we don't care about, so they are solving harder problems. [1] gives
|
||||
a simple enough algorithm for our purposes, though.)
|
||||
|
||||
In the timetabling problem, N teams can all play one another in N-1
|
||||
steps if N is even, and N steps if N is odd. Here, however,
|
||||
there is a "self-communication" step (a team must also "play itself")
|
||||
and so we can always make an optimal N-step schedule regardless of N.
|
||||
|
||||
However, we have to do more: for a particular processor, the
|
||||
communications schedule must be sorted in ascending or descending
|
||||
order of processor index. (This is necessary so that the data
|
||||
coming in for the transpose does not overwrite data that will be
|
||||
sent later; for that processor the incoming and outgoing blocks are
|
||||
of different non-zero sizes.) Fortunately, because the schedule
|
||||
is stall free, each parallel step of the schedule is independent
|
||||
of every other step, and we can reorder the steps arbitrarily
|
||||
to achieve any desired order on a particular process.
|
||||
*/
|
||||
|
||||
void free_comm_schedule(int **sched, int npes)
|
||||
{
|
||||
if (sched) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < npes; ++i)
|
||||
free(sched[i]);
|
||||
free(sched);
|
||||
}
|
||||
}
|
||||
|
||||
void empty_comm_schedule(int **sched, int npes)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < npes; ++i)
|
||||
sched[i][0] = -1;
|
||||
}
|
||||
|
||||
extern void fill_comm_schedule(int **sched, int npes);
|
||||
|
||||
/* Create a new communications schedule for a given number of processors.
|
||||
The schedule is initialized to a deadlock-free, maximum overlap
|
||||
schedule. Returns NULL on an error (may print a message to
|
||||
stderr if there is a program bug detected). */
|
||||
int **make_comm_schedule(int npes)
|
||||
{
|
||||
int **sched;
|
||||
int i;
|
||||
|
||||
sched = (int **) malloc(sizeof(int *) * npes);
|
||||
if (!sched)
|
||||
return NULL;
|
||||
|
||||
for (i = 0; i < npes; ++i)
|
||||
sched[i] = NULL;
|
||||
|
||||
for (i = 0; i < npes; ++i) {
|
||||
sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1));
|
||||
if (!sched[i]) {
|
||||
free_comm_schedule(sched,npes);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
empty_comm_schedule(sched,npes);
|
||||
fill_comm_schedule(sched,npes);
|
||||
|
||||
if (!check_comm_schedule(sched,npes)) {
|
||||
free_comm_schedule(sched,npes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return sched;
|
||||
}
|
||||
|
||||
static void add_dest_to_comm_schedule(int **sched, int pe, int dest)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; sched[pe][i] != -1; ++i)
|
||||
;
|
||||
|
||||
sched[pe][i] = dest;
|
||||
sched[pe][i+1] = -1;
|
||||
}
|
||||
|
||||
static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2)
|
||||
{
|
||||
add_dest_to_comm_schedule(sched, pe1, pe2);
|
||||
if (pe1 != pe2)
|
||||
add_dest_to_comm_schedule(sched, pe2, pe1);
|
||||
}
|
||||
|
||||
/* Simplification of algorithm presented in [1] (we have fewer
|
||||
constraints). Produces a perfect schedule (npes steps). */
|
||||
|
||||
void fill_comm_schedule(int **sched, int npes)
|
||||
{
|
||||
int pe, i, n;
|
||||
|
||||
if (npes % 2 == 0) {
|
||||
n = npes;
|
||||
for (pe = 0; pe < npes; ++pe)
|
||||
add_pair_to_comm_schedule(sched,pe,pe);
|
||||
}
|
||||
else
|
||||
n = npes + 1;
|
||||
|
||||
for (pe = 0; pe < n - 1; ++pe) {
|
||||
add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe);
|
||||
|
||||
for (i = 1; i < n/2; ++i) {
|
||||
int pe_a, pe_b;
|
||||
|
||||
pe_a = pe - i;
|
||||
if (pe_a < 0)
|
||||
pe_a += n - 1;
|
||||
|
||||
pe_b = (pe + i) % (n - 1);
|
||||
|
||||
add_pair_to_comm_schedule(sched,pe_a,pe_b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* given an array sched[npes], fills it with the communications
|
||||
schedule for process pe. */
|
||||
void fill1_comm_sched(int *sched, int which_pe, int npes)
|
||||
{
|
||||
int pe, i, n, s = 0;
|
||||
if (npes % 2 == 0) {
|
||||
n = npes;
|
||||
sched[s++] = which_pe;
|
||||
}
|
||||
else
|
||||
n = npes + 1;
|
||||
for (pe = 0; pe < n - 1; ++pe) {
|
||||
if (npes % 2 == 0) {
|
||||
if (pe == which_pe) sched[s++] = npes - 1;
|
||||
else if (npes - 1 == which_pe) sched[s++] = pe;
|
||||
}
|
||||
else if (pe == which_pe) sched[s++] = pe;
|
||||
|
||||
if (pe != which_pe && which_pe < n - 1) {
|
||||
i = (pe - which_pe + (n - 1)) % (n - 1);
|
||||
if (i < n/2)
|
||||
sched[s++] = (pe + i) % (n - 1);
|
||||
|
||||
i = (which_pe - pe + (n - 1)) % (n - 1);
|
||||
if (i < n/2)
|
||||
sched[s++] = (pe - i + (n - 1)) % (n - 1);
|
||||
}
|
||||
}
|
||||
if (s != npes) {
|
||||
fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n",
|
||||
s, which_pe, npes);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
/* sort the communication schedule sched for npes so that the schedule
|
||||
on process sortpe is ascending or descending (!ascending). */
|
||||
static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
|
||||
{
|
||||
int *sortsched, i;
|
||||
sortsched = (int *) malloc(npes * sizeof(int) * 2);
|
||||
fill1_comm_sched(sortsched, sortpe, npes);
|
||||
if (ascending)
|
||||
for (i = 0; i < npes; ++i)
|
||||
sortsched[npes + sortsched[i]] = sched[i];
|
||||
else
|
||||
for (i = 0; i < npes; ++i)
|
||||
sortsched[2*npes - 1 - sortsched[i]] = sched[i];
|
||||
for (i = 0; i < npes; ++i)
|
||||
sched[i] = sortsched[npes + i];
|
||||
free(sortsched);
|
||||
}
|
||||
|
||||
/* Below, we have various checks in case of bugs: */
|
||||
|
||||
/* check for deadlocks by simulating the schedule and looking for
|
||||
cycles in the dependency list; returns 0 if there are deadlocks
|
||||
(or other errors) */
|
||||
static int check_schedule_deadlock(int **sched, int npes)
|
||||
{
|
||||
int *step, *depend, *visited, pe, pe2, period, done = 0;
|
||||
int counter = 0;
|
||||
|
||||
/* step[pe] is the step in the schedule that a given pe is on */
|
||||
step = (int *) malloc(sizeof(int) * npes);
|
||||
|
||||
/* depend[pe] is the pe' that pe is currently waiting for a message
|
||||
from (-1 if none) */
|
||||
depend = (int *) malloc(sizeof(int) * npes);
|
||||
|
||||
/* visited[pe] tells whether we have visited the current pe already
|
||||
when we are looking for cycles. */
|
||||
visited = (int *) malloc(sizeof(int) * npes);
|
||||
|
||||
if (!step || !depend || !visited) {
|
||||
free(step); free(depend); free(visited);
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (pe = 0; pe < npes; ++pe)
|
||||
step[pe] = 0;
|
||||
|
||||
while (!done) {
|
||||
++counter;
|
||||
|
||||
for (pe = 0; pe < npes; ++pe)
|
||||
depend[pe] = sched[pe][step[pe]];
|
||||
|
||||
/* now look for cycles in the dependencies with period > 2: */
|
||||
for (pe = 0; pe < npes; ++pe)
|
||||
if (depend[pe] != -1) {
|
||||
for (pe2 = 0; pe2 < npes; ++pe2)
|
||||
visited[pe2] = 0;
|
||||
|
||||
period = 0;
|
||||
pe2 = pe;
|
||||
do {
|
||||
visited[pe2] = period + 1;
|
||||
pe2 = depend[pe2];
|
||||
period++;
|
||||
} while (pe2 != -1 && !visited[pe2]);
|
||||
|
||||
if (pe2 == -1) {
|
||||
fprintf(stderr,
|
||||
"BUG: unterminated cycle in schedule!\n");
|
||||
free(step); free(depend);
|
||||
free(visited);
|
||||
return 0;
|
||||
}
|
||||
if (period - (visited[pe2] - 1) > 2) {
|
||||
fprintf(stderr,"BUG: deadlock in schedule!\n");
|
||||
free(step); free(depend);
|
||||
free(visited);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (pe2 == pe)
|
||||
step[pe]++;
|
||||
}
|
||||
|
||||
done = 1;
|
||||
for (pe = 0; pe < npes; ++pe)
|
||||
if (sched[pe][step[pe]] != -1) {
|
||||
done = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
free(step); free(depend); free(visited);
|
||||
return (counter > 0 ? counter : 1);
|
||||
}
|
||||
|
||||
/* sanity checks; prints message and returns 0 on failure.
|
||||
undocumented feature: the return value on success is actually the
|
||||
number of steps required for the schedule to complete, counting
|
||||
stalls. */
|
||||
int check_comm_schedule(int **sched, int npes)
|
||||
{
|
||||
int pe, i, comm_pe;
|
||||
|
||||
for (pe = 0; pe < npes; ++pe) {
|
||||
for (comm_pe = 0; comm_pe < npes; ++comm_pe) {
|
||||
for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i)
|
||||
;
|
||||
if (sched[pe][i] == -1) {
|
||||
fprintf(stderr,"BUG: schedule never sends message from "
|
||||
"%d to %d.\n",pe,comm_pe);
|
||||
return 0; /* never send message to comm_pe */
|
||||
}
|
||||
}
|
||||
for (i = 0; sched[pe][i] != -1; ++i)
|
||||
;
|
||||
if (i != npes) {
|
||||
fprintf(stderr,"BUG: schedule sends too many messages from "
|
||||
"%d\n",pe);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return check_schedule_deadlock(sched,npes);
|
||||
}
|
||||
|
||||
/* invert the order of all the schedules; this has no effect on
|
||||
its required properties. */
|
||||
void invert_comm_schedule(int **sched, int npes)
|
||||
{
|
||||
int pe, i;
|
||||
|
||||
for (pe = 0; pe < npes; ++pe)
|
||||
for (i = 0; i < npes/2; ++i) {
|
||||
int dummy = sched[pe][i];
|
||||
sched[pe][i] = sched[pe][npes-1-i];
|
||||
sched[pe][npes-1-i] = dummy;
|
||||
}
|
||||
}
|
||||
|
||||
/* Sort the schedule for sort_pe in ascending order of processor
|
||||
index. Unfortunately, for odd npes (when schedule has a stall
|
||||
to begin with) this will introduce an extra stall due to
|
||||
the motion of the self-communication past a stall. We could
|
||||
fix this if it were really important. Actually, we don't
|
||||
get an extra stall when sort_pe == 0 or npes-1, which is sufficient
|
||||
for our purposes. */
|
||||
void sort_comm_schedule(int **sched, int npes, int sort_pe)
|
||||
{
|
||||
int i,j,pe;
|
||||
|
||||
/* Note that we can do this sort in O(npes) swaps because we know
|
||||
that the numbers we are sorting are just 0...npes-1. But we'll
|
||||
just do a bubble sort for simplicity here. */
|
||||
|
||||
for (i = 0; i < npes - 1; ++i)
|
||||
for (j = i + 1; j < npes; ++j)
|
||||
if (sched[sort_pe][i] > sched[sort_pe][j]) {
|
||||
for (pe = 0; pe < npes; ++pe) {
|
||||
int s = sched[pe][i];
|
||||
sched[pe][i] = sched[pe][j];
|
||||
sched[pe][j] = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* print the schedule (for debugging purposes) */
|
||||
void print_comm_schedule(int **sched, int npes)
|
||||
{
|
||||
int pe, i, width;
|
||||
|
||||
if (npes < 10)
|
||||
width = 1;
|
||||
else if (npes < 100)
|
||||
width = 2;
|
||||
else
|
||||
width = 3;
|
||||
|
||||
for (pe = 0; pe < npes; ++pe) {
|
||||
printf("pe %*d schedule:", width, pe);
|
||||
for (i = 0; sched[pe][i] != -1; ++i)
|
||||
printf(" %*d",width,sched[pe][i]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int **sched;
|
||||
int npes = -1, sortpe = -1, steps, i;
|
||||
|
||||
if (argc >= 2) {
|
||||
npes = atoi(argv[1]);
|
||||
if (npes <= 0) {
|
||||
fprintf(stderr,"npes must be positive!");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (argc >= 3) {
|
||||
sortpe = atoi(argv[2]);
|
||||
if (sortpe < 0 || sortpe >= npes) {
|
||||
fprintf(stderr,"sortpe must be between 0 and npes-1.\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (npes != -1) {
|
||||
printf("Computing schedule for npes = %d:\n",npes);
|
||||
sched = make_comm_schedule(npes);
|
||||
if (!sched) {
|
||||
fprintf(stderr,"Out of memory!");
|
||||
return 6;
|
||||
}
|
||||
|
||||
if (steps = check_comm_schedule(sched,npes))
|
||||
printf("schedule OK (takes %d steps to complete).\n", steps);
|
||||
else
|
||||
printf("schedule not OK.\n");
|
||||
|
||||
print_comm_schedule(sched, npes);
|
||||
|
||||
if (sortpe != -1) {
|
||||
printf("\nRe-creating schedule for pe = %d...\n", sortpe);
|
||||
int *sched1 = (int*) malloc(sizeof(int) * npes);
|
||||
for (i = 0; i < npes; ++i) sched1[i] = -1;
|
||||
fill1_comm_sched(sched1, sortpe, npes);
|
||||
printf(" =");
|
||||
for (i = 0; i < npes; ++i)
|
||||
printf(" %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3),
|
||||
sched1[i]);
|
||||
printf("\n");
|
||||
|
||||
printf("\nSorting schedule for sortpe = %d...\n", sortpe);
|
||||
sort_comm_schedule(sched,npes,sortpe);
|
||||
|
||||
if (steps = check_comm_schedule(sched,npes))
|
||||
printf("schedule OK (takes %d steps to complete).\n",
|
||||
steps);
|
||||
else
|
||||
printf("schedule not OK.\n");
|
||||
|
||||
print_comm_schedule(sched, npes);
|
||||
|
||||
printf("\nInverting schedule...\n");
|
||||
invert_comm_schedule(sched,npes);
|
||||
|
||||
if (steps = check_comm_schedule(sched,npes))
|
||||
printf("schedule OK (takes %d steps to complete).\n",
|
||||
steps);
|
||||
else
|
||||
printf("schedule not OK.\n");
|
||||
|
||||
print_comm_schedule(sched, npes);
|
||||
|
||||
free_comm_schedule(sched,npes);
|
||||
|
||||
free(sched1);
|
||||
}
|
||||
}
|
||||
else {
|
||||
printf("Doing infinite tests...\n");
|
||||
for (npes = 1; ; ++npes) {
|
||||
int *sched1 = (int*) malloc(sizeof(int) * npes);
|
||||
printf("npes = %d...",npes);
|
||||
sched = make_comm_schedule(npes);
|
||||
if (!sched) {
|
||||
fprintf(stderr,"Out of memory!\n");
|
||||
return 5;
|
||||
}
|
||||
for (sortpe = 0; sortpe < npes; ++sortpe) {
|
||||
empty_comm_schedule(sched,npes);
|
||||
fill_comm_schedule(sched,npes);
|
||||
if (!check_comm_schedule(sched,npes)) {
|
||||
fprintf(stderr,
|
||||
"\n -- fill error for sortpe = %d!\n",sortpe);
|
||||
return 2;
|
||||
}
|
||||
|
||||
for (i = 0; i < npes; ++i) sched1[i] = -1;
|
||||
fill1_comm_sched(sched1, sortpe, npes);
|
||||
for (i = 0; i < npes; ++i)
|
||||
if (sched1[i] != sched[sortpe][i])
|
||||
fprintf(stderr,
|
||||
"\n -- fill1 error for pe = %d!\n",
|
||||
sortpe);
|
||||
|
||||
sort_comm_schedule(sched,npes,sortpe);
|
||||
if (!check_comm_schedule(sched,npes)) {
|
||||
fprintf(stderr,
|
||||
"\n -- sort error for sortpe = %d!\n",sortpe);
|
||||
return 3;
|
||||
}
|
||||
invert_comm_schedule(sched,npes);
|
||||
if (!check_comm_schedule(sched,npes)) {
|
||||
fprintf(stderr,
|
||||
"\n -- invert error for sortpe = %d!\n",
|
||||
sortpe);
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
free_comm_schedule(sched,npes);
|
||||
printf("OK\n");
|
||||
if (npes % 50 == 0)
|
||||
printf("(...Hit Ctrl-C to stop...)\n");
|
||||
free(sched1);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
265
fftw-3.3.10/mpi/transpose-alltoall.c
Normal file
265
fftw-3.3.10/mpi/transpose-alltoall.c
Normal file
@@ -0,0 +1,265 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* plans for distributed out-of-place transpose using MPI_Alltoall,
|
||||
and which destroy the input array (unless TRANSPOSED_IN is used) */
|
||||
|
||||
#include "mpi-transpose.h"
|
||||
#include <string.h>
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int copy_transposed_in; /* whether to copy the input for TRANSPOSED_IN,
|
||||
which makes the final transpose out-of-place
|
||||
but costs an extra copy and requires us
|
||||
to destroy the input */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_transpose super;
|
||||
|
||||
plan *cld1, *cld2, *cld2rest, *cld3;
|
||||
|
||||
MPI_Comm comm;
|
||||
int *send_block_sizes, *send_block_offsets;
|
||||
int *recv_block_sizes, *recv_block_offsets;
|
||||
|
||||
INT rest_Ioff, rest_Ooff;
|
||||
|
||||
int equal_blocks;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld1, *cld2, *cld2rest, *cld3;
|
||||
|
||||
/* transpose locally to get contiguous chunks */
|
||||
cld1 = (plan_rdft *) ego->cld1;
|
||||
if (cld1) {
|
||||
cld1->apply(ego->cld1, I, O);
|
||||
|
||||
/* transpose chunks globally */
|
||||
if (ego->equal_blocks)
|
||||
MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE,
|
||||
I, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
|
||||
ego->comm);
|
||||
else
|
||||
MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets,
|
||||
FFTW_MPI_TYPE,
|
||||
I, ego->recv_block_sizes, ego->recv_block_offsets,
|
||||
FFTW_MPI_TYPE,
|
||||
ego->comm);
|
||||
}
|
||||
else { /* TRANSPOSED_IN, no need to destroy input */
|
||||
/* transpose chunks globally */
|
||||
if (ego->equal_blocks)
|
||||
MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE,
|
||||
O, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
|
||||
ego->comm);
|
||||
else
|
||||
MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets,
|
||||
FFTW_MPI_TYPE,
|
||||
O, ego->recv_block_sizes, ego->recv_block_offsets,
|
||||
FFTW_MPI_TYPE,
|
||||
ego->comm);
|
||||
I = O; /* final transpose (if any) is in-place */
|
||||
}
|
||||
|
||||
/* transpose locally, again, to get ordinary row-major */
|
||||
cld2 = (plan_rdft *) ego->cld2;
|
||||
if (cld2) {
|
||||
cld2->apply(ego->cld2, I, O);
|
||||
cld2rest = (plan_rdft *) ego->cld2rest;
|
||||
if (cld2rest) { /* leftover from unequal block sizes */
|
||||
cld2rest->apply(ego->cld2rest,
|
||||
I + ego->rest_Ioff, O + ego->rest_Ooff);
|
||||
cld3 = (plan_rdft *) ego->cld3;
|
||||
if (cld3)
|
||||
cld3->apply(ego->cld3, O, O);
|
||||
/* else TRANSPOSED_OUT is true and user wants O transposed */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
|
||||
return (1
|
||||
&& p->I != p->O
|
||||
&& (!NO_DESTROY_INPUTP(plnr) ||
|
||||
((p->flags & TRANSPOSED_IN) && !ego->copy_transposed_in))
|
||||
&& ((p->flags & TRANSPOSED_IN) || !ego->copy_transposed_in)
|
||||
&& ONLY_TRANSPOSEDP(p->flags)
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cld2, wakefulness);
|
||||
X(plan_awake)(ego->cld2rest, wakefulness);
|
||||
X(plan_awake)(ego->cld3, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(ifree0)(ego->send_block_sizes);
|
||||
MPI_Comm_free(&ego->comm);
|
||||
X(plan_destroy_internal)(ego->cld3);
|
||||
X(plan_destroy_internal)(ego->cld2rest);
|
||||
X(plan_destroy_internal)(ego->cld2);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-transpose-alltoall%s%(%p%)%(%p%)%(%p%)%(%p%))",
|
||||
ego->equal_blocks ? "/e" : "",
|
||||
ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_transpose *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
|
||||
INT b, bt, vn, rest_Ioff, rest_Ooff;
|
||||
R *I;
|
||||
int *sbs, *sbo, *rbs, *rbo;
|
||||
int pe, my_pe, n_pes;
|
||||
int equal_blocks = 1;
|
||||
static const plan_adt padt = {
|
||||
XM(transpose_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_transpose *) p_;
|
||||
vn = p->vn;
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
b = XM(block)(p->nx, p->block, my_pe);
|
||||
|
||||
if (p->flags & TRANSPOSED_IN) { /* I is already transposed */
|
||||
if (ego->copy_transposed_in) {
|
||||
cld1 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_1d)
|
||||
(b * p->ny * vn, 1, 1),
|
||||
I = p->I, p->O),
|
||||
0, 0, NO_SLOW);
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
else
|
||||
I = p->O; /* final transpose is in-place */
|
||||
}
|
||||
else { /* transpose b x ny x vn -> ny x b x vn */
|
||||
cld1 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_3d)
|
||||
(b, p->ny * vn, vn,
|
||||
p->ny, vn, b * vn,
|
||||
vn, 1, 1),
|
||||
I = p->I, p->O),
|
||||
0, 0, NO_SLOW);
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
|
||||
if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, p->O, my_pe,
|
||||
&cld2, &cld2rest, &cld3,
|
||||
&rest_Ioff, &rest_Ooff),
|
||||
p->comm)) goto nada;
|
||||
|
||||
pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
|
||||
|
||||
pln->cld1 = cld1;
|
||||
pln->cld2 = cld2;
|
||||
pln->cld2rest = cld2rest;
|
||||
pln->rest_Ioff = rest_Ioff;
|
||||
pln->rest_Ooff = rest_Ooff;
|
||||
pln->cld3 = cld3;
|
||||
|
||||
MPI_Comm_dup(p->comm, &pln->comm);
|
||||
|
||||
/* Compute sizes/offsets of blocks to send for all-to-all command. */
|
||||
sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS);
|
||||
sbo = sbs + n_pes;
|
||||
rbs = sbo + n_pes;
|
||||
rbo = rbs + n_pes;
|
||||
b = XM(block)(p->nx, p->block, my_pe);
|
||||
bt = XM(block)(p->ny, p->tblock, my_pe);
|
||||
for (pe = 0; pe < n_pes; ++pe) {
|
||||
INT db, dbt; /* destination block sizes */
|
||||
db = XM(block)(p->nx, p->block, pe);
|
||||
dbt = XM(block)(p->ny, p->tblock, pe);
|
||||
if (db != p->block || dbt != p->tblock)
|
||||
equal_blocks = 0;
|
||||
|
||||
/* MPI requires type "int" here; apparently it
|
||||
has no 64-bit API? Grrr. */
|
||||
sbs[pe] = (int) (b * dbt * vn);
|
||||
sbo[pe] = (int) (pe * (b * p->tblock) * vn);
|
||||
rbs[pe] = (int) (db * bt * vn);
|
||||
rbo[pe] = (int) (pe * (p->block * bt) * vn);
|
||||
}
|
||||
pln->send_block_sizes = sbs;
|
||||
pln->send_block_offsets = sbo;
|
||||
pln->recv_block_sizes = rbs;
|
||||
pln->recv_block_offsets = rbo;
|
||||
pln->equal_blocks = equal_blocks;
|
||||
|
||||
X(ops_zero)(&pln->super.super.ops);
|
||||
if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
|
||||
if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
|
||||
if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
|
||||
if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
|
||||
/* FIXME: should MPI operations be counted in "other" somehow? */
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld3);
|
||||
X(plan_destroy_internal)(cld2rest);
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int copy_transposed_in)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->copy_transposed_in = copy_transposed_in;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(transpose_alltoall_register)(planner *p)
|
||||
{
|
||||
int cti;
|
||||
for (cti = 0; cti <= 1; ++cti)
|
||||
REGISTER_SOLVER(p, mksolver(cti));
|
||||
}
|
||||
487
fftw-3.3.10/mpi/transpose-pairwise.c
Normal file
487
fftw-3.3.10/mpi/transpose-pairwise.c
Normal file
@@ -0,0 +1,487 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Distributed transposes using a sequence of carefully scheduled
|
||||
pairwise exchanges. This has the advantage that it can be done
|
||||
in-place, or out-of-place while preserving the input, using buffer
|
||||
space proportional to the local size divided by the number of
|
||||
processes (i.e. to the total array size divided by the number of
|
||||
processes squared). */
|
||||
|
||||
#include "mpi-transpose.h"
|
||||
#include <string.h>
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_transpose super;
|
||||
|
||||
plan *cld1, *cld2, *cld2rest, *cld3;
|
||||
INT rest_Ioff, rest_Ooff;
|
||||
|
||||
int n_pes, my_pe, *sched;
|
||||
INT *send_block_sizes, *send_block_offsets;
|
||||
INT *recv_block_sizes, *recv_block_offsets;
|
||||
MPI_Comm comm;
|
||||
int preserve_input;
|
||||
} P;
|
||||
|
||||
static void transpose_chunks(int *sched, int n_pes, int my_pe,
|
||||
INT *sbs, INT *sbo, INT *rbs, INT *rbo,
|
||||
MPI_Comm comm,
|
||||
R *I, R *O)
|
||||
{
|
||||
if (sched) {
|
||||
int i;
|
||||
MPI_Status status;
|
||||
|
||||
/* TODO: explore non-synchronous send/recv? */
|
||||
|
||||
if (I == O) {
|
||||
R *buf = (R*) MALLOC(sizeof(R) * sbs[0], BUFFERS);
|
||||
|
||||
for (i = 0; i < n_pes; ++i) {
|
||||
int pe = sched[i];
|
||||
if (my_pe == pe) {
|
||||
if (rbo[pe] != sbo[pe])
|
||||
memmove(O + rbo[pe], O + sbo[pe],
|
||||
sbs[pe] * sizeof(R));
|
||||
}
|
||||
else {
|
||||
memcpy(buf, O + sbo[pe], sbs[pe] * sizeof(R));
|
||||
MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
|
||||
pe, (my_pe * n_pes + pe) & 0x7fff,
|
||||
O + rbo[pe], (int) (rbs[pe]),
|
||||
FFTW_MPI_TYPE,
|
||||
pe, (pe * n_pes + my_pe) & 0x7fff,
|
||||
comm, &status);
|
||||
}
|
||||
}
|
||||
|
||||
X(ifree)(buf);
|
||||
}
|
||||
else { /* I != O */
|
||||
for (i = 0; i < n_pes; ++i) {
|
||||
int pe = sched[i];
|
||||
if (my_pe == pe)
|
||||
memcpy(O + rbo[pe], I + sbo[pe], sbs[pe] * sizeof(R));
|
||||
else
|
||||
MPI_Sendrecv(I + sbo[pe], (int) (sbs[pe]),
|
||||
FFTW_MPI_TYPE,
|
||||
pe, (my_pe * n_pes + pe) & 0x7fff,
|
||||
O + rbo[pe], (int) (rbs[pe]),
|
||||
FFTW_MPI_TYPE,
|
||||
pe, (pe * n_pes + my_pe) & 0x7fff,
|
||||
comm, &status);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld1, *cld2, *cld2rest, *cld3;
|
||||
|
||||
/* transpose locally to get contiguous chunks */
|
||||
cld1 = (plan_rdft *) ego->cld1;
|
||||
if (cld1) {
|
||||
cld1->apply(ego->cld1, I, O);
|
||||
|
||||
if (ego->preserve_input) I = O;
|
||||
|
||||
/* transpose chunks globally */
|
||||
transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
|
||||
ego->send_block_sizes, ego->send_block_offsets,
|
||||
ego->recv_block_sizes, ego->recv_block_offsets,
|
||||
ego->comm, O, I);
|
||||
}
|
||||
else if (ego->preserve_input) {
|
||||
/* transpose chunks globally */
|
||||
transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
|
||||
ego->send_block_sizes, ego->send_block_offsets,
|
||||
ego->recv_block_sizes, ego->recv_block_offsets,
|
||||
ego->comm, I, O);
|
||||
|
||||
I = O;
|
||||
}
|
||||
else {
|
||||
/* transpose chunks globally */
|
||||
transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
|
||||
ego->send_block_sizes, ego->send_block_offsets,
|
||||
ego->recv_block_sizes, ego->recv_block_offsets,
|
||||
ego->comm, I, I);
|
||||
}
|
||||
|
||||
/* transpose locally, again, to get ordinary row-major;
|
||||
this may take two transposes if the block sizes are unequal
|
||||
(3 subplans, two of which operate on disjoint data) */
|
||||
cld2 = (plan_rdft *) ego->cld2;
|
||||
cld2->apply(ego->cld2, I, O);
|
||||
cld2rest = (plan_rdft *) ego->cld2rest;
|
||||
if (cld2rest) {
|
||||
cld2rest->apply(ego->cld2rest,
|
||||
I + ego->rest_Ioff, O + ego->rest_Ooff);
|
||||
cld3 = (plan_rdft *) ego->cld3;
|
||||
if (cld3)
|
||||
cld3->apply(ego->cld3, O, O);
|
||||
/* else TRANSPOSED_OUT is true and user wants O transposed */
|
||||
}
|
||||
}
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr)
|
||||
{
|
||||
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
|
||||
/* Note: this is *not* UGLY for out-of-place, destroy-input plans;
|
||||
the planner often prefers transpose-pairwise to transpose-alltoall,
|
||||
at least with LAM MPI on my machine. */
|
||||
return (1
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
&& ONLY_TRANSPOSEDP(p->flags));
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cld2, wakefulness);
|
||||
X(plan_awake)(ego->cld2rest, wakefulness);
|
||||
X(plan_awake)(ego->cld3, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(ifree0)(ego->sched);
|
||||
X(ifree0)(ego->send_block_sizes);
|
||||
MPI_Comm_free(&ego->comm);
|
||||
X(plan_destroy_internal)(ego->cld3);
|
||||
X(plan_destroy_internal)(ego->cld2rest);
|
||||
X(plan_destroy_internal)(ego->cld2);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-transpose-pairwise%s%(%p%)%(%p%)%(%p%)%(%p%))",
|
||||
ego->preserve_input==2 ?"/p":"",
|
||||
ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
|
||||
}
|
||||
|
||||
/* Given a process which_pe and a number of processes npes, fills
|
||||
the array sched[npes] with a sequence of processes to communicate
|
||||
with for a deadlock-free, optimum-overlap all-to-all communication.
|
||||
(All processes must call this routine to get their own schedules.)
|
||||
The schedule can be re-ordered arbitrarily as long as all processes
|
||||
apply the same permutation to their schedules.
|
||||
|
||||
The algorithm here is based upon the one described in:
|
||||
J. A. M. Schreuder, "Constructing timetables for sport
|
||||
competitions," Mathematical Programming Study 13, pp. 58-67 (1980).
|
||||
In a sport competition, you have N teams and want every team to
|
||||
play every other team in as short a time as possible (maximum overlap
|
||||
between games). This timetabling problem is therefore identical
|
||||
to that of an all-to-all communications problem. In our case, there
|
||||
is one wrinkle: as part of the schedule, the process must do
|
||||
some data transfer with itself (local data movement), analogous
|
||||
to a requirement that each team "play itself" in addition to other
|
||||
teams. With this wrinkle, it turns out that an optimal timetable
|
||||
(N parallel games) can be constructed for any N, not just for even
|
||||
N as in the original problem described by Schreuder.
|
||||
*/
|
||||
static void fill1_comm_sched(int *sched, int which_pe, int npes)
|
||||
{
|
||||
int pe, i, n, s = 0;
|
||||
A(which_pe >= 0 && which_pe < npes);
|
||||
if (npes % 2 == 0) {
|
||||
n = npes;
|
||||
sched[s++] = which_pe;
|
||||
}
|
||||
else
|
||||
n = npes + 1;
|
||||
for (pe = 0; pe < n - 1; ++pe) {
|
||||
if (npes % 2 == 0) {
|
||||
if (pe == which_pe) sched[s++] = npes - 1;
|
||||
else if (npes - 1 == which_pe) sched[s++] = pe;
|
||||
}
|
||||
else if (pe == which_pe) sched[s++] = pe;
|
||||
|
||||
if (pe != which_pe && which_pe < n - 1) {
|
||||
i = (pe - which_pe + (n - 1)) % (n - 1);
|
||||
if (i < n/2)
|
||||
sched[s++] = (pe + i) % (n - 1);
|
||||
|
||||
i = (which_pe - pe + (n - 1)) % (n - 1);
|
||||
if (i < n/2)
|
||||
sched[s++] = (pe - i + (n - 1)) % (n - 1);
|
||||
}
|
||||
}
|
||||
A(s == npes);
|
||||
}
|
||||
|
||||
/* Sort the communication schedule sched for npes so that the schedule
|
||||
on process sortpe is ascending or descending (!ascending). This is
|
||||
necessary to allow in-place transposes when the problem does not
|
||||
divide equally among the processes. In this case there is one
|
||||
process where the incoming blocks are bigger/smaller than the
|
||||
outgoing blocks and thus have to be received in
|
||||
descending/ascending order, respectively, to avoid overwriting data
|
||||
before it is sent. */
|
||||
static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
|
||||
{
|
||||
int *sortsched, i;
|
||||
sortsched = (int *) MALLOC(npes * sizeof(int) * 2, OTHER);
|
||||
fill1_comm_sched(sortsched, sortpe, npes);
|
||||
if (ascending)
|
||||
for (i = 0; i < npes; ++i)
|
||||
sortsched[npes + sortsched[i]] = sched[i];
|
||||
else
|
||||
for (i = 0; i < npes; ++i)
|
||||
sortsched[2*npes - 1 - sortsched[i]] = sched[i];
|
||||
for (i = 0; i < npes; ++i)
|
||||
sched[i] = sortsched[npes + i];
|
||||
X(ifree)(sortsched);
|
||||
}
|
||||
|
||||
/* make the plans to do the post-MPI transpositions (shared with
|
||||
transpose-alltoall) */
|
||||
int XM(mkplans_posttranspose)(const problem_mpi_transpose *p, planner *plnr,
|
||||
R *I, R *O, int my_pe,
|
||||
plan **cld2, plan **cld2rest, plan **cld3,
|
||||
INT *rest_Ioff, INT *rest_Ooff)
|
||||
{
|
||||
INT vn = p->vn;
|
||||
INT b = p->block;
|
||||
INT bt = XM(block)(p->ny, p->tblock, my_pe);
|
||||
INT nxb = p->nx / b; /* number of equal-sized blocks */
|
||||
INT nxr = p->nx - nxb * b; /* leftover rows after equal blocks */
|
||||
|
||||
*cld2 = *cld2rest = *cld3 = NULL;
|
||||
*rest_Ioff = *rest_Ooff = 0;
|
||||
|
||||
if (!(p->flags & TRANSPOSED_OUT) && (nxr == 0 || I != O)) {
|
||||
INT nx = p->nx * vn;
|
||||
b *= vn;
|
||||
*cld2 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_3d)
|
||||
(nxb, bt * b, b,
|
||||
bt, b, nx,
|
||||
b, 1, 1),
|
||||
I, O),
|
||||
0, 0, NO_SLOW);
|
||||
if (!*cld2) goto nada;
|
||||
|
||||
if (nxr > 0) {
|
||||
*rest_Ioff = nxb * bt * b;
|
||||
*rest_Ooff = nxb * b;
|
||||
b = nxr * vn;
|
||||
*cld2rest = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_2d)
|
||||
(bt, b, nx,
|
||||
b, 1, 1),
|
||||
I + *rest_Ioff,
|
||||
O + *rest_Ooff),
|
||||
0, 0, NO_SLOW);
|
||||
if (!*cld2rest) goto nada;
|
||||
}
|
||||
}
|
||||
else {
|
||||
*cld2 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(
|
||||
X(mktensor_4d)
|
||||
(nxb, bt * b * vn, bt * b * vn,
|
||||
bt, b * vn, vn,
|
||||
b, vn, bt * vn,
|
||||
vn, 1, 1),
|
||||
I, O),
|
||||
0, 0, NO_SLOW);
|
||||
if (!*cld2) goto nada;
|
||||
|
||||
*rest_Ioff = *rest_Ooff = nxb * bt * b * vn;
|
||||
*cld2rest = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(
|
||||
X(mktensor_3d)
|
||||
(bt, nxr * vn, vn,
|
||||
nxr, vn, bt * vn,
|
||||
vn, 1, 1),
|
||||
I + *rest_Ioff, O + *rest_Ooff),
|
||||
0, 0, NO_SLOW);
|
||||
if (!*cld2rest) goto nada;
|
||||
|
||||
if (!(p->flags & TRANSPOSED_OUT)) {
|
||||
*cld3 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(
|
||||
X(mktensor_3d)
|
||||
(p->nx, bt * vn, vn,
|
||||
bt, vn, p->nx * vn,
|
||||
vn, 1, 1),
|
||||
O, O),
|
||||
0, 0, NO_SLOW);
|
||||
if (!*cld3) goto nada;
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(*cld3);
|
||||
X(plan_destroy_internal)(*cld2rest);
|
||||
X(plan_destroy_internal)(*cld2);
|
||||
*cld2 = *cld2rest = *cld3 = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_transpose *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
|
||||
INT b, bt, vn, rest_Ioff, rest_Ooff;
|
||||
INT *sbs, *sbo, *rbs, *rbo;
|
||||
int pe, my_pe, n_pes, sort_pe = -1, ascending = 1;
|
||||
R *I, *O;
|
||||
static const plan_adt padt = {
|
||||
XM(transpose_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_transpose *) p_;
|
||||
vn = p->vn;
|
||||
I = p->I; O = p->O;
|
||||
|
||||
MPI_Comm_rank(p->comm, &my_pe);
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
|
||||
b = XM(block)(p->nx, p->block, my_pe);
|
||||
|
||||
if (!(p->flags & TRANSPOSED_IN)) { /* b x ny x vn -> ny x b x vn */
|
||||
cld1 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_3d)
|
||||
(b, p->ny * vn, vn,
|
||||
p->ny, vn, b * vn,
|
||||
vn, 1, 1),
|
||||
I, O),
|
||||
0, 0, NO_SLOW);
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
}
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
|
||||
|
||||
if (XM(any_true)(!XM(mkplans_posttranspose)(p, plnr, I, O, my_pe,
|
||||
&cld2, &cld2rest, &cld3,
|
||||
&rest_Ioff, &rest_Ooff),
|
||||
p->comm)) goto nada;
|
||||
|
||||
pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
|
||||
|
||||
pln->cld1 = cld1;
|
||||
pln->cld2 = cld2;
|
||||
pln->cld2rest = cld2rest;
|
||||
pln->rest_Ioff = rest_Ioff;
|
||||
pln->rest_Ooff = rest_Ooff;
|
||||
pln->cld3 = cld3;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
|
||||
MPI_Comm_dup(p->comm, &pln->comm);
|
||||
|
||||
n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
|
||||
XM(num_blocks)(p->ny, p->tblock));
|
||||
|
||||
/* Compute sizes/offsets of blocks to exchange between processors */
|
||||
sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS);
|
||||
sbo = sbs + n_pes;
|
||||
rbs = sbo + n_pes;
|
||||
rbo = rbs + n_pes;
|
||||
b = XM(block)(p->nx, p->block, my_pe);
|
||||
bt = XM(block)(p->ny, p->tblock, my_pe);
|
||||
for (pe = 0; pe < n_pes; ++pe) {
|
||||
INT db, dbt; /* destination block sizes */
|
||||
db = XM(block)(p->nx, p->block, pe);
|
||||
dbt = XM(block)(p->ny, p->tblock, pe);
|
||||
|
||||
sbs[pe] = b * dbt * vn;
|
||||
sbo[pe] = pe * (b * p->tblock) * vn;
|
||||
rbs[pe] = db * bt * vn;
|
||||
rbo[pe] = pe * (p->block * bt) * vn;
|
||||
|
||||
if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
|
||||
A(sort_pe == -1); /* only one process should need sorting */
|
||||
sort_pe = pe;
|
||||
ascending = db * p->tblock > p->block * dbt;
|
||||
}
|
||||
}
|
||||
pln->n_pes = n_pes;
|
||||
pln->my_pe = my_pe;
|
||||
pln->send_block_sizes = sbs;
|
||||
pln->send_block_offsets = sbo;
|
||||
pln->recv_block_sizes = rbs;
|
||||
pln->recv_block_offsets = rbo;
|
||||
|
||||
if (my_pe >= n_pes) {
|
||||
pln->sched = 0; /* this process is not doing anything */
|
||||
}
|
||||
else {
|
||||
pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
|
||||
fill1_comm_sched(pln->sched, my_pe, n_pes);
|
||||
if (sort_pe >= 0)
|
||||
sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
|
||||
}
|
||||
|
||||
X(ops_zero)(&pln->super.super.ops);
|
||||
if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
|
||||
if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
|
||||
if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
|
||||
if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
|
||||
/* FIXME: should MPI operations be counted in "other" somehow? */
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cld3);
|
||||
X(plan_destroy_internal)(cld2rest);
|
||||
X(plan_destroy_internal)(cld2);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int preserve_input)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->preserve_input = preserve_input;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(transpose_pairwise_register)(planner *p)
|
||||
{
|
||||
int preserve_input;
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input));
|
||||
}
|
||||
123
fftw-3.3.10/mpi/transpose-problem.c
Normal file
123
fftw-3.3.10/mpi/transpose-problem.c
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mpi-transpose.h"
|
||||
|
||||
static void destroy(problem *ego_)
|
||||
{
|
||||
problem_mpi_transpose *ego = (problem_mpi_transpose *) ego_;
|
||||
MPI_Comm_free(&ego->comm);
|
||||
X(ifree)(ego_);
|
||||
}
|
||||
|
||||
static void hash(const problem *p_, md5 *m)
|
||||
{
|
||||
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
|
||||
int i;
|
||||
X(md5puts)(m, "mpi-transpose");
|
||||
X(md5int)(m, p->I == p->O);
|
||||
/* don't include alignment -- may differ between processes
|
||||
X(md5int)(m, X(ialignment_of)(p->I));
|
||||
X(md5int)(m, X(ialignment_of)(p->O));
|
||||
... note that applicability of MPI plans does not depend
|
||||
on alignment (although optimality may, in principle). */
|
||||
X(md5INT)(m, p->vn);
|
||||
X(md5INT)(m, p->nx);
|
||||
X(md5INT)(m, p->ny);
|
||||
X(md5INT)(m, p->block);
|
||||
X(md5INT)(m, p->tblock);
|
||||
MPI_Comm_size(p->comm, &i); X(md5int)(m, i);
|
||||
A(XM(md5_equal)(*m, p->comm));
|
||||
}
|
||||
|
||||
static void print(const problem *ego_, printer *p)
|
||||
{
|
||||
const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_;
|
||||
int i;
|
||||
MPI_Comm_size(ego->comm, &i);
|
||||
p->print(p, "(mpi-transpose %d %d %d %D %D %D %D %D %d)",
|
||||
ego->I == ego->O,
|
||||
X(ialignment_of)(ego->I),
|
||||
X(ialignment_of)(ego->O),
|
||||
ego->vn,
|
||||
ego->nx, ego->ny,
|
||||
ego->block, ego->tblock,
|
||||
i);
|
||||
}
|
||||
|
||||
static void zero(const problem *ego_)
|
||||
{
|
||||
const problem_mpi_transpose *ego = (const problem_mpi_transpose *) ego_;
|
||||
R *I = ego->I;
|
||||
INT i, N = ego->vn * ego->ny;
|
||||
int my_pe;
|
||||
|
||||
MPI_Comm_rank(ego->comm, &my_pe);
|
||||
N *= XM(block)(ego->nx, ego->block, my_pe);
|
||||
|
||||
for (i = 0; i < N; ++i) I[i] = K(0.0);
|
||||
}
|
||||
|
||||
static const problem_adt padt =
|
||||
{
|
||||
PROBLEM_MPI_TRANSPOSE,
|
||||
hash,
|
||||
zero,
|
||||
print,
|
||||
destroy
|
||||
};
|
||||
|
||||
problem *XM(mkproblem_transpose)(INT nx, INT ny, INT vn,
|
||||
R *I, R *O,
|
||||
INT block, INT tblock,
|
||||
MPI_Comm comm,
|
||||
unsigned flags)
|
||||
{
|
||||
problem_mpi_transpose *ego =
|
||||
(problem_mpi_transpose *)X(mkproblem)(sizeof(problem_mpi_transpose), &padt);
|
||||
|
||||
A(nx > 0 && ny > 0 && vn > 0);
|
||||
A(block > 0 && XM(num_blocks_ok)(nx, block, comm)
|
||||
&& tblock > 0 && XM(num_blocks_ok)(ny, tblock, comm));
|
||||
|
||||
/* enforce pointer equality if untainted pointers are equal */
|
||||
if (UNTAINT(I) == UNTAINT(O))
|
||||
I = O = JOIN_TAINT(I, O);
|
||||
|
||||
ego->nx = nx;
|
||||
ego->ny = ny;
|
||||
ego->vn = vn;
|
||||
ego->I = I;
|
||||
ego->O = O;
|
||||
ego->block = block > nx ? nx : block;
|
||||
ego->tblock = tblock > ny ? ny : tblock;
|
||||
|
||||
/* canonicalize flags: we can freely assume that the data is
|
||||
"transposed" if one of the dimensions is 1. */
|
||||
if (ego->block == 1)
|
||||
flags |= TRANSPOSED_IN;
|
||||
if (ego->tblock == 1)
|
||||
flags |= TRANSPOSED_OUT;
|
||||
ego->flags = flags;
|
||||
|
||||
MPI_Comm_dup(comm, &ego->comm);
|
||||
|
||||
return &(ego->super);
|
||||
}
|
||||
300
fftw-3.3.10/mpi/transpose-recurse.c
Normal file
300
fftw-3.3.10/mpi/transpose-recurse.c
Normal file
@@ -0,0 +1,300 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* Recursive "radix-r" distributed transpose, which breaks a transpose
|
||||
over p processes into p/r transposes over r processes plus r
|
||||
transposes over p/r processes. If performed recursively, this
|
||||
produces a total of O(p log p) messages vs. O(p^2) messages for a
|
||||
direct approach.
|
||||
|
||||
However, this is not necessarily an improvement. The total size of
|
||||
all the messages is actually increased from O(N) to O(N log p)
|
||||
where N is the total data size. Also, the amount of local data
|
||||
rearrangement is increased. So, it's not clear, a priori, what the
|
||||
best algorithm will be, and we'll leave it to the planner. (In
|
||||
theory and practice, it looks like this becomes advantageous for
|
||||
large p, in the limit where the message sizes are small and
|
||||
latency-dominated.)
|
||||
*/
|
||||
|
||||
#include "mpi-transpose.h"
|
||||
#include <string.h>
|
||||
|
||||
typedef struct {
|
||||
solver super;
|
||||
int (*radix)(int np);
|
||||
const char *nam;
|
||||
int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
|
||||
} S;
|
||||
|
||||
typedef struct {
|
||||
plan_mpi_transpose super;
|
||||
|
||||
plan *cld1, *cldtr, *cldtm;
|
||||
int preserve_input;
|
||||
|
||||
int r; /* "radix" */
|
||||
const char *nam;
|
||||
} P;
|
||||
|
||||
static void apply(const plan *ego_, R *I, R *O)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
plan_rdft *cld1, *cldtr, *cldtm;
|
||||
|
||||
cld1 = (plan_rdft *) ego->cld1;
|
||||
if (cld1) cld1->apply((plan *) cld1, I, O);
|
||||
|
||||
if (ego->preserve_input) I = O;
|
||||
|
||||
cldtr = (plan_rdft *) ego->cldtr;
|
||||
if (cldtr) cldtr->apply((plan *) cldtr, O, I);
|
||||
|
||||
cldtm = (plan_rdft *) ego->cldtm;
|
||||
if (cldtm) cldtm->apply((plan *) cldtm, I, O);
|
||||
}
|
||||
|
||||
static int radix_sqrt(int np)
|
||||
{
|
||||
int r;
|
||||
for (r = (int) (X(isqrt)(np)); np % r != 0; ++r)
|
||||
;
|
||||
return r;
|
||||
}
|
||||
|
||||
static int radix_first(int np)
|
||||
{
|
||||
int r = (int) (X(first_divisor)(np));
|
||||
return (r >= (int) (X(isqrt)(np)) ? 0 : r);
|
||||
}
|
||||
|
||||
/* the local allocated space on process pe required for the given transpose
|
||||
dimensions and block sizes */
|
||||
static INT transpose_space(INT nx, INT ny, INT block, INT tblock, int pe)
|
||||
{
|
||||
return X(imax)(XM(block)(nx, block, pe) * ny,
|
||||
nx * XM(block)(ny, tblock, pe));
|
||||
}
|
||||
|
||||
/* check whether the recursive transposes fit within the space
|
||||
that must have been allocated on each process for this transpose;
|
||||
this must be modified if the subdivision in mkplan is changed! */
|
||||
static int enough_space(INT nx, INT ny, INT block, INT tblock,
|
||||
int r, int n_pes)
|
||||
{
|
||||
int pe;
|
||||
int m = n_pes / r;
|
||||
for (pe = 0; pe < n_pes; ++pe) {
|
||||
INT space = transpose_space(nx, ny, block, tblock, pe);
|
||||
INT b1 = XM(block)(nx, r * block, pe / r);
|
||||
INT b2 = XM(block)(ny, m * tblock, pe % r);
|
||||
if (transpose_space(b1, ny, block, m*tblock, pe % r) > space
|
||||
|| transpose_space(nx, b2, r*block, tblock, pe / r) > space)
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* In theory, transpose-recurse becomes advantageous for message sizes
|
||||
below some minimum, assuming that the time is dominated by
|
||||
communications. In practice, we want to constrain the minimum
|
||||
message size for transpose-recurse to keep the planning time down.
|
||||
I've set this conservatively according to some simple experiments
|
||||
on a Cray XT3 where the crossover message size was 128, although on
|
||||
a larger-latency machine the crossover will be larger. */
|
||||
#define SMALL_MESSAGE 2048
|
||||
|
||||
static int applicable(const S *ego, const problem *p_,
|
||||
const planner *plnr, int *r)
|
||||
{
|
||||
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
|
||||
int n_pes;
|
||||
MPI_Comm_size(p->comm, &n_pes);
|
||||
return (1
|
||||
&& p->tblock * n_pes == p->ny
|
||||
&& (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
|
||||
&& p->I != p->O))
|
||||
&& (*r = ego->radix(n_pes)) && *r < n_pes && *r > 1
|
||||
&& enough_space(p->nx, p->ny, p->block, p->tblock, *r, n_pes)
|
||||
&& (!CONSERVE_MEMORYP(plnr) || *r > 8
|
||||
|| !X(toobig)((p->nx * (p->ny / n_pes) * p->vn) / *r))
|
||||
&& (!NO_SLOWP(plnr) ||
|
||||
(p->nx * (p->ny / n_pes) * p->vn) / n_pes <= SMALL_MESSAGE)
|
||||
&& ONLY_TRANSPOSEDP(p->flags)
|
||||
);
|
||||
}
|
||||
|
||||
static void awake(plan *ego_, enum wakefulness wakefulness)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_awake)(ego->cld1, wakefulness);
|
||||
X(plan_awake)(ego->cldtr, wakefulness);
|
||||
X(plan_awake)(ego->cldtm, wakefulness);
|
||||
}
|
||||
|
||||
static void destroy(plan *ego_)
|
||||
{
|
||||
P *ego = (P *) ego_;
|
||||
X(plan_destroy_internal)(ego->cldtm);
|
||||
X(plan_destroy_internal)(ego->cldtr);
|
||||
X(plan_destroy_internal)(ego->cld1);
|
||||
}
|
||||
|
||||
static void print(const plan *ego_, printer *p)
|
||||
{
|
||||
const P *ego = (const P *) ego_;
|
||||
p->print(p, "(mpi-transpose-recurse/%s/%d%s%(%p%)%(%p%)%(%p%))",
|
||||
ego->nam, ego->r, ego->preserve_input==2 ?"/p":"",
|
||||
ego->cld1, ego->cldtr, ego->cldtm);
|
||||
}
|
||||
|
||||
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
||||
{
|
||||
const S *ego = (const S *) ego_;
|
||||
const problem_mpi_transpose *p;
|
||||
P *pln;
|
||||
plan *cld1 = 0, *cldtr = 0, *cldtm = 0;
|
||||
R *I, *O;
|
||||
int me, np, r, m;
|
||||
INT b;
|
||||
MPI_Comm comm2;
|
||||
static const plan_adt padt = {
|
||||
XM(transpose_solve), awake, print, destroy
|
||||
};
|
||||
|
||||
UNUSED(ego);
|
||||
|
||||
if (!applicable(ego, p_, plnr, &r))
|
||||
return (plan *) 0;
|
||||
|
||||
p = (const problem_mpi_transpose *) p_;
|
||||
|
||||
MPI_Comm_size(p->comm, &np);
|
||||
MPI_Comm_rank(p->comm, &me);
|
||||
m = np / r;
|
||||
A(r * m == np);
|
||||
|
||||
I = p->I; O = p->O;
|
||||
|
||||
b = XM(block)(p->nx, p->block, me);
|
||||
A(p->tblock * np == p->ny); /* this is currently required for cld1 */
|
||||
if (p->flags & TRANSPOSED_IN) {
|
||||
/* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */
|
||||
INT vn = p->vn * b * p->tblock;
|
||||
cld1 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_3d)
|
||||
(m, r*vn, vn,
|
||||
r, vn, m*vn,
|
||||
vn, 1, 1),
|
||||
I, O),
|
||||
0, 0, NO_SLOW);
|
||||
}
|
||||
else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */
|
||||
/* b x m x r x bt x vn -> r x m x bt x b x vn */
|
||||
INT vn = p->vn;
|
||||
INT bt = p->tblock;
|
||||
cld1 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_5d)
|
||||
(b, m*r*bt*vn, vn,
|
||||
m, r*bt*vn, bt*b*vn,
|
||||
r, bt*vn, m*bt*b*vn,
|
||||
bt, vn, b*vn,
|
||||
vn, 1, 1),
|
||||
I, O),
|
||||
0, 0, NO_SLOW);
|
||||
}
|
||||
else { /* TRANSPOSED_IN permutation must be separate for in-place */
|
||||
/* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */
|
||||
INT vn = p->vn * p->tblock;
|
||||
cld1 = X(mkplan_f_d)(plnr,
|
||||
X(mkproblem_rdft_0_d)(X(mktensor_4d)
|
||||
(m, r*vn, vn,
|
||||
r, vn, m*vn,
|
||||
vn, 1, 1,
|
||||
b, np*vn, np*vn),
|
||||
I, O),
|
||||
0, 0, NO_SLOW);
|
||||
}
|
||||
if (XM(any_true)(!cld1, p->comm)) goto nada;
|
||||
|
||||
if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
|
||||
|
||||
b = XM(block)(p->nx, r * p->block, me / r);
|
||||
MPI_Comm_split(p->comm, me / r, me, &comm2);
|
||||
if (b)
|
||||
cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose)
|
||||
(b, p->ny, p->vn,
|
||||
O, I, p->block, m * p->tblock, comm2,
|
||||
p->I != p->O
|
||||
? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN)));
|
||||
MPI_Comm_free(&comm2);
|
||||
if (XM(any_true)(b && !cldtr, p->comm)) goto nada;
|
||||
|
||||
b = XM(block)(p->ny, m * p->tblock, me % r);
|
||||
MPI_Comm_split(p->comm, me % r, me, &comm2);
|
||||
if (b)
|
||||
cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose)
|
||||
(p->nx, b, p->vn,
|
||||
I, O, r * p->block, p->tblock, comm2,
|
||||
TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT)));
|
||||
MPI_Comm_free(&comm2);
|
||||
if (XM(any_true)(b && !cldtm, p->comm)) goto nada;
|
||||
|
||||
pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
|
||||
|
||||
pln->cld1 = cld1;
|
||||
pln->cldtr = cldtr;
|
||||
pln->cldtm = cldtm;
|
||||
pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
|
||||
pln->r = r;
|
||||
pln->nam = ego->nam;
|
||||
|
||||
pln->super.super.ops = cld1->ops;
|
||||
if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops);
|
||||
if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops);
|
||||
|
||||
return &(pln->super.super);
|
||||
|
||||
nada:
|
||||
X(plan_destroy_internal)(cldtm);
|
||||
X(plan_destroy_internal)(cldtr);
|
||||
X(plan_destroy_internal)(cld1);
|
||||
return (plan *) 0;
|
||||
}
|
||||
|
||||
static solver *mksolver(int preserve_input,
|
||||
int (*radix)(int np), const char *nam)
|
||||
{
|
||||
static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
|
||||
S *slv = MKSOLVER(S, &sadt);
|
||||
slv->preserve_input = preserve_input;
|
||||
slv->radix = radix;
|
||||
slv->nam = nam;
|
||||
return &(slv->super);
|
||||
}
|
||||
|
||||
void XM(transpose_recurse_register)(planner *p)
|
||||
{
|
||||
int preserve_input;
|
||||
for (preserve_input = 0; preserve_input <= 1; ++preserve_input) {
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input, radix_sqrt, "sqrt"));
|
||||
REGISTER_SOLVER(p, mksolver(preserve_input, radix_first, "first"));
|
||||
}
|
||||
}
|
||||
29
fftw-3.3.10/mpi/transpose-solve.c
Normal file
29
fftw-3.3.10/mpi/transpose-solve.c
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "mpi-transpose.h"
|
||||
|
||||
/* use the apply() operation for MPI_TRANSPOSE problems */
|
||||
void XM(transpose_solve)(const plan *ego_, const problem *p_)
|
||||
{
|
||||
const plan_mpi_transpose *ego = (const plan_mpi_transpose *) ego_;
|
||||
const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
|
||||
ego->apply(ego_, UNTAINT(p->I), UNTAINT(p->O));
|
||||
}
|
||||
112
fftw-3.3.10/mpi/wisdom-api.c
Normal file
112
fftw-3.3.10/mpi/wisdom-api.c
Normal file
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "fftw3-mpi.h"
|
||||
#include "ifftw-mpi.h"
|
||||
#include <string.h>
|
||||
|
||||
#if SIZEOF_SIZE_T == SIZEOF_UNSIGNED_INT
|
||||
# define FFTW_MPI_SIZE_T MPI_UNSIGNED
|
||||
#elif SIZEOF_SIZE_T == SIZEOF_UNSIGNED_LONG
|
||||
# define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG
|
||||
#elif SIZEOF_SIZE_T == SIZEOF_UNSIGNED_LONG_LONG
|
||||
# define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG_LONG
|
||||
#else
|
||||
# error MPI type for size_t is unknown
|
||||
# define FFTW_MPI_SIZE_T MPI_UNSIGNED_LONG
|
||||
#endif
|
||||
|
||||
/* Import wisdom from all processes to process 0, as prelude to
|
||||
exporting a single wisdom file (this is convenient when we are
|
||||
running on identical processors, to avoid the annoyance of having
|
||||
per-process wisdom files). In order to make the time for this
|
||||
operation logarithmic in the number of processors (rather than
|
||||
linear), we employ a tree reduction algorithm. This means that the
|
||||
wisdom is modified on processes other than root, which shouldn't
|
||||
matter in practice. */
|
||||
void XM(gather_wisdom)(MPI_Comm comm_)
|
||||
{
|
||||
MPI_Comm comm, comm2;
|
||||
int my_pe, n_pes;
|
||||
char *wis;
|
||||
size_t wislen;
|
||||
MPI_Status status;
|
||||
|
||||
MPI_Comm_dup(comm_, &comm);
|
||||
MPI_Comm_rank(comm, &my_pe);
|
||||
MPI_Comm_size(comm, &n_pes);
|
||||
|
||||
if (n_pes > 2) { /* recursively split into even/odd processes */
|
||||
MPI_Comm_split(comm, my_pe % 2, my_pe, &comm2);
|
||||
XM(gather_wisdom)(comm2);
|
||||
MPI_Comm_free(&comm2);
|
||||
}
|
||||
if (n_pes > 1 && my_pe < 2) { /* import process 1 -> 0 */
|
||||
if (my_pe == 1) {
|
||||
wis = X(export_wisdom_to_string)();
|
||||
wislen = strlen(wis) + 1;
|
||||
MPI_Send(&wislen, 1, FFTW_MPI_SIZE_T, 0, 111, comm);
|
||||
MPI_Send(wis, wislen, MPI_CHAR, 0, 222, comm);
|
||||
free(wis);
|
||||
}
|
||||
else /* my_pe == 0 */ {
|
||||
MPI_Recv(&wislen, 1, FFTW_MPI_SIZE_T, 1, 111, comm, &status);
|
||||
wis = (char *) MALLOC(wislen * sizeof(char), OTHER);
|
||||
MPI_Recv(wis, wislen, MPI_CHAR, 1, 222, comm, &status);
|
||||
if (!X(import_wisdom_from_string)(wis))
|
||||
MPI_Abort(comm, 1);
|
||||
X(ifree)(wis);
|
||||
}
|
||||
}
|
||||
MPI_Comm_free(&comm);
|
||||
}
|
||||
|
||||
/* broadcast wisdom from process 0 to all other processes; this
|
||||
is useful so that we can import wisdom once and not worry
|
||||
about parallel I/O or process-specific wisdom, although of
|
||||
course it assumes that all the processes have identical
|
||||
performance characteristics (i.e. identical hardware). */
|
||||
void XM(broadcast_wisdom)(MPI_Comm comm_)
|
||||
{
|
||||
MPI_Comm comm;
|
||||
int my_pe;
|
||||
char *wis;
|
||||
size_t wislen;
|
||||
|
||||
MPI_Comm_dup(comm_, &comm);
|
||||
MPI_Comm_rank(comm, &my_pe);
|
||||
|
||||
if (my_pe != 0) {
|
||||
MPI_Bcast(&wislen, 1, FFTW_MPI_SIZE_T, 0, comm);
|
||||
wis = (char *) MALLOC(wislen * sizeof(char), OTHER);
|
||||
MPI_Bcast(wis, wislen, MPI_CHAR, 0, comm);
|
||||
if (!X(import_wisdom_from_string)(wis))
|
||||
MPI_Abort(comm, 1);
|
||||
X(ifree)(wis);
|
||||
}
|
||||
else /* my_pe == 0 */ {
|
||||
wis = X(export_wisdom_to_string)();
|
||||
wislen = strlen(wis) + 1;
|
||||
MPI_Bcast(&wislen, 1, FFTW_MPI_SIZE_T, 0, comm);
|
||||
MPI_Bcast(wis, wislen, MPI_CHAR, 0, comm);
|
||||
X(free)(wis);
|
||||
}
|
||||
MPI_Comm_free(&comm);
|
||||
}
|
||||
Reference in New Issue
Block a user