openmpi/ompi/runtime/ompi_mpi_preconnect.c
Jeff Squyres 3bc940ac27 Fix three things from r15474 (thanks to Brian for noticing):
* bml.h had a change that introduced a variable named "_order" to
   avoid a conflict with a local variable.  The namespace starting
   with _ belongs to the os/compiler/kernel/not us.  So we can't start
   symbols with _.  So I replaced it with arg_order, and also updated
   the threaded equivalent of the macro that was modified.
 * in btl_openib_proc.c, one opal_output accidentally had its string
   reverted from "ompi_modex_recv..." to
   "mca_pml_base_modex_recv....".  This was fixed.
 * The change to ompi/runtime/ompi_preconnect.c was entirely
   reverted; it was an artifact of debugging.

This commit was SVN r15475.

The following SVN revision numbers were found above:
  r15474 --> open-mpi/ompi@8ace07efed
2007-07-18 11:38:06 +00:00
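
For context on the first item: C reserves identifiers beginning with an underscore for the implementation (all identifiers starting with `_` plus an uppercase letter or a second underscore, and all `_`-prefixed identifiers at file scope), so a macro parameter named `_order` can silently collide with compiler or libc internals. Below is a minimal sketch of the pattern behind the fix; the macro name and body are hypothetical, not the actual bml.h code:

    /* Hypothetical macro, not the real bml.h one.  Naming the
       parameter arg_order stays out of the reserved _-prefixed
       namespace while still avoiding capture of a caller's local
       variable that happens to be named "order". */
    #define SET_DES_ORDER(des, arg_order)        \
        do {                                     \
            (des)->order = (arg_order);          \
        } while (0)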

/*
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "ompi_config.h"
#include <stdlib.h>
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/communicator/communicator.h"
#include "ompi/request/request.h"
#include "ompi/runtime/mpiruntime.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
int
ompi_init_preconnect_mpi(void)
{
    int comm_size = ompi_comm_size(MPI_COMM_WORLD);
    int comm_rank = ompi_comm_rank(MPI_COMM_WORLD);
    int param, value, next, prev, i, ret = OMPI_SUCCESS;
    struct ompi_request_t * requests[2];
    char inbuf[1], outbuf[1];

    /* Only preconnect if mpi_preconnect_mpi or the catch-all
       mpi_preconnect_all is set to a non-zero value. */
    param = mca_base_param_find("mpi", NULL, "preconnect_mpi");
    if (OMPI_ERROR == param) return OMPI_SUCCESS;
    ret = mca_base_param_lookup_int(param, &value);
    if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
    if (0 == value) {
        param = mca_base_param_find("mpi", NULL, "preconnect_all");
        if (OMPI_ERROR == param) return OMPI_SUCCESS;
        ret = mca_base_param_lookup_int(param, &value);
        if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
    }
    if (0 == value) return OMPI_SUCCESS;

    inbuf[0] = outbuf[0] = '\0';
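
    /* The one-byte payloads carry no meaning; a single send/recv
       pair with each peer is enough to force the underlying
       point-to-point connection to be established. */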
    /* Each iteration, every process sends to its neighbor i hops to
       the right and receives from its neighbor i hops to the left.
       Because send_complete is used, there will only ever be one
       outstanding send and one outstanding receive in the network at
       a time for any given process.  This limits any "flooding"
       effect that can occur with other connection algorithms.  While
       the flooding algorithms may be a more efficient use of
       resources, they can overwhelm the out-of-band connection system
       used to wire up some networks, leading to poor performance and
       hangs. */
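    /* Worked example (hypothetical sizes, for illustration): with
       comm_size == 6, rank 0 runs i = 1..3, sending to ranks 1, 2, 3
       and receiving from ranks 5, 4, 3.  Every pair of processes
       exchanges a message in at least one direction, so all
       connections exist by the time the loop finishes. */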
    for (i = 1 ; i <= comm_size / 2 ; ++i) {
        next = (comm_rank + i) % comm_size;
        prev = (comm_rank - i + comm_size) % comm_size;

        ret = MCA_PML_CALL(isend(outbuf, 1, MPI_CHAR,
                                 next, 1,
                                 MCA_PML_BASE_SEND_COMPLETE,
                                 MPI_COMM_WORLD,
                                 &requests[1]));
        if (OMPI_SUCCESS != ret) return ret;

        ret = MCA_PML_CALL(irecv(inbuf, 1, MPI_CHAR,
                                 prev, 1,
                                 MPI_COMM_WORLD,
                                 &requests[0]));
        if (OMPI_SUCCESS != ret) return ret;

        ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != ret) return ret;
    }

    return ret;
}

int
ompi_init_preconnect_oob(void)
{
    size_t world_size, next, prev, i, j, world_rank;
    ompi_proc_t **procs;
    int ret, simultaneous, param, value = 0;
    struct iovec inmsg[1], outmsg[1];

    /* Only preconnect if mpi_preconnect_oob or the catch-all
       mpi_preconnect_all is set to a non-zero value. */
    param = mca_base_param_find("mpi", NULL, "preconnect_oob");
    if (OMPI_ERROR == param) return OMPI_SUCCESS;
    ret = mca_base_param_lookup_int(param, &value);
    if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
    if (0 == value) {
        param = mca_base_param_find("mpi", NULL, "preconnect_all");
        if (OMPI_ERROR == param) return OMPI_SUCCESS;
        ret = mca_base_param_lookup_int(param, &value);
        if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
    }
    if (0 == value) return OMPI_SUCCESS;

    param = mca_base_param_find("mpi", NULL, "preconnect_oob_simultaneous");
    if (OMPI_ERROR == param) return OMPI_SUCCESS;
    ret = mca_base_param_lookup_int(param, &value);
    if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
    simultaneous = (value < 1) ? 1 : value;

    procs = ompi_proc_world(&world_size);

    inmsg[0].iov_base = outmsg[0].iov_base = NULL;
    inmsg[0].iov_len = outmsg[0].iov_len = 0;
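
    /* The messages are deliberately zero-length: the payload is
       irrelevant, the exchange merely forces the out-of-band (RML)
       connection to each peer to be wired up ahead of time. */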
    /* proc_world and ompi_comm_world should have the same proc list... */
    if ((int) world_size != ompi_comm_size(MPI_COMM_WORLD)) {
        return OMPI_ERR_NOT_FOUND;
    } else if (ompi_proc_local() !=
               procs[ompi_comm_rank(MPI_COMM_WORLD)]) {
        return OMPI_ERR_NOT_FOUND;
    }

    world_rank = (size_t) ompi_comm_rank(MPI_COMM_WORLD);
    /* Each iteration, every process sends to its neighbor i hops to
       the right and receives from its neighbor i hops to the left.
       This limits any "flooding" effect that can occur with other
       connection algorithms, which can overwhelm the out-of-band
       connection system, leading to poor performance and hangs. */
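    /* Worked example (hypothetical sizes, for illustration): with
       world_size == 8 and simultaneous == 2, the outer loop runs for
       i = 1 and i = 3.  Each pass posts `simultaneous` sends and then
       the matching receives, so offsets 1 through 4 are all covered
       while at most two connection attempts per process are in
       flight at any time. */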
    for (i = 1 ; i <= world_size / 2 ; i += simultaneous) {
        for (j = 0 ; j < (size_t) simultaneous ; ++j) {
            next = (world_rank + (i + j)) % world_size;

            /* sends do not wait for a match */
            ret = orte_rml.send(&procs[next]->proc_name,
                                outmsg,
                                1,
                                ORTE_RML_TAG_WIREUP,
                                0);
            if (ret < 0) return ret;
        }

        for (j = 0 ; j < (size_t) simultaneous ; ++j) {
            prev = (world_rank - (i + j) + world_size) % world_size;

            ret = orte_rml.recv(&procs[prev]->proc_name,
                                inmsg,
                                1,
                                ORTE_RML_TAG_WIREUP,
                                0);
            if (ret < 0) return ret;
        }
    }

    return OMPI_SUCCESS;
}
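
Both functions are gated on MCA parameters and are therefore opt-in. Given the `("mpi", NULL, "preconnect_*")` registrations above, the full parameter names are `mpi_preconnect_mpi`, `mpi_preconnect_oob`, `mpi_preconnect_oob_simultaneous`, and the catch-all `mpi_preconnect_all`. For example, a run such as

    mpirun --mca mpi_preconnect_mpi 1 ./app

should establish all MPI-level connections during MPI_Init rather than lazily on first use; the exact parameter spellings available in a given Open MPI release can be checked with ompi_info.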