1
1
openmpi/ompi/runtime/ompi_mpi_preconnect.c
Brian Barrett a25ce44dc1 Clean up the preconnect code:
* Don't need the 2 process case -- we'll send an extra message, but
    at very little cost and less code is better.
  * Use COMPLETE sends instead of STANDARD sends so that the connection
    is fully established before we move on to the next connection.  The
    previous code was still causing minor connection flooding for huge
    numbers of processes.
  * mpi_preconnect_all now connects both OOB and MPI layers.  There's
    also mpi_preconnect_mpi and mpi_preconnect_oob should you want to
    be more specific.
  * Since we're only using the MCA parameters once at the beginning
    of time, no need for global constants.  Just do the quick param
    lookup right before the parameter is needed.  Save some of that
    global variable space for the next guy.

Fixes trac:963

This commit was SVN r14553.

The following Trac tickets were found above:
  Ticket 963 --> https://svn.open-mpi.org/trac/ompi/ticket/963
2007-05-01 04:49:36 +00:00

149 lines
5.3 KiB
C

/*
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <stdlib.h>
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/communicator/communicator.h"
#include "ompi/request/request.h"
#include "ompi/runtime/mpiruntime.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
/*
 * Force MPI-level connection establishment between every pair of
 * processes in MPI_COMM_WORLD, when enabled via the
 * mpi_preconnect_mpi (or mpi_preconnect_all) MCA parameter.
 *
 * Returns OMPI_SUCCESS when preconnect is disabled or completes
 * cleanly; otherwise the first error from the PML isend/irecv or
 * from waiting on the request pair.
 */
int
ompi_init_preconnect_mpi(void)
{
    const int size = ompi_comm_size(MPI_COMM_WORLD);
    const int rank = ompi_comm_rank(MPI_COMM_WORLD);
    int idx, enabled = 0, rc = OMPI_SUCCESS;
    int hop;
    struct ompi_request_t *reqs[2];
    char sendbuf[1], recvbuf[1];

    /* Is preconnect requested?  Check mpi_preconnect_mpi first and
       fall back to mpi_preconnect_all.  Any lookup failure is
       treated as "not requested". */
    idx = mca_base_param_find("mpi", NULL, "preconnect_mpi");
    if (OMPI_ERROR == idx) return OMPI_SUCCESS;
    if (OMPI_SUCCESS != mca_base_param_lookup_int(idx, &enabled)) {
        return OMPI_SUCCESS;
    }
    if (0 == enabled) {
        idx = mca_base_param_find("mpi", NULL, "preconnect_all");
        if (OMPI_ERROR == idx) return OMPI_SUCCESS;
        if (OMPI_SUCCESS != mca_base_param_lookup_int(idx, &enabled)) {
            return OMPI_SUCCESS;
        }
    }
    if (0 == enabled) return OMPI_SUCCESS;

    sendbuf[0] = '\0';
    recvbuf[0] = '\0';

    /* Ring-style wireup: on iteration `hop`, every process sends to
       the neighbor `hop` ranks to its right and receives from the
       neighbor `hop` ranks to its left.  Because SEND_COMPLETE
       semantics are used, each process has at most one outstanding
       send and one outstanding receive at any time.  This bounds the
       "flooding" that all-at-once connection schemes cause: those can
       overwhelm the out-of-band wireup system on some networks,
       leading to poor performance and hangs. */
    for (hop = 1 ; hop <= size / 2 ; ++hop) {
        const int to   = (rank + hop) % size;
        const int from = (rank - hop + size) % size;

        rc = MCA_PML_CALL(isend(sendbuf, 1, MPI_CHAR,
                                to, 1,
                                MCA_PML_BASE_SEND_COMPLETE,
                                MPI_COMM_WORLD,
                                &reqs[1]));
        if (OMPI_SUCCESS != rc) return rc;

        rc = MCA_PML_CALL(irecv(recvbuf, 1, MPI_CHAR,
                                from, 1,
                                MPI_COMM_WORLD,
                                &reqs[0]));
        if (OMPI_SUCCESS != rc) return rc;

        /* Block until both the send and receive complete before
           moving on to the next pair of neighbors. */
        rc = ompi_request_wait_all(2, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != rc) return rc;
    }

    return rc;
}
/*
 * Force out-of-band (RML) connection establishment between every
 * pair of processes, when enabled via the mpi_preconnect_oob (or
 * mpi_preconnect_all) MCA parameter.
 *
 * Returns OMPI_SUCCESS when preconnect is disabled or completes
 * cleanly; OMPI_ERR_NOT_FOUND if the proc_world list disagrees with
 * MPI_COMM_WORLD; otherwise the (negative) RML send/recv error code.
 */
int
ompi_init_preconnect_oob(void)
{
    size_t nprocs, me, hop;
    ompi_proc_t **procs;
    int idx, enabled = 0;
    struct iovec rxvec[1], txvec[1];

    /* Is preconnect requested?  Check mpi_preconnect_oob first and
       fall back to mpi_preconnect_all.  Any lookup failure is
       treated as "not requested". */
    idx = mca_base_param_find("mpi", NULL, "preconnect_oob");
    if (OMPI_ERROR == idx) return OMPI_SUCCESS;
    if (OMPI_SUCCESS != mca_base_param_lookup_int(idx, &enabled)) {
        return OMPI_SUCCESS;
    }
    if (0 == enabled) {
        idx = mca_base_param_find("mpi", NULL, "preconnect_all");
        if (OMPI_ERROR == idx) return OMPI_SUCCESS;
        if (OMPI_SUCCESS != mca_base_param_lookup_int(idx, &enabled)) {
            return OMPI_SUCCESS;
        }
    }
    if (0 == enabled) return OMPI_SUCCESS;

    procs = ompi_proc_world(&nprocs);

    /* Zero-length messages -- we only care about the connection
       side-effect, not the payload. */
    txvec[0].iov_base = NULL;
    txvec[0].iov_len  = 0;
    rxvec[0].iov_base = NULL;
    rxvec[0].iov_len  = 0;

    /* Sanity check: proc_world and ompi_comm_world should describe
       the same set of processes, in the same order. */
    if ((int) nprocs != ompi_comm_size(MPI_COMM_WORLD)) {
        return OMPI_ERR_NOT_FOUND;
    }
    if (ompi_proc_local() != procs[ompi_comm_rank(MPI_COMM_WORLD)]) {
        return OMPI_ERR_NOT_FOUND;
    }

    me = (size_t) ompi_comm_rank(MPI_COMM_WORLD);

    /* Ring-style wireup: on iteration `hop`, every process sends to
       the neighbor `hop` ranks to its right and receives from the
       neighbor `hop` ranks to its left.  This bounds the "flooding"
       that all-at-once connection schemes cause, which can overwhelm
       the out-of-band connection system and lead to poor performance
       and hangs. */
    for (hop = 1 ; hop <= nprocs / 2 ; ++hop) {
        const size_t to   = (me + hop) % nprocs;
        const size_t from = (me - hop + nprocs) % nprocs;
        int rc;

        /* RML sends do not wait for a matching receive. */
        rc = orte_rml.send(&procs[to]->proc_name,
                           txvec, 1, ORTE_RML_TAG_WIREUP, 0);
        if (rc < 0) return rc;

        rc = orte_rml.recv(&procs[from]->proc_name,
                           rxvec, 1, ORTE_RML_TAG_WIREUP, 0);
        if (rc < 0) return rc;
    }

    return OMPI_SUCCESS;
}