  * Don't need the 2 process case -- we'll send an extra message, but
    at very little cost, and less code is better.
  * Use COMPLETE sends instead of STANDARD sends so that each connection
    is fully established before we move on to the next one.  The
    previous code was still causing minor connection flooding for huge
    numbers of processes.  (A standalone sketch of the ring warm-up
    pattern follows this list.)
  * mpi_preconnect_all now connects both the OOB and MPI layers.  There
    are also mpi_preconnect_mpi and mpi_preconnect_oob, should you want
    to be more specific.
  * Since we're only using the MCA parameters once at the beginning of
    time, there's no need for global constants.  Just do the quick param
    lookup right before the parameter is needed, and save some of that
    global variable space for the next guy.  (See the lookup-at-use
    sketch after this list.)
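
  For illustration only, here is a minimal standalone MPI sketch of the
  ring-style warm-up described above: at step i, every rank sends one
  byte to the neighbor i hops to the right and receives from the
  neighbor i hops to the left, so at most one send and one receive are
  outstanding per process at any time.  It uses plain MPI calls
  (MPI_Issend only roughly approximates the "complete send" behavior);
  the in-tree code drives the PML directly with
  MCA_PML_BASE_SEND_COMPLETE, as the diff below shows.

    /* preconnect_ring.c -- illustrative sketch, not the in-tree code. */
    #include <mpi.h>
    #include <stdio.h>

    static void preconnect_ring(MPI_Comm comm)
    {
        int rank, size, i;
        char inbuf = '\0', outbuf = '\0';
        MPI_Request reqs[2];

        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        /* Each step touches one new peer in each direction; after
           size / 2 steps every pair of processes has exchanged data. */
        for (i = 1; i <= size / 2; ++i) {
            int next = (rank + i) % size;
            int prev = (rank - i + size) % size;

            MPI_Issend(&outbuf, 1, MPI_CHAR, next, 1, comm, &reqs[0]);
            MPI_Irecv(&inbuf, 1, MPI_CHAR, prev, 1, comm, &reqs[1]);
            MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
        }
    }

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        preconnect_ring(MPI_COMM_WORLD);
        printf("warm-up complete\n");
        MPI_Finalize();
        return 0;
    }

  Build with mpicc and run under mpirun to watch the pattern; the real
  warm-up is triggered instead by setting the mpi_preconnect_all,
  mpi_preconnect_mpi, or mpi_preconnect_oob MCA parameters at MPI_INIT
  time.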
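
  And a condensed sketch of the lookup-at-use pattern (paraphrased from
  ompi_init_preconnect_mpi() in the diff below): look the parameter up
  right where it is consumed, fall back to mpi_preconnect_all if the
  specific parameter is unset, and keep no global bool around.  The
  opal/mca/base/mca_base_param.h include path is assumed here.

    /* Sketch only -- condensed from the preconnect code in the diff. */
    #include "opal/mca/base/mca_base_param.h"  /* assumed header path */
    #include "ompi/constants.h"

    static int want_mpi_preconnect(void)
    {
        int param, value = 0;

        /* Prefer the specific parameter... */
        param = mca_base_param_find("mpi", NULL, "preconnect_mpi");
        if (OMPI_ERROR == param) return 0;
        if (OMPI_SUCCESS != mca_base_param_lookup_int(param, &value)) return 0;

        /* ...and fall back to the umbrella parameter if it is unset. */
        if (0 == value) {
            param = mca_base_param_find("mpi", NULL, "preconnect_all");
            if (OMPI_ERROR == param) return 0;
            if (OMPI_SUCCESS != mca_base_param_lookup_int(param, &value)) return 0;
        }
        return (0 != value);
    }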

Fixes trac:963

This commit was SVN r14553.

The following Trac tickets were found above:
  Ticket 963 --> https://svn.open-mpi.org/trac/ompi/ticket/963
This commit is contained in:
Brian Barrett 2007-05-01 04:49:36 +00:00
parent e63346a633
commit a25ce44dc1
5 changed files with 176 additions and 190 deletions

View file

@@ -105,8 +105,8 @@ extern "C" {
* Do a preconnect of MPI connections (i.e., force connections to
* be made if they will be made).
*/
int ompi_init_do_preconnect(void);
int ompi_init_do_oob_preconnect(void);
int ompi_init_preconnect_oob(void);
int ompi_init_preconnect_mpi(void);
#if defined(c_plusplus) || defined(__cplusplus)
}

View file

@@ -681,11 +681,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
/* wire up the oob interface, if requested. Do this here because
it will go much faster before the event library is switched
into non-blocking mode */
if (ompi_mpi_preconnect_oob) {
if (OMPI_SUCCESS != (ret = ompi_init_do_oob_preconnect())) {
error = "ompi_mpi_do_preconnect_oob() failed";
goto error;
}
if (OMPI_SUCCESS != (ret = ompi_init_preconnect_oob())) {
error = "ompi_mpi_do_preconnect_oob() failed";
goto error;
}
/* check for timing request - get stop time and report elapsed
@@ -708,6 +706,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
latency. */
opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif
/* wire up the mpi interface, if requested. Do this after the
non-block switch for non-TCP performance. Do before the
polling change as anyone with a complex wire-up is going to be
using the oob. */
if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
error = "ompi_mpi_do_preconnect_all() failed";
goto error;
}
/* Check whether we have been spawned or not. We introduce that
at the very end, since we need collectives, datatypes, ptls
@@ -777,17 +784,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
if (value >= 0) {
opal_progress_set_event_poll_rate(value);
}
/* At this point, we are fully configured and in MPI mode. Any
communication calls here will work exactly like they would in
the user's code. Setup the connections between procs and warm
them up with simple sends, if requested */
if (ompi_mpi_preconnect_all) {
if (OMPI_SUCCESS != (ret = ompi_init_do_preconnect())) {
error = "ompi_mpi_do_preconnect_all() failed";
goto error;
}
}
error:
if (ret != OMPI_SUCCESS) {

View file

@@ -51,8 +51,6 @@ bool ompi_mpi_paffinity_alone = false;
bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_peer_hostnames = true;
bool ompi_mpi_preconnect_all = false;
bool ompi_mpi_preconnect_oob = false;
bool ompi_mpi_leave_pinned = false;
bool ompi_mpi_leave_pinned_pipeline = false;
@@ -187,18 +185,23 @@ int ompi_mpi_register_params(void)
#endif
mca_base_param_reg_int_name("mpi", "preconnect_all",
"Whether to force MPI processes to create connections / warmup with *all* peers during MPI_INIT (vs. making connections lazily -- upon the first MPI traffic between each process peer pair)",
false, false,
(int) ompi_mpi_preconnect_all, &value);
ompi_mpi_preconnect_all = OPAL_INT_TO_BOOL(value);
"Whether to force MPI processes to create OOB "
"and MPI connections with *all* peers during "
"MPI_INIT (vs. making connections lazily -- "
"upon the first MPI traffic between each "
"process peer pair)",
false, false, 0, NULL);
mca_base_param_reg_int_name("mpi", "preconnect_mpi",
"Whether to force MPI processes to fully "
"wire-up the MPI connections between MPI "
"processes.",
false, false, 0, NULL);
mca_base_param_reg_int_name("mpi", "preconnect_oob",
"Whether to force MPI processes to fully wire-up the OOB system between MPI processes.",
false, false,
(int) ompi_mpi_preconnect_oob, &value);
ompi_mpi_preconnect_oob = OPAL_INT_TO_BOOL(value);
"Whether to force MPI processes to fully "
"wire-up the OOB system between MPI processes.",
false, false, 0, NULL);
/* Leave pinned parameter */

View file

@@ -18,6 +18,7 @@
#include <stdlib.h>
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/communicator/communicator.h"
#include "ompi/request/request.h"
@@ -25,126 +26,121 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
/*
* do zero byte IRECV / ISEND: upper half sends to lower half (i.e. do
* a ping, not a ping pong)
*/
int ompi_init_do_preconnect(void)
int
ompi_init_preconnect_mpi(void)
{
int comm_size = ompi_comm_size(MPI_COMM_WORLD);
int my_rank = ompi_comm_rank(MPI_COMM_WORLD);
int i, ret;
int next,prev;
int comm_rank = ompi_comm_rank(MPI_COMM_WORLD);
int param, value, next, prev, i, ret = OMPI_SUCCESS;
struct ompi_request_t * requests[2];
ret = OMPI_SUCCESS;
if(comm_size == 2) {
if(my_rank){
ret = MCA_PML_CALL(send(MPI_BOTTOM, 0, MPI_BYTE,
0, 1,
MCA_PML_BASE_SEND_STANDARD,
MPI_COMM_WORLD));
if(OMPI_SUCCESS != ret) {
return ret;
}
} else {
ret = MCA_PML_CALL(recv(MPI_BOTTOM,0, MPI_BYTE, 1,
1, MPI_COMM_WORLD,
MPI_STATUS_IGNORE));
if(OMPI_SUCCESS != ret) {
return ret;
}
}
} else {
for (i = 1; i <= comm_size/2; ++i) {
next = (my_rank + i) % comm_size;
prev = (my_rank - i + comm_size) % comm_size;
ret = MCA_PML_CALL(irecv(MPI_BOTTOM,0, MPI_BYTE,
prev, 1,
MPI_COMM_WORLD,
&requests[0]));
if(OMPI_SUCCESS != ret) {
return ret;
}
ret = MCA_PML_CALL(isend(MPI_BOTTOM, 0, MPI_BYTE,
next, 1,
MCA_PML_BASE_SEND_STANDARD,
MPI_COMM_WORLD,
&requests[1]));
if (OMPI_SUCCESS != ret) {
return ret;
}
ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE);
if (OMPI_SUCCESS != ret) {
return ret;
}
}
char inbuf[1], outbuf[1];
param = mca_base_param_find("mpi", NULL, "preconnect_mpi");
if (OMPI_ERROR == param) return OMPI_SUCCESS;
ret = mca_base_param_lookup_int(param, &value);
if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
if (0 == value) {
param = mca_base_param_find("mpi", NULL, "preconnect_all");
if (OMPI_ERROR == param) return OMPI_SUCCESS;
ret = mca_base_param_lookup_int(param, &value);
if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
}
if (0 == value) return OMPI_SUCCESS;
inbuf[0] = outbuf[0] = '\0';
/* Each iteration, every process sends to its neighbor i hops to
the right and receives from its neighbor i hops to the left.
Because send_complete is used, there will only ever be one
outstanding send and one outstanding receive in the network at
a time for any given process. This limits any "flooding"
effect that can occur with other connection algorithms. While
the flooding algorithms may be a more efficient use of
resources, they can overwhelm the out-of-band connection system
used to wire up some networks, leading to poor performance and
hangs. */
for (i = 1 ; i <= comm_size / 2 ; ++i) {
next = (comm_rank + i) % comm_size;
prev = (comm_rank - i + comm_size) % comm_size;
ret = MCA_PML_CALL(isend(outbuf, 1, MPI_CHAR,
next, 1,
MCA_PML_BASE_SEND_COMPLETE,
MPI_COMM_WORLD,
&requests[1]));
if (OMPI_SUCCESS != ret) return ret;
ret = MCA_PML_CALL(irecv(inbuf, 1, MPI_CHAR,
prev, 1,
MPI_COMM_WORLD,
&requests[0]));
if(OMPI_SUCCESS != ret) return ret;
ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE);
if (OMPI_SUCCESS != ret) return ret;
}
return ret;
}
int ompi_init_do_oob_preconnect(void)
int
ompi_init_preconnect_oob(void)
{
size_t world_size, next, prev, i, my_index;
size_t world_size, next, prev, i, world_rank;
ompi_proc_t **procs;
int ret;
struct iovec msg[1];
int ret, param, value = 0;
struct iovec inmsg[1], outmsg[1];
param = mca_base_param_find("mpi", NULL, "preconnect_oob");
if (OMPI_ERROR == param) return OMPI_SUCCESS;
ret = mca_base_param_lookup_int(param, &value);
if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
if (0 == value) {
param = mca_base_param_find("mpi", NULL, "preconnect_all");
if (OMPI_ERROR == param) return OMPI_SUCCESS;
ret = mca_base_param_lookup_int(param, &value);
if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
}
if (0 == value) return OMPI_SUCCESS;
procs = ompi_proc_world(&world_size);
msg[0].iov_base = NULL;
msg[0].iov_len = 0;
inmsg[0].iov_base = outmsg[0].iov_base = NULL;
inmsg[0].iov_len = outmsg[0].iov_len = 0;
if (world_size == 2) {
if (ompi_proc_local() == procs[0]) {
ret = orte_rml.send(&procs[1]->proc_name,
msg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
} else {
ret = orte_rml.recv(&procs[0]->proc_name,
msg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
}
} else if (world_size > 2) {
/* find local index in proc list (which *should* be our rank
in MCW, aborting if not found */
for (i = 0 ; i < world_size ; ++i) {
if (ompi_proc_local() == procs[i]) {
my_index = i;
goto preconnect_stage;
}
}
/* I'm not on the list ? well then just complain */
/* proc_world and ompi_comm_world should have the same proc list... */
if ((int) world_size != ompi_comm_size(MPI_COMM_WORLD)) {
return OMPI_ERR_NOT_FOUND;
preconnect_stage:
for (i = 1 ; i <= world_size / 2 ; ++i) {
next = (my_index + i) % world_size;
prev = (my_index - i + world_size) % world_size;
} else if (ompi_proc_local() !=
procs[ompi_comm_rank(MPI_COMM_WORLD)]) {
return OMPI_ERR_NOT_FOUND;
}
world_rank = (size_t) ompi_comm_rank(MPI_COMM_WORLD);
/* Each iteration, every process sends to its neighbor i hops to
the right and receives from its neighbor i hops to the left.
This limits any "flooding" effect that can occur with other
connection algorithms, which can overwhelm the out-of-band
connection system, leading to poor performance and hangs. */
for (i = 1 ; i <= world_size / 2 ; ++i) {
next = (world_rank + i) % world_size;
prev = (world_rank - i + world_size) % world_size;
/* sends do not wait for a match */
ret = orte_rml.send(&procs[next]->proc_name,
msg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
/* sends do not wait for a match */
ret = orte_rml.send(&procs[next]->proc_name,
outmsg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
ret = orte_rml.recv(&procs[prev]->proc_name,
msg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
}
ret = orte_rml.recv(&procs[prev]->proc_name,
inmsg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
}
return OMPI_SUCCESS;

View file

@@ -21,9 +21,8 @@
#ifndef OMPI_RUNTIME_PARAMS_H
#define OMPI_RUNTIME_PARAMS_H
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
BEGIN_C_DECLS
/*
* Global variables
@@ -91,71 +90,58 @@ OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;
*/
OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
/**
* Whether we should keep the string hostnames of all the MPI
* process peers around or not (eats up a good bit of memory).
*/
OMPI_DECLSPEC extern bool ompi_mpi_keep_peer_hostnames;
/**
* Whether we should keep the string hostnames of all the MPI
* process peers around or not (eats up a good bit of memory).
*/
OMPI_DECLSPEC extern bool ompi_mpi_keep_peer_hostnames;
/**
* Whether an MPI_ABORT should print out a stack trace or not.
*/
OMPI_DECLSPEC extern bool ompi_mpi_abort_print_stack;
/**
* Whether an MPI_ABORT should print out a stack trace or not.
*/
OMPI_DECLSPEC extern bool ompi_mpi_abort_print_stack;
/**
* Whether we should force all connections to be created during
* MPI_INIT (vs. potentially making all the connection lazily upon
* first communication with an MPI peer process).
*/
OMPI_DECLSPEC extern bool ompi_mpi_preconnect_all;
/**
* Whether MPI_ABORT should print out an identifying message
* (e.g., hostname and PID) and loop waiting for a debugger to
* attach. The value of the integer is how many seconds to wait:
*
* 0 = do not print the message and do not loop
* negative value = print the message and loop forever
* positive value = print the message and delay for that many seconds
*/
OMPI_DECLSPEC extern int ompi_mpi_abort_delay;
/**
* should we wireup the oob completely during MPI_INIT?
*/
OMPI_DECLSPEC extern bool ompi_mpi_preconnect_oob;
/**
* Whether to use the "leave pinned" protocol or not.
*/
OMPI_DECLSPEC extern bool ompi_mpi_leave_pinned;
/**
* Whether MPI_ABORT should print out an identifying message
* (e.g., hostname and PID) and loop waiting for a debugger to
* attach. The value of the integer is how many seconds to wait:
*
* 0 = do not print the message and do not loop
* negative value = print the message and loop forever
* positive value = print the message and delay for that many seconds
*/
OMPI_DECLSPEC extern int ompi_mpi_abort_delay;
/**
* Whether to use the "leave pinned pipeline" protocol or not.
*/
OMPI_DECLSPEC extern bool ompi_mpi_leave_pinned_pipeline;
/**
* Whether to use the "leave pinned" protocol or not.
*/
OMPI_DECLSPEC extern bool ompi_mpi_leave_pinned;
/**
* Whether to use the "leave pinned pipeline" protocol or not.
*/
OMPI_DECLSPEC extern bool ompi_mpi_leave_pinned_pipeline;
/**
* Register MCA parameters used by the MPI layer.
*
* @returns OMPI_SUCCESS
*
* Registers several MCA parameters and initializes corresponding
* global variables to the values obtained from the MCA system.
*/
/**
* Register MCA parameters used by the MPI layer.
*
* @returns OMPI_SUCCESS
*
* Registers several MCA parameters and initializes corresponding
* global variables to the values obtained from the MCA system.
*/
OMPI_DECLSPEC int ompi_mpi_register_params(void);
/**
* Display all MCA parameters used
*
* @returns OMPI_SUCCESS
*
* Displays in key = value format
*/
int ompi_show_all_mca_params(int32_t, int, char *);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
/**
* Display all MCA parameters used
*
* @returns OMPI_SUCCESS
*
* Displays in key = value format
*/
int ompi_show_all_mca_params(int32_t, int, char *);
END_C_DECLS
#endif /* OMPI_RUNTIME_PARAMS_H */