1
1

Allow the OOB to connect between all MPI applications during MPI_INIT

without also establishing MPI connectivity.

This commit was SVN r13593.
Этот коммит содержится в:
Brian Barrett 2007-02-09 20:11:40 +00:00
родитель 418f9ee5c8
Коммит 81472363ea
7 изменённых файлов: 104 добавлений и 9 удалений

Просмотреть файл

@ -110,6 +110,7 @@ extern "C" {
* be made if they will be made). * be made if they will be made).
*/ */
int ompi_init_do_preconnect(void); int ompi_init_do_preconnect(void);
int ompi_init_do_oob_preconnect(void);
#if defined(c_plusplus) || defined(__cplusplus) #if defined(c_plusplus) || defined(__cplusplus)
} }

Просмотреть файл

@ -661,6 +661,27 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
gettimeofday(&ompistart, NULL); gettimeofday(&ompistart, NULL);
} }
/* wire up the oob interface, if requested. Do this here because
it will go much faster before the event library is switched
into non-blocking mode */
if (ompi_mpi_preconnect_oob) {
if (OMPI_SUCCESS != (ret = ompi_init_do_oob_preconnect())) {
error = "ompi_mpi_do_preconnect_oob() failed";
goto error;
}
}
/* check for timing request - get stop time and report elapsed
time if so, then start the clock again */
if (timing) {
gettimeofday(&ompistop, NULL);
opal_output(0, "ompi_mpi_init[%ld]: time from stage 2 cast to complete oob wireup %ld usec",
(long)ORTE_PROC_MY_NAME->vpid,
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
(ompistop.tv_usec - ompistart.tv_usec)));
gettimeofday(&ompistart, NULL);
}
#if OMPI_ENABLE_PROGRESS_THREADS == 0 #if OMPI_ENABLE_PROGRESS_THREADS == 0
/* Start setting up the event engine for MPI operations. Don't /* Start setting up the event engine for MPI operations. Don't
block in the event library, so that communications don't take block in the event library, so that communications don't take
@ -733,14 +754,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
/* At this point, we are fully configured and in MPI mode. Any /* At this point, we are fully configured and in MPI mode. Any
communication calls here will work exactly like they would in communication calls here will work exactly like they would in
the user's code. Setup the connections between procs and warm the user's code. Setup the connections between procs and warm
them up with simple sends, if requested*/ them up with simple sends, if requested */
if (ompi_mpi_preconnect_all) { if (ompi_mpi_preconnect_all) {
if (OMPI_SUCCESS != (ret = ompi_init_do_preconnect())) { if (OMPI_SUCCESS != (ret = ompi_init_do_preconnect())) {
error = "ompi_mpi_do_preconnect_all() failed"; error = "ompi_mpi_do_preconnect_all() failed";
/* This will loop back up above, but ret != OMPI_SUCCESS,
so we'll end up returning out of this function before
getting here (and therefore avoiding an infinite
loop) */
goto error; goto error;
} }
} }
@ -769,7 +786,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
/* check for timing request - get stop time and report elapsed time if so */ /* check for timing request - get stop time and report elapsed time if so */
if (timing) { if (timing) {
gettimeofday(&ompistop, NULL); gettimeofday(&ompistop, NULL);
opal_output(0, "ompi_mpi_init[%ld]: time from stage2 xcast to complete mpi_init %ld usec", opal_output(0, "ompi_mpi_init[%ld]: time from oob wireup to complete mpi_init %ld usec",
(long)ORTE_PROC_MY_NAME->vpid, (long)ORTE_PROC_MY_NAME->vpid,
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
(ompistop.tv_usec - ompistart.tv_usec))); (ompistop.tv_usec - ompistart.tv_usec)));

Просмотреть файл

@ -50,6 +50,7 @@ bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0; int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_peer_hostnames = true; bool ompi_mpi_keep_peer_hostnames = true;
bool ompi_mpi_preconnect_all = false; bool ompi_mpi_preconnect_all = false;
bool ompi_mpi_preconnect_oob = false;
bool ompi_mpi_leave_pinned = false; bool ompi_mpi_leave_pinned = false;
bool ompi_mpi_leave_pinned_pipeline = false; bool ompi_mpi_leave_pinned_pipeline = false;
@ -190,6 +191,13 @@ int ompi_mpi_register_params(void)
ompi_mpi_preconnect_all = OPAL_INT_TO_BOOL(value); ompi_mpi_preconnect_all = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("mpi", "preconnect_oob",
"Whether to force MPI processes to fully wire-up the OOB system between MPI processes.",
false, false,
(int) ompi_mpi_preconnect_oob, &value);
ompi_mpi_preconnect_oob = OPAL_INT_TO_BOOL(value);
/* Leave pinned parameter */ /* Leave pinned parameter */
mca_base_param_reg_int_name("mpi", "leave_pinned", mca_base_param_reg_int_name("mpi", "leave_pinned",

Просмотреть файл

@ -20,6 +20,8 @@
#include "ompi/communicator/communicator.h" #include "ompi/communicator/communicator.h"
#include "ompi/request/request.h" #include "ompi/request/request.h"
#include "ompi/runtime/mpiruntime.h" #include "ompi/runtime/mpiruntime.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
/* /*
* do zero byte IRECV / ISEND: upper half sends to lower half (i.e. do * do zero byte IRECV / ISEND: upper half sends to lower half (i.e. do
@ -81,3 +83,64 @@ int ompi_init_do_preconnect(void)
return ret; return ret;
} }
int ompi_init_do_oob_preconnect(void)
{
size_t world_size, i, next, prev, my_index;
ompi_proc_t **procs;
int ret;
struct iovec msg[1];
procs = ompi_proc_world(&world_size);
msg[0].iov_base = NULL;
msg[0].iov_len = 0;
if (world_size == 2) {
if (ompi_proc_local() == procs[0]) {
ret = orte_rml.send(&procs[1]->proc_name,
msg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
} else {
ret = orte_rml.recv(&procs[0]->proc_name,
msg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
}
} else if (world_size > 2) {
for (i = 0 ; i < world_size ; ++i) {
if (ompi_proc_local() == procs[i]) {
my_index = i;
break;
}
}
for (i = 1 ; i <= world_size / 2 ; ++i) {
next = (my_index + i) % world_size;
prev = (my_index - i + world_size) % world_size;
/* sends do not wait for a match */
ret = orte_rml.send(&procs[next]->proc_name,
msg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
ret = orte_rml.recv(&procs[prev]->proc_name,
msg,
1,
ORTE_RML_TAG_WIREUP,
0);
if (ret < 0) return ret;
}
}
return OMPI_SUCCESS;
}

Просмотреть файл

@ -107,6 +107,11 @@ OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
*/ */
OMPI_DECLSPEC extern bool ompi_mpi_preconnect_all; OMPI_DECLSPEC extern bool ompi_mpi_preconnect_all;
/**
* should we wireup the oob completely during MPI_INIT?
*/
OMPI_DECLSPEC extern bool ompi_mpi_preconnect_oob;
/** /**
* Whether MPI_ABORT should print out an identifying message * Whether MPI_ABORT should print out an identifying message
* (e.g., hostname and PID) and loop waiting for a debugger to * (e.g., hostname and PID) and loop waiting for a debugger to

Просмотреть файл

@ -229,7 +229,7 @@ int mca_oob_tcp_component_open(void)
"connect() timeout in seconds, before trying next interface", "connect() timeout in seconds, before trying next interface",
false, false,
false, false,
600, 10,
&mca_oob_tcp_component.tcp_timeout); &mca_oob_tcp_component.tcp_timeout);
@ -412,7 +412,7 @@ static int mca_oob_tcp_create_listen(void)
mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port; mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port;
/* setup listen backlog to maximum allowed by kernel */ /* setup listen backlog to maximum allowed by kernel */
if(listen(mca_oob_tcp_component.tcp_listen_sd, SOMAXCONN) < 0) { if(listen(mca_oob_tcp_component.tcp_listen_sd, 5) < 0) {
opal_output(0, "mca_oob_tcp_component_init: listen(): %s (%d)", opal_output(0, "mca_oob_tcp_component_init: listen(): %s (%d)",
strerror(opal_socket_errno), opal_socket_errno); strerror(opal_socket_errno), opal_socket_errno);
return ORTE_ERROR; return ORTE_ERROR;
@ -623,7 +623,7 @@ static int mca_oob_tcp_create_listen_thread(void)
mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port; mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port;
/* setup listen backlog to maximum allowed by kernel */ /* setup listen backlog to maximum allowed by kernel */
if(listen(mca_oob_tcp_component.tcp_listen_sd, SOMAXCONN) < 0) { if(listen(mca_oob_tcp_component.tcp_listen_sd, 5) < 0) {
opal_output(0, "mca_oob_tcp_component_init: listen() failed: %s (%d)", opal_output(0, "mca_oob_tcp_component_init: listen() failed: %s (%d)",
strerror(opal_socket_errno), opal_socket_errno); strerror(opal_socket_errno), opal_socket_errno);
return ORTE_ERROR; return ORTE_ERROR;

Просмотреть файл

@ -54,6 +54,7 @@ typedef uint32_t orte_rml_tag_t;
#define ORTE_RML_TAG_BPROC 17 #define ORTE_RML_TAG_BPROC 17
#define ORTE_RML_TAG_BPROC_ABORT 18 #define ORTE_RML_TAG_BPROC_ABORT 18
#define ORTE_RML_TAG_SM_BACK_FILE_CREATED 19 #define ORTE_RML_TAG_SM_BACK_FILE_CREATED 19
#define ORTE_RML_TAG_WIREUP 20
#define ORTE_RML_TAG_DYNAMIC 2000 #define ORTE_RML_TAG_DYNAMIC 2000