diff --git a/ompi/runtime/mpiruntime.h b/ompi/runtime/mpiruntime.h index 5d3b242a13..d33b0e46b4 100644 --- a/ompi/runtime/mpiruntime.h +++ b/ompi/runtime/mpiruntime.h @@ -110,6 +110,7 @@ extern "C" { * be made if they will be made). */ int ompi_init_do_preconnect(void); + int ompi_init_do_oob_preconnect(void); #if defined(c_plusplus) || defined(__cplusplus) } diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index f15327726b..80fa661cb3 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -661,6 +661,27 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) gettimeofday(&ompistart, NULL); } + /* wire up the oob interface, if requested. Do this here because + it will go much faster before the event library is switched + into non-blocking mode */ + if (ompi_mpi_preconnect_oob) { + if (OMPI_SUCCESS != (ret = ompi_init_do_oob_preconnect())) { + error = "ompi_mpi_do_preconnect_oob() failed"; + goto error; + } + } + + /* check for timing request - get stop time and report elapsed + time if so, then start the clock again */ + if (timing) { + gettimeofday(&ompistop, NULL); + opal_output(0, "ompi_mpi_init[%ld]: time from stage 2 cast to complete oob wireup %ld usec", + (long)ORTE_PROC_MY_NAME->vpid, + (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + + (ompistop.tv_usec - ompistart.tv_usec))); + gettimeofday(&ompistart, NULL); + } + #if OMPI_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. Don't block in the event library, so that communications don't take @@ -733,14 +754,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* At this point, we are fully configured and in MPI mode. Any communication calls here will work exactly like they would in the user's code. Setup the connections between procs and warm - them up with simple sends, if requested*/ + them up with simple sends, if requested */ if (ompi_mpi_preconnect_all) { if (OMPI_SUCCESS != (ret = ompi_init_do_preconnect())) { error = "ompi_mpi_do_preconnect_all() failed"; - /* This will loop back up above, but ret != OMPI_SUCCESS, - so we'll end up returning out of this function before - getting here (and therefore avoiding an infinite - loop) */ goto error; } } @@ -769,7 +786,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* check for timing request - get stop time and report elapsed time if so */ if (timing) { gettimeofday(&ompistop, NULL); - opal_output(0, "ompi_mpi_init[%ld]: time from stage2 xcast to complete mpi_init %ld usec", + opal_output(0, "ompi_mpi_init[%ld]: time from oob wireup to complete mpi_init %ld usec", (long)ORTE_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index bd6b6ab099..877d7c0496 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -50,6 +50,7 @@ bool ompi_mpi_abort_print_stack = false; int ompi_mpi_abort_delay = 0; bool ompi_mpi_keep_peer_hostnames = true; bool ompi_mpi_preconnect_all = false; +bool ompi_mpi_preconnect_oob = false; bool ompi_mpi_leave_pinned = false; bool ompi_mpi_leave_pinned_pipeline = false; @@ -190,6 +191,13 @@ int ompi_mpi_register_params(void) ompi_mpi_preconnect_all = OPAL_INT_TO_BOOL(value); + mca_base_param_reg_int_name("mpi", "preconnect_oob", + "Whether to force MPI processes to fully wire-up the OOB system between MPI processes.", + false, false, + (int) ompi_mpi_preconnect_oob, &value); + + ompi_mpi_preconnect_oob = OPAL_INT_TO_BOOL(value); + /* Leave pinned parameter */ mca_base_param_reg_int_name("mpi", "leave_pinned", diff --git a/ompi/runtime/ompi_mpi_preconnect.c b/ompi/runtime/ompi_mpi_preconnect.c index ab8ae1b20f..bcf85adc59 100644 --- a/ompi/runtime/ompi_mpi_preconnect.c +++ b/ompi/runtime/ompi_mpi_preconnect.c @@ -20,6 +20,8 @@ #include "ompi/communicator/communicator.h" #include "ompi/request/request.h" #include "ompi/runtime/mpiruntime.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" /* * do zero byte IRECV / ISEND: upper half sends to lower half (i.e. do @@ -80,4 +82,65 @@ int ompi_init_do_preconnect(void) return ret; } + + +int ompi_init_do_oob_preconnect(void) +{ + size_t world_size, i, next, prev, my_index; + ompi_proc_t **procs; + int ret; + struct iovec msg[1]; + + procs = ompi_proc_world(&world_size); + + msg[0].iov_base = NULL; + msg[0].iov_len = 0; + + if (world_size == 2) { + if (ompi_proc_local() == procs[0]) { + ret = orte_rml.send(&procs[1]->proc_name, + msg, + 1, + ORTE_RML_TAG_WIREUP, + 0); + if (ret < 0) return ret; + } else { + ret = orte_rml.recv(&procs[0]->proc_name, + msg, + 1, + ORTE_RML_TAG_WIREUP, + 0); + if (ret < 0) return ret; + } + } else if (world_size > 2) { + for (i = 0 ; i < world_size ; ++i) { + if (ompi_proc_local() == procs[i]) { + my_index = i; + break; + } + } + + for (i = 1 ; i <= world_size / 2 ; ++i) { + next = (my_index + i) % world_size; + prev = (my_index - i + world_size) % world_size; + + /* sends do not wait for a match */ + ret = orte_rml.send(&procs[next]->proc_name, + msg, + 1, + ORTE_RML_TAG_WIREUP, + 0); + if (ret < 0) return ret; + + ret = orte_rml.recv(&procs[prev]->proc_name, + msg, + 1, + ORTE_RML_TAG_WIREUP, + 0); + if (ret < 0) return ret; + } + } + + return OMPI_SUCCESS; +} diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 4d43098320..f090b7cc75 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -107,6 +107,11 @@ OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone; */ OMPI_DECLSPEC extern bool ompi_mpi_preconnect_all; + /** + * should we wireup the oob completely during MPI_INIT? + */ + OMPI_DECLSPEC extern bool ompi_mpi_preconnect_oob; + /** * Whether MPI_ABORT should print out an identifying message * (e.g., hostname and PID) and loop waiting for a debugger to diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index 3dc9242c55..546ac98e63 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -229,7 +229,7 @@ int mca_oob_tcp_component_open(void) "connect() timeout in seconds, before trying next interface", false, false, - 600, + 10, &mca_oob_tcp_component.tcp_timeout); @@ -412,7 +412,7 @@ static int mca_oob_tcp_create_listen(void) mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port; /* setup listen backlog to maximum allowed by kernel */ - if(listen(mca_oob_tcp_component.tcp_listen_sd, SOMAXCONN) < 0) { + if(listen(mca_oob_tcp_component.tcp_listen_sd, 5) < 0) { opal_output(0, "mca_oob_tcp_component_init: listen(): %s (%d)", strerror(opal_socket_errno), opal_socket_errno); return ORTE_ERROR; @@ -623,7 +623,7 @@ static int mca_oob_tcp_create_listen_thread(void) mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port; /* setup listen backlog to maximum allowed by kernel */ - if(listen(mca_oob_tcp_component.tcp_listen_sd, SOMAXCONN) < 0) { + if(listen(mca_oob_tcp_component.tcp_listen_sd, 5) < 0) { opal_output(0, "mca_oob_tcp_component_init: listen() failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno); return ORTE_ERROR; diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index e4e79f7193..5dbda6ea9f 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -54,6 +54,7 @@ typedef uint32_t orte_rml_tag_t; #define ORTE_RML_TAG_BPROC 17 #define ORTE_RML_TAG_BPROC_ABORT 18 #define ORTE_RML_TAG_SM_BACK_FILE_CREATED 19 +#define ORTE_RML_TAG_WIREUP 20 #define ORTE_RML_TAG_DYNAMIC 2000