From 8b28e5b33d19b659a8d7572f62731a126169b06d Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 9 Feb 2007 20:17:37 +0000 Subject: [PATCH] Allow the OOB to connect between all MPI applications during MPI_INIT without also establishing MPI connectivity. This commit was SVN r13595. --- ompi/runtime/mpiruntime.h | 3 ++ ompi/runtime/ompi_mpi_init.c | 31 ++++++++++---- ompi/runtime/ompi_mpi_params.c | 10 +++++ ompi/runtime/ompi_mpi_preconnect.c | 65 ++++++++++++++++++++++++++++++ ompi/runtime/params.h | 7 ++++ orte/mca/rml/rml_types.h | 3 ++ 6 files changed, 112 insertions(+), 7 deletions(-) diff --git a/ompi/runtime/mpiruntime.h b/ompi/runtime/mpiruntime.h index 5d3b242a13..ab01e3efa5 100644 --- a/ompi/runtime/mpiruntime.h +++ b/ompi/runtime/mpiruntime.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -110,6 +112,7 @@ extern "C" { * be made if they will be made). */ int ompi_init_do_preconnect(void); + int ompi_init_do_oob_preconnect(void); #if defined(c_plusplus) || defined(__cplusplus) } diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index f15327726b..0e01af9ea5 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006 University of Houston. All rights reserved. * @@ -661,6 +661,27 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) gettimeofday(&ompistart, NULL); } + /* wire up the oob interface, if requested. Do this here because + it will go much faster before the event library is switched + into non-blocking mode */ + if (ompi_mpi_preconnect_oob) { + if (OMPI_SUCCESS != (ret = ompi_init_do_oob_preconnect())) { + error = "ompi_mpi_do_preconnect_oob() failed"; + goto error; + } + } + + /* check for timing request - get stop time and report elapsed + time if so, then start the clock again */ + if (timing) { + gettimeofday(&ompistop, NULL); + opal_output(0, "ompi_mpi_init[%ld]: time from stage 2 cast to complete oob wireup %ld usec", + (long)ORTE_PROC_MY_NAME->vpid, + (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + + (ompistop.tv_usec - ompistart.tv_usec))); + gettimeofday(&ompistart, NULL); + } + #if OMPI_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. Don't block in the event library, so that communications don't take @@ -733,14 +754,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* At this point, we are fully configured and in MPI mode. Any communication calls here will work exactly like they would in the user's code. Setup the connections between procs and warm - them up with simple sends, if requested*/ + them up with simple sends, if requested */ if (ompi_mpi_preconnect_all) { if (OMPI_SUCCESS != (ret = ompi_init_do_preconnect())) { error = "ompi_mpi_do_preconnect_all() failed"; - /* This will loop back up above, but ret != OMPI_SUCCESS, - so we'll end up returning out of this function before - getting here (and therefore avoiding an infinite - loop) */ goto error; } } @@ -769,7 +786,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* check for timing request - get stop time and report elapsed time if so */ if (timing) { gettimeofday(&ompistop, NULL); - opal_output(0, "ompi_mpi_init[%ld]: time from stage2 xcast to complete mpi_init %ld usec", + opal_output(0, "ompi_mpi_init[%ld]: time from oob wireup to complete mpi_init %ld usec", (long)ORTE_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index bd6b6ab099..1f59d1cbd0 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -50,6 +52,7 @@ bool ompi_mpi_abort_print_stack = false; int ompi_mpi_abort_delay = 0; bool ompi_mpi_keep_peer_hostnames = true; bool ompi_mpi_preconnect_all = false; +bool ompi_mpi_preconnect_oob = false; bool ompi_mpi_leave_pinned = false; bool ompi_mpi_leave_pinned_pipeline = false; @@ -190,6 +193,13 @@ int ompi_mpi_register_params(void) ompi_mpi_preconnect_all = OPAL_INT_TO_BOOL(value); + mca_base_param_reg_int_name("mpi", "preconnect_oob", + "Whether to force MPI processes to fully wire-up the OOB system between MPI processes.", + false, false, + (int) ompi_mpi_preconnect_oob, &value); + + ompi_mpi_preconnect_oob = OPAL_INT_TO_BOOL(value); + /* Leave pinned parameter */ mca_base_param_reg_int_name("mpi", "leave_pinned", diff --git a/ompi/runtime/ompi_mpi_preconnect.c b/ompi/runtime/ompi_mpi_preconnect.c index ab8ae1b20f..35159e898c 100644 --- a/ompi/runtime/ompi_mpi_preconnect.c +++ b/ompi/runtime/ompi_mpi_preconnect.c @@ -5,6 +5,8 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -20,6 +22,8 @@ #include "ompi/communicator/communicator.h" #include "ompi/request/request.h" #include "ompi/runtime/mpiruntime.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" /* * do zero byte IRECV / ISEND: upper half sends to lower half (i.e. do @@ -80,4 +84,65 @@ int ompi_init_do_preconnect(void) return ret; } + + +int ompi_init_do_oob_preconnect(void) +{ + size_t world_size, i, next, prev, my_index; + ompi_proc_t **procs; + int ret; + struct iovec msg[1]; + + procs = ompi_proc_world(&world_size); + + msg[0].iov_base = NULL; + msg[0].iov_len = 0; + + if (world_size == 2) { + if (ompi_proc_local() == procs[0]) { + ret = orte_rml.send(&procs[1]->proc_name, + msg, + 1, + ORTE_RML_TAG_WIREUP, + 0); + if (ret < 0) return ret; + } else { + ret = orte_rml.recv(&procs[0]->proc_name, + msg, + 1, + ORTE_RML_TAG_WIREUP, + 0); + if (ret < 0) return ret; + } + } else if (world_size > 2) { + for (i = 0 ; i < world_size ; ++i) { + if (ompi_proc_local() == procs[i]) { + my_index = i; + break; + } + } + + for (i = 1 ; i <= world_size / 2 ; ++i) { + next = (my_index + i) % world_size; + prev = (my_index - i + world_size) % world_size; + + /* sends do not wait for a match */ + ret = orte_rml.send(&procs[next]->proc_name, + msg, + 1, + ORTE_RML_TAG_WIREUP, + 0); + if (ret < 0) return ret; + + ret = orte_rml.recv(&procs[prev]->proc_name, + msg, + 1, + ORTE_RML_TAG_WIREUP, + 0); + if (ret < 0) return ret; + } + } + + return OMPI_SUCCESS; +} diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 4d43098320..1a32d65f85 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * @@ -107,6 +109,11 @@ OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone; */ OMPI_DECLSPEC extern bool ompi_mpi_preconnect_all; + /** + * should we wireup the oob completely during MPI_INIT? + */ + OMPI_DECLSPEC extern bool ompi_mpi_preconnect_oob; + /** * Whether MPI_ABORT should print out an identifying message * (e.g., hostname and PID) and loop waiting for a debugger to diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index e4e79f7193..fd08e56f82 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -54,6 +56,7 @@ typedef uint32_t orte_rml_tag_t; #define ORTE_RML_TAG_BPROC 17 #define ORTE_RML_TAG_BPROC_ABORT 18 #define ORTE_RML_TAG_SM_BACK_FILE_CREATED 19 +#define ORTE_RML_TAG_WIREUP 20 #define ORTE_RML_TAG_DYNAMIC 2000