From 51c5516815a8425320a5486161a55455d522b111 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 11 May 2006 19:46:21 +0000 Subject: [PATCH] Add a new MCA parameter: mpi_keep_peer_hostnames. If this is nonzero, (which is currently the default, although we may argue over this later :-) ), a new field in the ompi_proc_t named proc_hostname will have the string hostname of that peer. If 0, this field will be NULL. This allows for printing nicer error messages in environments where peer hostnames are not otherwise easily obtainable, such as the mvapi BTL (requested by Sandia, who has both a *huge* number of nodes and 6GB of RAM per node, so they don't care about the extra memory usage ;-) ). This commit was SVN r9902. --- ompi/proc/proc.c | 14 ++++++++++++++ ompi/proc/proc.h | 3 +++ ompi/runtime/ompi_mpi_params.c | 10 ++++++++++ ompi/runtime/params.h | 6 ++++++ 4 files changed, 33 insertions(+) diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index 76199d8c72..41592ef2ca 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -33,6 +34,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/datatype/dt_arch.h" #include "ompi/datatype/convertor.h" +#include "ompi/runtime/params.h" static opal_list_t ompi_proc_list; static opal_mutex_t ompi_proc_lock; @@ -67,6 +69,10 @@ void ompi_proc_construct(ompi_proc_t* proc) proc->proc_flags = 0; + /* By default, put NULL in the hostname. It may or may not get + filled in later -- consumer of this field beware! */ + proc->proc_hostname = NULL; + OPAL_THREAD_LOCK(&ompi_proc_lock); opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc); OPAL_THREAD_UNLOCK(&ompi_proc_lock); @@ -84,6 +90,9 @@ void ompi_proc_destruct(ompi_proc_t* proc) * destroyed here. It will be destroyed later when the ompi_ddt_finalize is called. */ OBJ_RELEASE( proc->proc_convertor ); + if (NULL != proc->proc_hostname) { + free(proc->proc_hostname); + } OPAL_THREAD_LOCK(&ompi_proc_lock); opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc); OPAL_THREAD_UNLOCK(&ompi_proc_lock); @@ -535,6 +544,11 @@ static void callback(orte_gpr_notify_data_t *data, void *cbdata) proc->proc_convertor = ompi_convertor_create(proc->proc_arch, 0); } + /* Save the hostname */ + if (ompi_mpi_keep_peer_hostnames) { + proc->proc_hostname = str; + str = NULL; + } } } } diff --git a/ompi/proc/proc.h b/ompi/proc/proc.h index 998490537a..e61aacb87c 100644 --- a/ompi/proc/proc.h +++ b/ompi/proc/proc.h @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,6 +47,8 @@ struct ompi_proc_t { struct ompi_convertor_t* proc_convertor; /** process-wide lock */ opal_mutex_t proc_lock; + /** Keep the hostname around for debugging purposes */ + char *proc_hostname; /** flags for this proc */ uint8_t proc_flags; }; diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index 54d789a6e2..a0c5b77272 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -46,6 +46,8 @@ char *ompi_mpi_show_mca_params_file = NULL; bool ompi_mpi_paffinity_alone = false; bool ompi_mpi_abort_print_stack = false; int ompi_mpi_abort_delay = 0; +bool ompi_mpi_keep_peer_hostnames = true; + int ompi_mpi_register_params(void) { @@ -138,6 +140,14 @@ int ompi_mpi_register_params(void) true, false, -1, NULL); + /* Do we want to save hostnames for debugging messages? This can + eat quite a bit of memory... */ + + mca_base_param_reg_int_name("mpi", "keep_peer_hostnames", + "If nonzero, save the string hostnames of all MPI peer processes (mostly for error / debugging output messages). This can add quite a bit of memory usage to each MPI process.", + false, false, 1, &value); + ompi_mpi_keep_peer_hostnames = (bool) value; + /* MPI_ABORT controls */ mca_base_param_reg_int_name("mpi", "abort_delay", diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 9928f91bca..b395e0932b 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -89,6 +89,12 @@ OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file; */ OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone; + /** + * Whether we should keep the string hostnames of all the MPI + * process peers around or not (eats up a good bit of memory). + */ + OMPI_DECLSPEC extern bool ompi_mpi_keep_peer_hostnames; + /** * Whether an MPI_ABORT should print out a stack trace or not. */