1
1

Add a new MCA parameter: mpi_keep_peer_hostnames. If this is nonzero
(which is currently the default, although we may argue over this later
:-) ), a new field in the ompi_proc_t named proc_hostname will have
the string hostname of that peer.  If 0, this field will be NULL.

This allows for printing nicer error messages in environments where
peer hostnames are not otherwise easily obtainable, such as the mvapi
BTL (requested by Sandia, who has both a *huge* number of nodes and
6GB of RAM per node, so they don't care about the extra memory usage
;-) ).

This commit was SVN r9902.
Этот коммит содержится в:
Jeff Squyres 2006-05-11 19:46:21 +00:00
родитель fd8fe94e6f
Коммит 51c5516815
4 изменённых файлов: 33 добавлений и 0 удалений

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California. * Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -33,6 +34,7 @@
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "ompi/datatype/dt_arch.h" #include "ompi/datatype/dt_arch.h"
#include "ompi/datatype/convertor.h" #include "ompi/datatype/convertor.h"
#include "ompi/runtime/params.h"
static opal_list_t ompi_proc_list; static opal_list_t ompi_proc_list;
static opal_mutex_t ompi_proc_lock; static opal_mutex_t ompi_proc_lock;
@ -67,6 +69,10 @@ void ompi_proc_construct(ompi_proc_t* proc)
proc->proc_flags = 0; proc->proc_flags = 0;
/* By default, put NULL in the hostname. It may or may not get
filled in later -- consumer of this field beware! */
proc->proc_hostname = NULL;
OPAL_THREAD_LOCK(&ompi_proc_lock); OPAL_THREAD_LOCK(&ompi_proc_lock);
opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc); opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc);
OPAL_THREAD_UNLOCK(&ompi_proc_lock); OPAL_THREAD_UNLOCK(&ompi_proc_lock);
@ -84,6 +90,9 @@ void ompi_proc_destruct(ompi_proc_t* proc)
* destroyed here. It will be destroyed later when the ompi_ddt_finalize is called. * destroyed here. It will be destroyed later when the ompi_ddt_finalize is called.
*/ */
OBJ_RELEASE( proc->proc_convertor ); OBJ_RELEASE( proc->proc_convertor );
if (NULL != proc->proc_hostname) {
free(proc->proc_hostname);
}
OPAL_THREAD_LOCK(&ompi_proc_lock); OPAL_THREAD_LOCK(&ompi_proc_lock);
opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc); opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc);
OPAL_THREAD_UNLOCK(&ompi_proc_lock); OPAL_THREAD_UNLOCK(&ompi_proc_lock);
@ -535,6 +544,11 @@ static void callback(orte_gpr_notify_data_t *data, void *cbdata)
proc->proc_convertor = ompi_convertor_create(proc->proc_arch, 0); proc->proc_convertor = ompi_convertor_create(proc->proc_arch, 0);
} }
/* Save the hostname */
if (ompi_mpi_keep_peer_hostnames) {
proc->proc_hostname = str;
str = NULL;
}
} }
} }
} }

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -46,6 +47,8 @@ struct ompi_proc_t {
struct ompi_convertor_t* proc_convertor; struct ompi_convertor_t* proc_convertor;
/** process-wide lock */ /** process-wide lock */
opal_mutex_t proc_lock; opal_mutex_t proc_lock;
/** Keep the hostname around for debugging purposes */
char *proc_hostname;
/** flags for this proc */ /** flags for this proc */
uint8_t proc_flags; uint8_t proc_flags;
}; };

Просмотреть файл

@ -46,6 +46,8 @@ char *ompi_mpi_show_mca_params_file = NULL;
bool ompi_mpi_paffinity_alone = false; bool ompi_mpi_paffinity_alone = false;
bool ompi_mpi_abort_print_stack = false; bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0; int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_peer_hostnames = true;
int ompi_mpi_register_params(void) int ompi_mpi_register_params(void)
{ {
@ -138,6 +140,14 @@ int ompi_mpi_register_params(void)
true, false, true, false,
-1, NULL); -1, NULL);
/* Do we want to save hostnames for debugging messages? This can
eat quite a bit of memory... */
mca_base_param_reg_int_name("mpi", "keep_peer_hostnames",
"If nonzero, save the string hostnames of all MPI peer processes (mostly for error / debugging output messages). This can add quite a bit of memory usage to each MPI process.",
false, false, 1, &value);
ompi_mpi_keep_peer_hostnames = (bool) value;
/* MPI_ABORT controls */ /* MPI_ABORT controls */
mca_base_param_reg_int_name("mpi", "abort_delay", mca_base_param_reg_int_name("mpi", "abort_delay",

Просмотреть файл

@ -89,6 +89,12 @@ OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;
*/ */
OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone; OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
/**
* Whether we should keep the string hostnames of all the MPI
* process peers around or not (eats up a good bit of memory).
*/
OMPI_DECLSPEC extern bool ompi_mpi_keep_peer_hostnames;
/** /**
* Whether an MPI_ABORT should print out a stack trace or not. * Whether an MPI_ABORT should print out a stack trace or not.
*/ */