1
1

ORTE-level MCA params are defined in several places. ompi_info cannot call orte_init because of an issue with the memory allocator, which has made it impossible for ompi_info to display all of the ORTE-level MCA params.

By consolidating them all into one function, ompi_info can call that function and register the desired variables. This also requires, however, that ompi_info call orte_output_init to avoid generating tons of error messages, so make that adjustment too. 

Fixes ticket #1314

In addition, orte_output has a race condition: calls to orte_output/verbose can occur before the RML has been defined and set up, or before the HNP has been defined. The latter happens during initialization of the orte_process_info structure. In both cases orte_output has no way to send the output to the HNP, so the message must simply be output locally.

Fixes ticket #1315

This commit was SVN r18524.
Этот коммит содержится в:
Ralph Castain 2008-05-28 13:29:58 +00:00
родитель 879a9fe45c
Коммит 828ae26d90
9 изменённых файлов: 193 добавлений и 157 удалений

Просмотреть файл

@ -114,6 +114,7 @@
#endif
#include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h"
#include "orte/util/output.h"
using namespace std;
using namespace ompi_info;
@ -189,6 +190,9 @@ void ompi_info::open_components()
orte_register_params();
// Initialize the orte_output system
orte_output_init();
// Register the MPI layer's MCA parameters
ompi_mpi_register_params();

Просмотреть файл

@ -221,7 +221,6 @@ int main(int argc, char *argv[])
ompi_info::mca_types.push_back("rml");
ompi_info::mca_types.push_back("routed");
ompi_info::mca_types.push_back("plm");
ompi_info::mca_types.push_back("sds");
#if OPAL_ENABLE_FT == 1
ompi_info::mca_types.push_back("snapc");
#endif

Просмотреть файл

@ -47,4 +47,6 @@ libopen_rte_la_SOURCES += \
runtime/orte_wakeup.c \
runtime/orte_locks.c \
runtime/orte_cr.c \
runtime/orte_data_server.c
runtime/orte_data_server.c \
runtime/orte_mca_params.c

Просмотреть файл

@ -52,6 +52,10 @@ bool orte_do_not_launch = false;
bool orted_spin_flag = false;
bool orte_static_ports = false;
bool orte_keep_fqdn_hostnames = false;
bool orte_help_want_aggregate = true;
bool orte_help_show_recursions;
bool orte_params_set = false;
int orte_debug_verbosity;
int32_t orte_contiguous_nodes;
int orte_debug_output = -1;
@ -81,45 +85,13 @@ opal_pointer_array_t *orte_node_pool;
bool orte_initialized = false;
bool orte_finalizing = false;
/* whether we have registered params or not */
static bool params_set = false;
int orte_register_params(void)
int orte_dt_init(void)
{
int value;
int orte_debug_verbosity;
if (params_set) {
return ORTE_SUCCESS;
}
int rc;
opal_data_type_t tmp;
/* set default output */
orte_debug_output = orte_output_open(NULL, "ORTE", "DEBUG", NULL);
mca_base_param_reg_int_name("orte", "debug",
"Top-level ORTE debug switch (default verbosity: 1)",
false, false, (int)false, &value);
orte_debug_flag = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "debug_verbose",
"Verbosity level for ORTE debug messages (default: 1)",
false, false, -1, &orte_debug_verbosity);
mca_base_param_reg_int_name("orte", "debug_daemons",
"Whether to debug the ORTE daemons or not",
false, false, (int)false, &value);
orte_debug_daemons_flag = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "debug_daemons_file",
"Whether want stdout/stderr of daemons to go to a file or not",
false, false, (int)false, &value);
orte_debug_daemons_file_flag = OPAL_INT_TO_BOOL(value);
/* If --debug-daemons-file was specified, that also implies
--debug-daemons */
if (orte_debug_daemons_file_flag) {
orte_debug_daemons_flag = true;
}
/* open up the verbose output for ORTE debugging */
if (orte_debug_flag || 0 < orte_debug_verbosity ||
(orte_debug_daemons_flag && (orte_process_info.daemon || orte_process_info.hnp))) {
@ -130,71 +102,6 @@ int orte_register_params(void)
}
}
mca_base_param_reg_int_name("orte", "do_not_launch",
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
false, false, (int)false, &value);
orte_do_not_launch = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orted", "spin",
"Have any orteds spin until we can connect a debugger to them",
false, false, (int)false, &value);
orted_spin_flag = OPAL_INT_TO_BOOL(value);
/* check for timing requests */
mca_base_param_reg_int_name("orte", "timing",
"Request that critical timing loops be measured",
false, false, (int)false, &value);
orte_timing = OPAL_INT_TO_BOOL(value);
/* User-level debugger info string */
mca_base_param_reg_string_name("orte", "base_user_debugger",
"Sequence of user-level debuggers to search for in orterun",
false, false, "totalview @mpirun@ -a @mpirun_args@ : ddt -n @np@ -start @executable@ @executable_argv@ @single_app@ : fxp @mpirun@ -a @mpirun_args@", NULL);
mca_base_param_reg_int_name("orte", "abort_timeout",
"Max time to wait [in secs] before aborting an ORTE operation (default: 1sec)",
false, false, 1, &value);
orte_max_timeout = 1000000.0 * value; /* convert to usec */
mca_base_param_reg_int_name("orte", "timeout_step",
"Time to wait [in usecs/proc] before aborting an ORTE operation (default: 100 usec/proc)",
false, false, 100, &orte_timeout_usec_per_proc);
/* default hostfile */
mca_base_param_reg_string_name("orte", "default_hostfile",
"Name of the default hostfile (relative or absolute path)",
false, false, NULL, &orte_default_hostfile);
/* whether or not to keep FQDN hostnames */
mca_base_param_reg_int_name("orte", "keep_fqdn_hostnames",
"Whether or not to keep FQDN hostnames [default: no]",
false, false, (int)false, &value);
orte_keep_fqdn_hostnames = OPAL_INT_TO_BOOL(value);
/* whether or not static ports exist */
mca_base_param_reg_int_name("orte", "static_ports",
"Whether or not static ports are in use [default: no]",
false, false, (int)false, &value);
orte_static_ports = OPAL_INT_TO_BOOL(value);
/* whether or not contiguous nodenames are in use */
mca_base_param_reg_int_name("orte", "contiguous_nodes",
"Number of nodes after which contiguous nodenames will be used [default: INT_MAX]",
false, false, INT32_MAX, &orte_contiguous_nodes);
/* All done */
params_set = true;
return ORTE_SUCCESS;
}
int orte_dt_init(void)
{
int rc;
opal_data_type_t tmp;
/** register the base system types with the DSS */
tmp = ORTE_STD_CNTR;
if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_std_cntr,

Просмотреть файл

@ -319,6 +319,10 @@ ORTE_DECLSPEC extern bool orte_static_ports;
ORTE_DECLSPEC extern int32_t orte_contiguous_nodes;
ORTE_DECLSPEC extern int orte_debug_output;
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
ORTE_DECLSPEC extern bool orte_help_want_aggregate;
ORTE_DECLSPEC extern bool orte_help_show_recursions;
ORTE_DECLSPEC extern bool orte_params_set;
ORTE_DECLSPEC extern int orte_debug_verbosity;
ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;

Просмотреть файл

@ -69,6 +69,15 @@ int orte_init(char flags)
return ret;
}
/* ensure we know the tool setting for when we finalize */
if ((flags & ORTE_TOOL) || (flags & ORTE_TOOL_WITH_NAME)) {
orte_process_info.tool = true;
}
if (orte_process_info.hnp) {
orte_process_info.daemon = false;
}
/* setup the orte_output system */
if (ORTE_SUCCESS != (ret = orte_output_init())) {
ORTE_ERROR_LOG(ret);
@ -91,29 +100,23 @@ int orte_init(char flags)
goto error;
}
/* Ensure the process info structure is instantiated and initialized */
if (ORTE_SUCCESS != (ret = orte_proc_info())) {
error = "orte_proc_info";
goto error;
}
/* ensure we know the tool setting for when we finalize */
if ((flags & ORTE_TOOL) || (flags & ORTE_TOOL_WITH_NAME)) {
orte_process_info.tool = true;
}
/* Initialize the ORTE data type support */
if (ORTE_SUCCESS != (ret = orte_dt_init())) {
error = "orte_dt_init";
goto error;
}
/* Ensure the rest of the process info structure is initialized */
if (ORTE_SUCCESS != (ret = orte_proc_info())) {
error = "orte_proc_info";
goto error;
}
/* if I'm the HNP, make sure that the daemon flag is NOT set so that
* components unique to non-HNP orteds can be selected and init
* my basic storage elements
*/
if (orte_process_info.hnp) {
orte_process_info.daemon = false;
if (ORTE_SUCCESS != (ret = orte_hnp_globals_init())) {
error = "orte_hnp_globals_init";
goto error;

139
orte/runtime/orte_mca_params.c Обычный файл
Просмотреть файл

@ -0,0 +1,139 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/util/output.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
/*
 * Register the ORTE-level MCA parameters in a single place and copy the
 * resulting values into the corresponding ORTE globals.
 *
 * The work is done only once: when orte_params_set is already true the
 * function returns immediately.  On completion orte_params_set is set
 * to true.
 *
 * Returns ORTE_SUCCESS in every case; the return codes of the individual
 * registration calls are not examined.
 */
int orte_register_params(void)
{
    int param_val;   /* scratch storage for integer params that become bools */

    /* nothing to do if a previous call already registered everything */
    if (orte_params_set) {
        return ORTE_SUCCESS;
    }

    /* ---- debugging switches ---- */
    mca_base_param_reg_int_name("orte",
                                "debug",
                                "Top-level ORTE debug switch (default verbosity: 1)",
                                false, false,
                                (int)false,
                                &param_val);
    orte_debug_flag = OPAL_INT_TO_BOOL(param_val);

    /* NOTE(review): the help text says "default: 1" but the registered
     * default below is -1 -- confirm which one is intended.
     */
    mca_base_param_reg_int_name("orte",
                                "debug_verbose",
                                "Verbosity level for ORTE debug messages (default: 1)",
                                false, false,
                                -1,
                                &orte_debug_verbosity);

    mca_base_param_reg_int_name("orte",
                                "debug_daemons",
                                "Whether to debug the ORTE daemons or not",
                                false, false,
                                (int)false,
                                &param_val);
    orte_debug_daemons_flag = OPAL_INT_TO_BOOL(param_val);

    mca_base_param_reg_int_name("orte",
                                "debug_daemons_file",
                                "Whether want stdout/stderr of daemons to go to a file or not",
                                false, false,
                                (int)false,
                                &param_val);
    orte_debug_daemons_file_flag = OPAL_INT_TO_BOOL(param_val);

    /* asking for the daemons' output in a file also turns on
     * daemon debugging itself
     */
    if (orte_debug_daemons_file_flag) {
        orte_debug_daemons_flag = true;
    }

    /* ---- launch behaviour ---- */
    mca_base_param_reg_int_name("orte",
                                "do_not_launch",
                                "Perform all necessary operations to prepare to launch the application, but do not actually launch it",
                                false, false,
                                (int)false,
                                &param_val);
    orte_do_not_launch = OPAL_INT_TO_BOOL(param_val);

    mca_base_param_reg_int_name("orte",
                                "daemon_spin",
                                "Have any orteds spin until we can connect a debugger to them",
                                false, false,
                                (int)false,
                                &param_val);
    orted_spin_flag = OPAL_INT_TO_BOOL(param_val);

    /* ---- timing requests ---- */
    mca_base_param_reg_int_name("orte",
                                "timing",
                                "Request that critical timing loops be measured",
                                false, false,
                                (int)false,
                                &param_val);
    orte_timing = OPAL_INT_TO_BOOL(param_val);

    /* ---- user-level debugger command sequence; registered only, the
     *      value is not stored here (NULL storage pointer) ----
     */
    mca_base_param_reg_string_name("orte",
                                   "base_user_debugger",
                                   "Sequence of user-level debuggers to search for in orterun",
                                   false, false,
                                   "totalview @mpirun@ -a @mpirun_args@ : ddt -n @np@ -start @executable@ @executable_argv@ @single_app@ : fxp @mpirun@ -a @mpirun_args@",
                                   NULL);

    /* ---- timeouts ---- */
    mca_base_param_reg_int_name("orte",
                                "abort_timeout",
                                "Max time to wait [in secs] before aborting an ORTE operation (default: 1sec)",
                                false, false,
                                1,
                                &param_val);
    /* the parameter is given in seconds; the global holds microseconds */
    orte_max_timeout = 1000000.0 * param_val;

    mca_base_param_reg_int_name("orte",
                                "timeout_step",
                                "Time to wait [in usecs/proc] before aborting an ORTE operation (default: 100 usec/proc)",
                                false, false,
                                100,
                                &orte_timeout_usec_per_proc);

    /* ---- hostfile / hostname handling ---- */
    mca_base_param_reg_string_name("orte",
                                   "default_hostfile",
                                   "Name of the default hostfile (relative or absolute path)",
                                   false, false,
                                   NULL,
                                   &orte_default_hostfile);

    mca_base_param_reg_int_name("orte",
                                "keep_fqdn_hostnames",
                                "Whether or not to keep FQDN hostnames [default: no]",
                                false, false,
                                (int)false,
                                &param_val);
    orte_keep_fqdn_hostnames = OPAL_INT_TO_BOOL(param_val);

    /* NOTE(review): no "static_ports" parameter is registered in this
     * function -- confirm that this is intentional.
     */

    mca_base_param_reg_int_name("orte",
                                "contiguous_nodes",
                                "Number of nodes after which contiguous nodename encoding will automatically be used [default: INT_MAX]",
                                false, false,
                                INT32_MAX,
                                &orte_contiguous_nodes);

    /* ---- help-message / output behaviour ---- */
    /* the current value of orte_help_want_aggregate serves as the default */
    mca_base_param_reg_int_name("orte",
                                "base_help_aggregate",
                                "If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually.  This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
                                false, false,
                                (int) orte_help_want_aggregate,
                                &param_val);
    orte_help_want_aggregate = OPAL_INT_TO_BOOL(param_val);

    mca_base_param_reg_int_name("orte",
                                "base_show_output_recursions",
                                "If orte_base_show_output_recursion is true, recursive calls to orte_output will be reported to stderr",
                                false, false,
                                (int) false,
                                &param_val);
    orte_help_show_recursions = OPAL_INT_TO_BOOL(param_val);

    /* ---- parameters that are consumed elsewhere but registered here
     *      so that ompi_info can display them ----
     */
    mca_base_param_reg_string_name("orte",
                                   "tmpdir_base",
                                   "Base of the session directory tree",
                                   false, false,
                                   NULL,
                                   &(orte_process_info.tmpdir_base));

    /* remember that registration has been done */
    orte_params_set = true;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -80,31 +80,6 @@ int orte_output_get_verbosity(int output_id)
return opal_output_get_verbosity(output_id);
}
/* Whether we aggregate show_help() messages or not */
static bool want_aggregate = true;
/* Whether to report recursions or not */
static bool show_recursions;
/*
 * Register the two orte_output-related MCA parameters and store their
 * boolean values in the file-level flags want_aggregate and
 * show_recursions.
 */
static void register_mca(void)
{
    int param_val;

    /* help-message aggregation; the flag's current value is the default */
    mca_base_param_reg_int_name("orte",
                                "base_help_aggregate",
                                "If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually.  This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
                                false, false,
                                (int) want_aggregate,
                                &param_val);
    want_aggregate = OPAL_INT_TO_BOOL(param_val);

    /* reporting of recursive orte_output calls; off by default */
    mca_base_param_reg_int_name("orte",
                                "base_show_output_recursions",
                                "If orte_base_show_output_recursion is true, recursive calls to orte_output will be reported to stderr",
                                false, false,
                                (int) false,
                                &param_val);
    show_recursions = OPAL_INT_TO_BOOL(param_val);
}
/************************************************************************/
@ -118,7 +93,7 @@ int orte_output_init(void)
{
stderr_stream = opal_output_open(NULL);
regiester_mca();
if (0 == ORTE_PROC_MY_NAME->vpid && want_aggregate) {
if (0 == ORTE_PROC_MY_NAME->vpid && orte_help_want_aggregate) {
orte_output(stderr_stream, "WARNING: orte_base_help_aggregate was set to true, but this system does not support help message aggregation");
}
return ORTE_SUCCESS;
@ -429,7 +404,7 @@ static void output_vverbose(int verbose_level, int output_id,
HNP).
*/
if (am_inside) {
if (show_recursions) {
/* report the recursion only when recursion reporting was requested */
if (orte_help_show_recursions) {
opal_output(0, "%s orte_output recursion detected!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
opal_output(output_id, filtered);
@ -474,9 +449,12 @@ static void output_vverbose(int verbose_level, int output_id,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
filtered, output_id));
/* If RML is not yet setup, then just output this locally.
What else can we do? */
if (NULL == orte_rml.send_buffer) {
/* If RML is not yet setup, or we haven't yet defined the HNP,
* then just output this locally.
* What else can we do?
*/
if (NULL == orte_rml.send_buffer ||
ORTE_PROC_MY_HNP->vpid == ORTE_VPID_INVALID) {
opal_output(0, filtered);
} else {
/* setup a buffer to send to the HNP */
@ -541,7 +519,7 @@ static int show_help(const char *filename, const char *topic,
/* If we're aggregating, check for duplicates. Otherwise, don't
track duplicates at all and always display the message. */
if (orte_output_ready && want_aggregate) {
if (orte_output_ready && orte_help_want_aggregate) {
rc = get_tli(filename, topic, &tli);
} else {
rc = ORTE_ERR_NOT_FOUND;
@ -598,7 +576,7 @@ static int show_help(const char *filename, const char *topic,
}
/* If we're aggregating, add this process name to the list */
if (orte_output_ready && want_aggregate) {
if (orte_output_ready && orte_help_want_aggregate) {
pnli = OBJ_NEW(process_name_list_item_t);
if (NULL == pnli) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
@ -711,8 +689,6 @@ int orte_output_init(void)
{
OPAL_OUTPUT_VERBOSE((5, orte_debug_output, "orte_output init"));
register_mca();
/* define the default stream that has everything off */
OBJ_CONSTRUCT(&orte_output_default, opal_output_stream_t);
@ -1031,11 +1007,17 @@ int orte_show_help(const char *filename, const char *topic,
return ORTE_SUCCESS;
}
if (orte_process_info.hnp) {
/* if we are the HNP, or the RML has not yet been setup,
* or we don't yet know our HNP, then all we can do
* is process this locally
*/
if (orte_process_info.hnp ||
NULL == orte_rml.send_buffer ||
ORTE_PROC_MY_HNP->vpid == ORTE_VPID_INVALID) {
rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME);
}
/* if we are not the HNP, then we must relay the output message to
/* otherwise, we relay the output message to
* the HNP for processing
*/
else {

Просмотреть файл

@ -42,7 +42,7 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .my_name = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID},
/* .my_daemon = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID},
/* .my_daemon_uri = */ NULL,
/* .my_hnp = */ {0, 0},
/* .my_hnp = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID},
/* .my_hnp_uri = */ NULL,
/* .hnp_pid = */ 0,
/* ,app_num = */ -1,
@ -134,10 +134,6 @@ int orte_proc_info(void)
true, false, -1, &tmp);
orte_process_info.universe_size = tmp;
mca_base_param_reg_string_name("orte", "tmpdir_base",
"Base of the session directory tree",
false, false, NULL, &(orte_process_info.tmpdir_base));
/* get the process id */
orte_process_info.pid = getpid();