1
1

Some relatively minor C/R-related cleanup

* Fix a configure warning when checking --enable-ft-thread.
* In the hnp and orted ErrMgr components, check whether other components have already recovered this process before trying to recover it again.
* Fix 'npernode' when restarting using the resilient rmaps component.
* Export ompi_info_set so that internal functionality can use it.

This commit was SVN r23535.
Этот коммит содержится в:
Josh Hursey 2010-07-30 18:59:34 +00:00
родитель ea7bf2bd9e
Коммит ba7e94dd89
6 изменённых файлов: 39 добавлений и 18 удалений

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
@ -148,7 +148,7 @@ int ompi_info_dup (ompi_info_t *info, ompi_info_t **newinfo);
* @retval MPI_SUCCESS upon success
* @retval MPI_ERR_NO_MEM if out of memory
*/
int ompi_info_set (ompi_info_t *info, char *key, char *value);
OMPI_DECLSPEC int ompi_info_set (ompi_info_t *info, char *key, char *value);
/**
* ompi_info_free - Free an 'MPI_Info' object.

Просмотреть файл

@ -1,5 +1,5 @@
dnl
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
dnl University Research and Technology
dnl Corporation. All rights reserved.
dnl Copyright (c) 2004-2005 The University of Tennessee and The University
@ -248,7 +248,7 @@ elif test "$enable_ft_thread" = "undef" -a "$enable_opal_multi_threads" = "no" ;
else
# Default: Enable
# Make sure we have OPAL Threads enabled
if "$enable_opal_multi_threads" = "no"; then
if test "$enable_opal_multi_threads" = "no"; then
AC_MSG_RESULT([Must enable OPAL basic thread support to use this option])
AC_MSG_ERROR([Cannot continue])
else

Просмотреть файл

@ -148,6 +148,18 @@ static int update_state(orte_jobid_t job,
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state), pid, exit_code));
/********************************
* If the modules before us recovered from this error, then do not abort.
********************************/
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (*stack_state)) ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:hnp:update_proc() %s) "
"------- A previous component successfully recovered from the process fault of %s! Continuing...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
return ORTE_SUCCESS;
}
/*
* if orte is trying to shutdown, just let it
*/
@ -328,7 +340,7 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_ABORTED:
case ORTE_PROC_STATE_ABORTED_BY_SIG:
case ORTE_PROC_STATE_TERM_WO_SYNC:
if (jdata->enable_recovery) {
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jdata->enable_recovery) {
/* is this a local proc */
if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its local restart limit */
@ -455,7 +467,7 @@ static int update_state(orte_jobid_t job,
/* purge the oob */
orte_rml.purge(proc);
if (orte_enable_recovery) {
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
/* relocate its processes */
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
/* unable to relocate for some reason */
@ -493,7 +505,7 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
/* heartbeats are only from daemons */
if (orte_enable_recovery) {
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
/* relocate its processes */
} else {
record_dead_daemon(jdata, proc->vpid, state, exit_code);

Просмотреть файл

@ -118,7 +118,7 @@ static int update_state(orte_jobid_t job,
orte_errmgr_stack_state_t *stack_state)
{
opal_list_item_t *item, *next;
orte_odls_job_t *jobdat;
orte_odls_job_t *jobdat = NULL;
orte_odls_child_t *child;
opal_buffer_t alert;
orte_plm_cmd_flag_t cmd;
@ -315,7 +315,8 @@ static int update_state(orte_jobid_t job,
killprocs(proc->jobid, proc->vpid);
}
app = jobdat->apps[child->app_idx];
if (jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) &&
jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
@ -329,7 +330,7 @@ static int update_state(orte_jobid_t job,
}
if (ORTE_PROC_STATE_TERMINATED < state) {
if (jobdat->enable_recovery) {
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jobdat->enable_recovery) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s RECOVERY ENABLED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -117,7 +117,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
int rc=ORTE_SUCCESS;
int rc = ORTE_SUCCESS;
float avgload, minload;
orte_node_t *node, *nd=NULL, *oldnode;
orte_rmaps_res_ftgrp_t *ftgrp, *target = NULL;
@ -297,19 +297,27 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
*/
nd = oldnode; /* Put it back where it was if nothing else is found */
totprocs = 1000000;
found = false;
/* find the lightest loaded node while deconstructing the list */
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
if( node->num_procs < totprocs) {
nd = node;
totprocs = node->num_procs;
if( !found ) {
if( ((int)node->num_procs) < orte_rmaps_base.npernode ) {
nd = node;
totprocs = 0;
found = true;
}
else if( node->num_procs < totprocs) {
nd = node;
totprocs = node->num_procs;
}
}
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: no avail fault groups found - placing proc on node %s",
"%s rmaps:resilient: Placing process on node %s (no ftgrp)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
nd->name));

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
@ -492,12 +492,12 @@ int orte_register_params(void)
if (ORTE_PROC_IS_HNP) {
opal_output(orte_clean_output,
"------------------------------------------------------------------\n"
"The MCA param errmgr_base_enable_recovery was not set to true, but\n"
"The MCA param orte_enable_recovery was not set to true, but\n"
"positive value(s) were provided for the number of restarts:\n\n"
"Max global restarts: %d\n"
"Max local restarts: %d\n\n"
"We are enabling process recovery and continuing execution. To avoid\n"
"this warning in the future, please set the errmgr_base_enable_recovery\n"
"this warning in the future, please set the orte_enable_recovery\n"
"param to non-zero.\n"
"------------------------------------------------------------------",
orte_max_global_restarts, orte_max_local_restarts);