Some relatively minor C/R-related cleanup
* Fix a configure warning when checking --enable-ft-thread.
* In the hnp and orted ErrMgr components, check whether other components have already recovered this process before trying to recover it again.
* Fix 'npernode' when restarting with the resilient rmaps component.
* Export ompi_info_set so that internal functionality can use it.

This commit was SVN r23535.
This commit is contained in:
parent
ea7bf2bd9e
commit
ba7e94dd89
@ -1,6 +1,6 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||||
@ -148,7 +148,7 @@ int ompi_info_dup (ompi_info_t *info, ompi_info_t **newinfo);
|
|||||||
* @retval MPI_SUCCESS upon success
|
* @retval MPI_SUCCESS upon success
|
||||||
* @retval MPI_ERR_NO_MEM if out of memory
|
* @retval MPI_ERR_NO_MEM if out of memory
|
||||||
*/
|
*/
|
||||||
int ompi_info_set (ompi_info_t *info, char *key, char *value);
|
OMPI_DECLSPEC int ompi_info_set (ompi_info_t *info, char *key, char *value);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ompi_info_free - Free an 'MPI_Info' object.
|
* ompi_info_free - Free an 'MPI_Info' object.
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
dnl
|
dnl
|
||||||
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||||
dnl University Research and Technology
|
dnl University Research and Technology
|
||||||
dnl Corporation. All rights reserved.
|
dnl Corporation. All rights reserved.
|
||||||
dnl Copyright (c) 2004-2005 The University of Tennessee and The University
|
dnl Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
@ -248,7 +248,7 @@ elif test "$enable_ft_thread" = "undef" -a "$enable_opal_multi_threads" = "no" ;
|
|||||||
else
|
else
|
||||||
# Default: Enable
|
# Default: Enable
|
||||||
# Make sure we have OPAL Threads enabled
|
# Make sure we have OPAL Threads enabled
|
||||||
if "$enable_opal_multi_threads" = "no"; then
|
if test "$enable_opal_multi_threads" = "no"; then
|
||||||
AC_MSG_RESULT([Must enable OPAL basic thread support to use this option])
|
AC_MSG_RESULT([Must enable OPAL basic thread support to use this option])
|
||||||
AC_MSG_ERROR([Cannot continue])
|
AC_MSG_ERROR([Cannot continue])
|
||||||
else
|
else
|
||||||
|
@ -148,6 +148,18 @@ static int update_state(orte_jobid_t job,
|
|||||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||||
orte_proc_state_to_str(state), pid, exit_code));
|
orte_proc_state_to_str(state), pid, exit_code));
|
||||||
|
|
||||||
|
/********************************
|
||||||
|
* If the modules before us recovered from this error, then do not abort.
|
||||||
|
********************************/
|
||||||
|
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (*stack_state)) ) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||||
|
"errmgr:hnp:update_proc() %s) "
|
||||||
|
"------- A previous component successfully recovered from the process fault of %s! Continuing...",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(proc)));
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* if orte is trying to shutdown, just let it
|
* if orte is trying to shutdown, just let it
|
||||||
*/
|
*/
|
||||||
@ -328,7 +340,7 @@ static int update_state(orte_jobid_t job,
|
|||||||
case ORTE_PROC_STATE_ABORTED:
|
case ORTE_PROC_STATE_ABORTED:
|
||||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||||
if (jdata->enable_recovery) {
|
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jdata->enable_recovery) {
|
||||||
/* is this a local proc */
|
/* is this a local proc */
|
||||||
if (NULL != (child = proc_is_local(proc))) {
|
if (NULL != (child = proc_is_local(proc))) {
|
||||||
/* local proc - see if it has reached its local restart limit */
|
/* local proc - see if it has reached its local restart limit */
|
||||||
@ -455,7 +467,7 @@ static int update_state(orte_jobid_t job,
|
|||||||
/* purge the oob */
|
/* purge the oob */
|
||||||
orte_rml.purge(proc);
|
orte_rml.purge(proc);
|
||||||
|
|
||||||
if (orte_enable_recovery) {
|
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
|
||||||
/* relocate its processes */
|
/* relocate its processes */
|
||||||
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
|
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
|
||||||
/* unable to relocate for some reason */
|
/* unable to relocate for some reason */
|
||||||
@ -493,7 +505,7 @@ static int update_state(orte_jobid_t job,
|
|||||||
|
|
||||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||||
/* heartbeats are only from daemons */
|
/* heartbeats are only from daemons */
|
||||||
if (orte_enable_recovery) {
|
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
|
||||||
/* relocate its processes */
|
/* relocate its processes */
|
||||||
} else {
|
} else {
|
||||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||||
|
@ -118,7 +118,7 @@ static int update_state(orte_jobid_t job,
|
|||||||
orte_errmgr_stack_state_t *stack_state)
|
orte_errmgr_stack_state_t *stack_state)
|
||||||
{
|
{
|
||||||
opal_list_item_t *item, *next;
|
opal_list_item_t *item, *next;
|
||||||
orte_odls_job_t *jobdat;
|
orte_odls_job_t *jobdat = NULL;
|
||||||
orte_odls_child_t *child;
|
orte_odls_child_t *child;
|
||||||
opal_buffer_t alert;
|
opal_buffer_t alert;
|
||||||
orte_plm_cmd_flag_t cmd;
|
orte_plm_cmd_flag_t cmd;
|
||||||
@ -315,7 +315,8 @@ static int update_state(orte_jobid_t job,
|
|||||||
killprocs(proc->jobid, proc->vpid);
|
killprocs(proc->jobid, proc->vpid);
|
||||||
}
|
}
|
||||||
app = jobdat->apps[child->app_idx];
|
app = jobdat->apps[child->app_idx];
|
||||||
if (jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
|
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) &&
|
||||||
|
jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
|
||||||
child->restarts++;
|
child->restarts++;
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||||
"%s errmgr:orted restarting proc %s for the %d time",
|
"%s errmgr:orted restarting proc %s for the %d time",
|
||||||
@ -329,7 +330,7 @@ static int update_state(orte_jobid_t job,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (ORTE_PROC_STATE_TERMINATED < state) {
|
if (ORTE_PROC_STATE_TERMINATED < state) {
|
||||||
if (jobdat->enable_recovery) {
|
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jobdat->enable_recovery) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||||
"%s RECOVERY ENABLED",
|
"%s RECOVERY ENABLED",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
@ -297,19 +297,27 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
*/
|
*/
|
||||||
nd = oldnode; /* Put it back where it was if nothing else is found */
|
nd = oldnode; /* Put it back where it was if nothing else is found */
|
||||||
totprocs = 1000000;
|
totprocs = 1000000;
|
||||||
|
found = false;
|
||||||
/* find the lightest loaded node while deconstructing the list */
|
/* find the lightest loaded node while deconstructing the list */
|
||||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||||
node = (orte_node_t*)item;
|
node = (orte_node_t*)item;
|
||||||
if( node->num_procs < totprocs) {
|
if( !found ) {
|
||||||
|
if( ((int)node->num_procs) < orte_rmaps_base.npernode ) {
|
||||||
|
nd = node;
|
||||||
|
totprocs = 0;
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
else if( node->num_procs < totprocs) {
|
||||||
nd = node;
|
nd = node;
|
||||||
totprocs = node->num_procs;
|
totprocs = node->num_procs;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&node_list);
|
OBJ_DESTRUCT(&node_list);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||||
"%s rmaps:resilient: no avail fault groups found - placing proc on node %s",
|
"%s rmaps:resilient: Placing process on node %s (no ftgrp)",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
nd->name));
|
nd->name));
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||||
* University Research and Technology
|
* University Research and Technology
|
||||||
* Corporation. All rights reserved.
|
* Corporation. All rights reserved.
|
||||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||||
@ -492,12 +492,12 @@ int orte_register_params(void)
|
|||||||
if (ORTE_PROC_IS_HNP) {
|
if (ORTE_PROC_IS_HNP) {
|
||||||
opal_output(orte_clean_output,
|
opal_output(orte_clean_output,
|
||||||
"------------------------------------------------------------------\n"
|
"------------------------------------------------------------------\n"
|
||||||
"The MCA param errmgr_base_enable_recovery was not set to true, but\n"
|
"The MCA param orte_enable_recovery was not set to true, but\n"
|
||||||
"positive value(s) were provided for the number of restarts:\n\n"
|
"positive value(s) were provided for the number of restarts:\n\n"
|
||||||
"Max global restarts: %d\n"
|
"Max global restarts: %d\n"
|
||||||
"Max local restarts: %d\n\n"
|
"Max local restarts: %d\n\n"
|
||||||
"We are enabling process recovery and continuing execution. To avoid\n"
|
"We are enabling process recovery and continuing execution. To avoid\n"
|
||||||
"this warning in the future, please set the errmgr_base_enable_recovery\n"
|
"this warning in the future, please set the orte_enable_recovery\n"
|
||||||
"param to non-zero.\n"
|
"param to non-zero.\n"
|
||||||
"------------------------------------------------------------------",
|
"------------------------------------------------------------------",
|
||||||
orte_max_global_restarts, orte_max_local_restarts);
|
orte_max_global_restarts, orte_max_local_restarts);
|
||||||
|
Loading…
Reference in new issue
Block a user