Some relatively minor C/R related cleanup
* Fix a configure warning when checking --enable-ft-thread.
* In the hnp and orted ErrMgr components, check whether other components have already recovered this process before trying to recover it again.
* Fix 'npernode' for restarting using the resilient rmaps component.
* Export ompi_info_set so that internal functionality can use it.

This commit was SVN r23535.
Этот коммит содержится в:
родитель
ea7bf2bd9e
Коммит
ba7e94dd89
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
@ -148,7 +148,7 @@ int ompi_info_dup (ompi_info_t *info, ompi_info_t **newinfo);
|
||||
* @retval MPI_SUCCESS upon success
|
||||
* @retval MPI_ERR_NO_MEM if out of memory
|
||||
*/
|
||||
int ompi_info_set (ompi_info_t *info, char *key, char *value);
|
||||
OMPI_DECLSPEC int ompi_info_set (ompi_info_t *info, char *key, char *value);
|
||||
|
||||
/**
|
||||
* ompi_info_free - Free an 'MPI_Info' object.
|
||||
|
@ -1,5 +1,5 @@
|
||||
dnl
|
||||
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
dnl University Research and Technology
|
||||
dnl Corporation. All rights reserved.
|
||||
dnl Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
@ -248,7 +248,7 @@ elif test "$enable_ft_thread" = "undef" -a "$enable_opal_multi_threads" = "no" ;
|
||||
else
|
||||
# Default: Enable
|
||||
# Make sure we have OPAL Threads enabled
|
||||
if "$enable_opal_multi_threads" = "no"; then
|
||||
if test "$enable_opal_multi_threads" = "no"; then
|
||||
AC_MSG_RESULT([Must enable OPAL basic thread support to use this option])
|
||||
AC_MSG_ERROR([Cannot continue])
|
||||
else
|
||||
|
@ -148,6 +148,18 @@ static int update_state(orte_jobid_t job,
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), pid, exit_code));
|
||||
|
||||
/********************************
|
||||
* If the modules before us recovered from this error, then do not abort.
|
||||
********************************/
|
||||
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (*stack_state)) ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:hnp:update_proc() %s) "
|
||||
"------- A previous component successfully recovered from the process fault of %s! Continuing...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
@ -328,7 +340,7 @@ static int update_state(orte_jobid_t job,
|
||||
case ORTE_PROC_STATE_ABORTED:
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
if (jdata->enable_recovery) {
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jdata->enable_recovery) {
|
||||
/* is this a local proc */
|
||||
if (NULL != (child = proc_is_local(proc))) {
|
||||
/* local proc - see if it has reached its local restart limit */
|
||||
@ -455,7 +467,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* purge the oob */
|
||||
orte_rml.purge(proc);
|
||||
|
||||
if (orte_enable_recovery) {
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
|
||||
/* relocate its processes */
|
||||
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
|
||||
/* unable to relocate for some reason */
|
||||
@ -493,7 +505,7 @@ static int update_state(orte_jobid_t job,
|
||||
|
||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||
/* heartbeats are only from daemons */
|
||||
if (orte_enable_recovery) {
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
|
||||
/* relocate its processes */
|
||||
} else {
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
|
@ -118,7 +118,7 @@ static int update_state(orte_jobid_t job,
|
||||
orte_errmgr_stack_state_t *stack_state)
|
||||
{
|
||||
opal_list_item_t *item, *next;
|
||||
orte_odls_job_t *jobdat;
|
||||
orte_odls_job_t *jobdat = NULL;
|
||||
orte_odls_child_t *child;
|
||||
opal_buffer_t alert;
|
||||
orte_plm_cmd_flag_t cmd;
|
||||
@ -315,7 +315,8 @@ static int update_state(orte_jobid_t job,
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
}
|
||||
app = jobdat->apps[child->app_idx];
|
||||
if (jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) &&
|
||||
jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
|
||||
child->restarts++;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted restarting proc %s for the %d time",
|
||||
@ -329,7 +330,7 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_TERMINATED < state) {
|
||||
if (jobdat->enable_recovery) {
|
||||
if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jobdat->enable_recovery) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s RECOVERY ENABLED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -297,19 +297,27 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
*/
|
||||
nd = oldnode; /* Put it back where it was if nothing else is found */
|
||||
totprocs = 1000000;
|
||||
found = false;
|
||||
/* find the lightest loaded node while deconstructing the list */
|
||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
node = (orte_node_t*)item;
|
||||
if( node->num_procs < totprocs) {
|
||||
if( !found ) {
|
||||
if( ((int)node->num_procs) < orte_rmaps_base.npernode ) {
|
||||
nd = node;
|
||||
totprocs = 0;
|
||||
found = true;
|
||||
}
|
||||
else if( node->num_procs < totprocs) {
|
||||
nd = node;
|
||||
totprocs = node->num_procs;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: no avail fault groups found - placing proc on node %s",
|
||||
"%s rmaps:resilient: Placing process on node %s (no ftgrp)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
nd->name));
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
@ -492,12 +492,12 @@ int orte_register_params(void)
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
opal_output(orte_clean_output,
|
||||
"------------------------------------------------------------------\n"
|
||||
"The MCA param errmgr_base_enable_recovery was not set to true, but\n"
|
||||
"The MCA param orte_enable_recovery was not set to true, but\n"
|
||||
"positive value(s) were provided for the number of restarts:\n\n"
|
||||
"Max global restarts: %d\n"
|
||||
"Max local restarts: %d\n\n"
|
||||
"We are enabling process recovery and continuing execution. To avoid\n"
|
||||
"this warning in the future, please set the errmgr_base_enable_recovery\n"
|
||||
"this warning in the future, please set the orte_enable_recovery\n"
|
||||
"param to non-zero.\n"
|
||||
"------------------------------------------------------------------",
|
||||
orte_max_global_restarts, orte_max_local_restarts);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user