1
1

Some relatively minor C/R-related cleanup

 * Fix a configure warning when checking --enable-ft-thread.
 * In the hnp and orted ErrMgr components, check whether other components have already recovered this process before trying to recover it again.
 * Fix 'npernode' handling when restarting with the resilient rmaps component.
 * Export ompi_info_set so that internal functionality can use it.

This commit was SVN r23535.
Этот коммит содержится в:
Josh Hursey 2010-07-30 18:59:34 +00:00
родитель ea7bf2bd9e
Коммит ba7e94dd89
6 изменённых файлов: 39 добавлений и 18 удалений

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University * Copyright (c) 2004-2007 The University of Tennessee and The University
@ -148,7 +148,7 @@ int ompi_info_dup (ompi_info_t *info, ompi_info_t **newinfo);
* @retval MPI_SUCCESS upon success * @retval MPI_SUCCESS upon success
* @retval MPI_ERR_NO_MEM if out of memory * @retval MPI_ERR_NO_MEM if out of memory
*/ */
int ompi_info_set (ompi_info_t *info, char *key, char *value); OMPI_DECLSPEC int ompi_info_set (ompi_info_t *info, char *key, char *value);
/** /**
* ompi_info_free - Free an 'MPI_Info' object. * ompi_info_free - Free an 'MPI_Info' object.

Просмотреть файл

@ -1,5 +1,5 @@
dnl dnl
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
dnl University Research and Technology dnl University Research and Technology
dnl Corporation. All rights reserved. dnl Corporation. All rights reserved.
dnl Copyright (c) 2004-2005 The University of Tennessee and The University dnl Copyright (c) 2004-2005 The University of Tennessee and The University
@ -248,7 +248,7 @@ elif test "$enable_ft_thread" = "undef" -a "$enable_opal_multi_threads" = "no" ;
else else
# Default: Enable # Default: Enable
# Make sure we have OPAL Threads enabled # Make sure we have OPAL Threads enabled
if "$enable_opal_multi_threads" = "no"; then if test "$enable_opal_multi_threads" = "no"; then
AC_MSG_RESULT([Must enable OPAL basic thread support to use this option]) AC_MSG_RESULT([Must enable OPAL basic thread support to use this option])
AC_MSG_ERROR([Cannot continue]) AC_MSG_ERROR([Cannot continue])
else else

Просмотреть файл

@ -148,6 +148,18 @@ static int update_state(orte_jobid_t job,
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state), pid, exit_code)); orte_proc_state_to_str(state), pid, exit_code));
/********************************
* If the modules before us recovered from this error, then do not abort.
********************************/
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (*stack_state)) ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:hnp:update_proc() %s) "
"------- A previous component successfully recovered from the process fault of %s! Continuing...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
return ORTE_SUCCESS;
}
/* /*
* if orte is trying to shutdown, just let it * if orte is trying to shutdown, just let it
*/ */
@ -328,7 +340,7 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_ABORTED: case ORTE_PROC_STATE_ABORTED:
case ORTE_PROC_STATE_ABORTED_BY_SIG: case ORTE_PROC_STATE_ABORTED_BY_SIG:
case ORTE_PROC_STATE_TERM_WO_SYNC: case ORTE_PROC_STATE_TERM_WO_SYNC:
if (jdata->enable_recovery) { if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jdata->enable_recovery) {
/* is this a local proc */ /* is this a local proc */
if (NULL != (child = proc_is_local(proc))) { if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its local restart limit */ /* local proc - see if it has reached its local restart limit */
@ -455,7 +467,7 @@ static int update_state(orte_jobid_t job,
/* purge the oob */ /* purge the oob */
orte_rml.purge(proc); orte_rml.purge(proc);
if (orte_enable_recovery) { if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
/* relocate its processes */ /* relocate its processes */
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
/* unable to relocate for some reason */ /* unable to relocate for some reason */
@ -493,7 +505,7 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_HEARTBEAT_FAILED: case ORTE_PROC_STATE_HEARTBEAT_FAILED:
/* heartbeats are only from daemons */ /* heartbeats are only from daemons */
if (orte_enable_recovery) { if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) {
/* relocate its processes */ /* relocate its processes */
} else { } else {
record_dead_daemon(jdata, proc->vpid, state, exit_code); record_dead_daemon(jdata, proc->vpid, state, exit_code);

Просмотреть файл

@ -118,7 +118,7 @@ static int update_state(orte_jobid_t job,
orte_errmgr_stack_state_t *stack_state) orte_errmgr_stack_state_t *stack_state)
{ {
opal_list_item_t *item, *next; opal_list_item_t *item, *next;
orte_odls_job_t *jobdat; orte_odls_job_t *jobdat = NULL;
orte_odls_child_t *child; orte_odls_child_t *child;
opal_buffer_t alert; opal_buffer_t alert;
orte_plm_cmd_flag_t cmd; orte_plm_cmd_flag_t cmd;
@ -315,7 +315,8 @@ static int update_state(orte_jobid_t job,
killprocs(proc->jobid, proc->vpid); killprocs(proc->jobid, proc->vpid);
} }
app = jobdat->apps[child->app_idx]; app = jobdat->apps[child->app_idx];
if (jobdat->enable_recovery && child->restarts < app->max_local_restarts) { if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) &&
jobdat->enable_recovery && child->restarts < app->max_local_restarts) {
child->restarts++; child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time", "%s errmgr:orted restarting proc %s for the %d time",
@ -329,7 +330,7 @@ static int update_state(orte_jobid_t job,
} }
if (ORTE_PROC_STATE_TERMINATED < state) { if (ORTE_PROC_STATE_TERMINATED < state) {
if (jobdat->enable_recovery) { if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jobdat->enable_recovery) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s RECOVERY ENABLED", "%s RECOVERY ENABLED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -297,19 +297,27 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
*/ */
nd = oldnode; /* Put it back where it was if nothing else is found */ nd = oldnode; /* Put it back where it was if nothing else is found */
totprocs = 1000000; totprocs = 1000000;
found = false;
/* find the lightest loaded node while deconstructing the list */ /* find the lightest loaded node while deconstructing the list */
while (NULL != (item = opal_list_remove_first(&node_list))) { while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item; node = (orte_node_t*)item;
if( node->num_procs < totprocs) { if( !found ) {
if( ((int)node->num_procs) < orte_rmaps_base.npernode ) {
nd = node;
totprocs = 0;
found = true;
}
else if( node->num_procs < totprocs) {
nd = node; nd = node;
totprocs = node->num_procs; totprocs = node->num_procs;
} }
}
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }
OBJ_DESTRUCT(&node_list); OBJ_DESTRUCT(&node_list);
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: no avail fault groups found - placing proc on node %s", "%s rmaps:resilient: Placing process on node %s (no ftgrp)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
nd->name)); nd->name));

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University * Copyright (c) 2004-2008 The University of Tennessee and The University
@ -492,12 +492,12 @@ int orte_register_params(void)
if (ORTE_PROC_IS_HNP) { if (ORTE_PROC_IS_HNP) {
opal_output(orte_clean_output, opal_output(orte_clean_output,
"------------------------------------------------------------------\n" "------------------------------------------------------------------\n"
"The MCA param errmgr_base_enable_recovery was not set to true, but\n" "The MCA param orte_enable_recovery was not set to true, but\n"
"positive value(s) were provided for the number of restarts:\n\n" "positive value(s) were provided for the number of restarts:\n\n"
"Max global restarts: %d\n" "Max global restarts: %d\n"
"Max local restarts: %d\n\n" "Max local restarts: %d\n\n"
"We are enabling process recovery and continuing execution. To avoid\n" "We are enabling process recovery and continuing execution. To avoid\n"
"this warning in the future, please set the errmgr_base_enable_recovery\n" "this warning in the future, please set the orte_enable_recovery\n"
"param to non-zero.\n" "param to non-zero.\n"
"------------------------------------------------------------------", "------------------------------------------------------------------",
orte_max_global_restarts, orte_max_local_restarts); orte_max_global_restarts, orte_max_local_restarts);