From ba7e94dd89889e29cb64632db2ee75dbd6bfe58b Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Fri, 30 Jul 2010 18:59:34 +0000 Subject: [PATCH] Some relatively minor C/R related cleanup * Fix a configure warning for checking --enable-ft-thread * In hnp and orted ErrMgr components check to see if other components have already recovered this process before trying to recover it again. * Fix 'npernode' for restarting using the resilient rmaps component * export ompi_info_set, so that internal functionality can use it. This commit was SVN r23535. --- ompi/info/info.h | 4 ++-- opal/config/opal_config_threads.m4 | 4 ++-- orte/mca/errmgr/hnp/errmgr_hnp.c | 18 +++++++++++++++--- orte/mca/errmgr/orted/errmgr_orted.c | 7 ++++--- orte/mca/rmaps/resilient/rmaps_resilient.c | 18 +++++++++++++----- orte/runtime/orte_mca_params.c | 6 +++--- 6 files changed, 39 insertions(+), 18 deletions(-) diff --git a/ompi/info/info.h b/ompi/info/info.h index 18e2762a71..672fcfb5e3 100644 --- a/ompi/info/info.h +++ b/ompi/info/info.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2007 The University of Tennessee and The University @@ -148,7 +148,7 @@ int ompi_info_dup (ompi_info_t *info, ompi_info_t **newinfo); * @retval MPI_SUCCESS upon success * @retval MPI_ERR_NO_MEM if out of memory */ -int ompi_info_set (ompi_info_t *info, char *key, char *value); +OMPI_DECLSPEC int ompi_info_set (ompi_info_t *info, char *key, char *value); /** * ompi_info_free - Free an 'MPI_Info' object. diff --git a/opal/config/opal_config_threads.m4 b/opal/config/opal_config_threads.m4 index cfa402065d..74a45f76c2 100644 --- a/opal/config/opal_config_threads.m4 +++ b/opal/config/opal_config_threads.m4 @@ -1,5 +1,5 @@ dnl -dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana dnl University Research and Technology dnl Corporation. All rights reserved. dnl Copyright (c) 2004-2005 The University of Tennessee and The University @@ -248,7 +248,7 @@ elif test "$enable_ft_thread" = "undef" -a "$enable_opal_multi_threads" = "no" ; else # Default: Enable # Make sure we have OPAL Threads enabled - if "$enable_opal_multi_threads" = "no"; then + if test "$enable_opal_multi_threads" = "no"; then AC_MSG_RESULT([Must enable OPAL basic thread support to use this option]) AC_MSG_ERROR([Cannot continue]) else diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index 181f6490cc..f0b0f24f60 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -148,6 +148,18 @@ static int update_state(orte_jobid_t job, (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state), pid, exit_code)); + /******************************** + * If the modules before us recovered from this error, then do not abort. + ********************************/ + if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (*stack_state)) ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "errmgr:hnp:update_proc() %s) " + "------- A previous component successfully recovered from the process fault of %s! Continuing...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + return ORTE_SUCCESS; + } + /* * if orte is trying to shutdown, just let it */ @@ -328,7 +340,7 @@ static int update_state(orte_jobid_t job, case ORTE_PROC_STATE_ABORTED: case ORTE_PROC_STATE_ABORTED_BY_SIG: case ORTE_PROC_STATE_TERM_WO_SYNC: - if (jdata->enable_recovery) { + if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jdata->enable_recovery) { /* is this a local proc */ if (NULL != (child = proc_is_local(proc))) { /* local proc - see if it has reached its local restart limit */ @@ -455,7 +467,7 @@ static int update_state(orte_jobid_t job, /* purge the oob */ orte_rml.purge(proc); - if (orte_enable_recovery) { + if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) { /* relocate its processes */ if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { /* unable to relocate for some reason */ @@ -493,7 +505,7 @@ static int update_state(orte_jobid_t job, case ORTE_PROC_STATE_HEARTBEAT_FAILED: /* heartbeats are only from daemons */ - if (orte_enable_recovery) { + if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && orte_enable_recovery) { /* relocate its processes */ } else { record_dead_daemon(jdata, proc->vpid, state, exit_code); diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index b81916fa68..81cddb2bec 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -118,7 +118,7 @@ static int update_state(orte_jobid_t job, orte_errmgr_stack_state_t *stack_state) { opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; + orte_odls_job_t *jobdat = NULL; orte_odls_child_t *child; opal_buffer_t alert; orte_plm_cmd_flag_t cmd; @@ -315,7 +315,8 @@ static int update_state(orte_jobid_t job, killprocs(proc->jobid, proc->vpid); } app = jobdat->apps[child->app_idx]; - if (jobdat->enable_recovery && child->restarts < app->max_local_restarts) { + if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && + jobdat->enable_recovery && child->restarts < app->max_local_restarts) { child->restarts++; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted restarting proc %s for the %d time", @@ -329,7 +330,7 @@ static int update_state(orte_jobid_t job, } if (ORTE_PROC_STATE_TERMINATED < state) { - if (jobdat->enable_recovery) { + if (!(ORTE_ERRMGR_STACK_STATE_RECOVERED & (*stack_state)) && jobdat->enable_recovery) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s RECOVERY ENABLED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/rmaps/resilient/rmaps_resilient.c b/orte/mca/rmaps/resilient/rmaps_resilient.c index fdb5910805..b967f71532 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient.c +++ b/orte/mca/rmaps/resilient/rmaps_resilient.c @@ -117,7 +117,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) opal_list_t node_list; opal_list_item_t *item; orte_std_cntr_t num_slots; - int rc=ORTE_SUCCESS; + int rc = ORTE_SUCCESS; float avgload, minload; orte_node_t *node, *nd=NULL, *oldnode; orte_rmaps_res_ftgrp_t *ftgrp, *target = NULL; @@ -297,19 +297,27 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) */ nd = oldnode; /* Put it back where it was if nothing else is found */ totprocs = 1000000; + found = false; /* find the lightest loaded node while deconstructing the list */ while (NULL != (item = opal_list_remove_first(&node_list))) { node = (orte_node_t*)item; - if( node->num_procs < totprocs) { - nd = node; - totprocs = node->num_procs; + if( !found ) { + if( ((int)node->num_procs) < orte_rmaps_base.npernode ) { + nd = node; + totprocs = 0; + found = true; + } + else if( node->num_procs < totprocs) { + nd = node; + totprocs = node->num_procs; + } } OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: no avail fault groups found - placing proc on node %s", + "%s rmaps:resilient: Placing process on node %s (no ftgrp)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nd->name)); diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 314863a7e6..6aa586a225 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2008 The University of Tennessee and The University @@ -492,12 +492,12 @@ int orte_register_params(void) if (ORTE_PROC_IS_HNP) { opal_output(orte_clean_output, "------------------------------------------------------------------\n" - "The MCA param errmgr_base_enable_recovery was not set to true, but\n" + "The MCA param orte_enable_recovery was not set to true, but\n" "positive value(s) were provided for the number of restarts:\n\n" "Max global restarts: %d\n" "Max local restarts: %d\n\n" "We are enabling process recovery and continuing execution. To avoid\n" - "this warning in the future, please set the errmgr_base_enable_recovery\n" + "this warning in the future, please set the orte_enable_recovery\n" "param to non-zero.\n" "------------------------------------------------------------------", orte_max_global_restarts, orte_max_local_restarts);