From c6f049972067e72fc02c0e8eaa943d02ebafd808 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 11 May 2009 14:03:07 +0000 Subject: [PATCH] Some cleanups required from last night's commits to resolve some race conditions and ensure we cleanup properly. Also, remove some debug output that was unintentionally left "on" by default. This commit was SVN r21202. --- orte/mca/ess/hnp/ess_hnp_module.c | 10 ++++--- orte/mca/plm/base/plm_base_launch_support.c | 5 +++- orte/runtime/orte_globals.c | 31 +++++++++++++++------ orte/runtime/orte_globals.h | 3 ++ 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index b4b10f00ef..6290cda8e6 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -485,6 +485,8 @@ static int rte_finalize(void) { char *contact_path; opal_list_item_t *item; + orte_node_t *node; + orte_job_t *job; int i; /* remove my contact info file */ @@ -541,16 +543,16 @@ static int rte_finalize(void) /* cleanup the job and node info arrays */ if (NULL != orte_node_pool) { for (i=0; i < orte_node_pool->size; i++) { - if (NULL != orte_node_pool->addr[i]) { - OBJ_RELEASE(orte_node_pool->addr[i]); + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool,i))) { + OBJ_RELEASE(node); } } OBJ_RELEASE(orte_node_pool); } if (NULL != orte_job_data) { for (i=0; i < orte_job_data->size; i++) { - if (NULL != orte_job_data->addr[i]) { - OBJ_RELEASE(orte_job_data->addr[i]); + if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data,i))) { + OBJ_RELEASE(job); } } OBJ_RELEASE(orte_job_data); diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 2e9c16942f..05c74b5e1a 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1361,7 +1361,10 @@ CHECK_ALL_JOBS: } node->slots_inuse--; node->num_procs--; - opal_output(0, "releasing proc %s", ORTE_NAME_PRINT(&proc->name)); + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s releasing proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* set the entry in the job data object to NULL */ diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 1d69cee5c0..0a3ee206df 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -123,6 +123,9 @@ bool orte_forward_job_control; /* rsh support */ char *orte_rsh_agent = NULL; +/* orted exit with barrier */ +bool orte_orted_exit_with_barrier = true; + #endif /* !ORTE_DISABLE_FULL_RTE */ int orte_debug_output = -1; @@ -556,7 +559,15 @@ static void orte_job_destruct(orte_job_t* job) orte_job_t *jdata; int n; - opal_output(0, "Releasing job data for %s", ORTE_JOBID_PRINT(job->jobid)); + if (NULL == job) { + /* probably just a race condition - just return */ + return; + } + + if (orte_debug_flag) { + opal_output(0, "%s Releasing job data for %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid)); + } for (n=0; n < job->apps->size; n++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, n))) { @@ -590,14 +601,16 @@ static void orte_job_destruct(orte_job_t* job) #endif /* find the job in the global array */ - for (n=0; n < orte_job_data->size; n++) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) { - continue; - } - if (jdata->jobid == job->jobid) { - /* set the entry to NULL */ - opal_pointer_array_set_item(orte_job_data, n, NULL); - break; + if (NULL != orte_job_data) { + for (n=0; n < orte_job_data->size; n++) { + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) { + continue; + } + if (jdata->jobid == job->jobid) { + /* set the entry to NULL */ + opal_pointer_array_set_item(orte_job_data, n, NULL); + break; + } } } } diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index c98c4a0192..70603416a4 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -510,6 +510,9 @@ ORTE_DECLSPEC extern char *orte_xterm; /* rsh support */ ORTE_DECLSPEC extern char *orte_rsh_agent; +/* whether or not to barrier the orteds upon exit */ +ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier; + #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS