diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index ae2911e963..374f079bb5 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -1,8 +1,8 @@ /* * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. - * * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -855,6 +855,7 @@ static void failed_start(orte_job_t *jdata) /* remove the child from our list */ opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); + jobdat->num_local_procs--; } } } @@ -907,6 +908,7 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); jdata->num_terminated++; + jobdat->num_local_procs--; } else if (ORTE_PROC_STATE_RUNNING) { jdata->num_launched++; } else if (ORTE_PROC_STATE_REGISTERED == state) { @@ -931,8 +933,23 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata, opal_list_item_t *item, *next; orte_odls_child_t *child; orte_proc_t *proct; + orte_odls_job_t *jobdat, *jdat; int i; - + + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jdat = (orte_odls_job_t*)item; + if (jdat->jobid == jdata->jobid) { + jobdat = jdat; + break; + } + } + if (NULL == jobdat) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + } + /*** UPDATE LOCAL CHILD ***/ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); @@ -956,6 +973,9 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata, if (!jdata->enable_recovery) { opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); + if (NULL != jobdat) { + jobdat->num_local_procs--; + } } jdata->num_terminated++; } else if (ORTE_PROC_STATE_RUNNING == state) { diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c index bc8f2cab39..e598c93a32 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * * $COPYRIGHT$ * @@ -281,8 +282,19 @@ int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job, return ORTE_SUCCESS; } - /* get the job data object for this process */ - if (NULL == (jdata = orte_get_job_data_object(job))) { + /* + * Get the job data object for this process + */ + if( NULL != proc_name ) { /* Get job from proc's jobid */ + jdata = orte_get_job_data_object(proc_name->jobid); + } else { /* Get from the general job */ + jdata = orte_get_job_data_object(job); + } + if( NULL == jdata ) { + opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); ret = ORTE_ERROR; ORTE_ERROR_LOG(ret); exit_status = ret; diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c b/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c index 2516b8660f..678c9fd988 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * * $COPYRIGHT$ * @@ -251,14 +252,16 @@ int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list, /************************ * Set up the Command Line listener again *************************/ - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } + if( ORTE_ERRMGR_MIGRATE_STATE_ERROR != current_migration_status ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_NONE)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } - opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true); + opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true); + } + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE; cleanup: return exit_status; @@ -281,8 +284,19 @@ int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job, return ORTE_SUCCESS; } - /* get the job data object for this process */ - if (NULL == (jdata = orte_get_job_data_object(job))) { + /* + * Get the job data object for this process + */ + if( NULL != proc_name ) { /* Get job from proc's jobid */ + jdata = orte_get_job_data_object(proc_name->jobid); + } else { /* Get from the general job */ + jdata = orte_get_job_data_object(job); + } + if( NULL == jdata ) { + opal_output(0, "%s errmgr:hnp(crmig):update_state() Error: Cannot find job %s for Process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); ret = ORTE_ERROR; ORTE_ERROR_LOG(ret); return ret; diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 405d5fc3aa..844870559e 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -10,6 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -1427,7 +1428,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, char **argvsav=NULL; int inm; opal_event_t *delay; - int num_procs_alive; + int num_procs_alive = 0; orte_nid_t *nid; orte_node_t *node; @@ -2035,7 +2036,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag) { - int rc; + int rc, exit_status = ORTE_SUCCESS; opal_list_item_t *item; orte_odls_child_t *child; @@ -2066,15 +2067,23 @@ int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buff if (rc < 0 && OPAL_SOS_GET_ERROR_CODE(rc) != ORTE_ERR_ADDRESSEE_UNKNOWN) { /* ignore if the addressee is unknown as a race condition could * have allowed the child to exit before we send it a barrier - * due to the vagaries of the event library + * due to the vagaries of the event library. + * + * If we do get an error it is likely that the orte_local_children + * has changed to reflect it, so we can no longer deliver messages. + * So just break out and return the error code. */ ORTE_ERROR_LOG(rc); + exit_status = rc; + goto cleanup; } } - + + cleanup: opal_condition_signal(&orte_odls_globals.cond); OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return ORTE_SUCCESS; + + return exit_status; } @@ -2271,8 +2280,6 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, * so free the info and set it to NULL */ if (child->init_recvd && NULL != child->rml_uri) { - free(child->rml_uri); - child->rml_uri = NULL; child->fini_recvd = true; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: require sync deregistering child %s", @@ -2357,9 +2364,19 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, } rc = ORTE_SUCCESS; OBJ_DESTRUCT(&buffer); - + + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls: Finished sending sync ack to child %s (Registering %s)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), (registering ? "True" : "False") )); + /* if we are deregistering, then we are done */ if (!registering) { + orte_routed.delete_route(child->name); + if( NULL != child->rml_uri ) { + free(child->rml_uri); + child->rml_uri = NULL; + } goto CLEANUP; } @@ -2560,6 +2577,7 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto MOVEON; } + /* if this is a debugger daemon, then just report the state * and return as we aren't monitoring it */ @@ -2593,6 +2611,7 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) "%s odls:waitpid_fired child %s died by call to abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name))); + child->state = ORTE_PROC_STATE_ABORTED; goto MOVEON; } @@ -2667,9 +2686,7 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) "%s odls:waitpid_fired child process %s terminated with signal", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name) )); - /* JJH: Should we decrement the number of local procs on this node here? - * jobdat->num_local_procs--; - */ + /* Do not decrement the number of local procs here. That is handled in the errmgr */ } MOVEON: diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index 9a812ef041..7657fde3d2 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -12,6 +12,7 @@ * Copyright (c) 2006-2010 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -1928,6 +1929,7 @@ int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri) /* clear any pending recvs */ peer->peer_recv_msg = NULL; OPAL_THREAD_UNLOCK(&peer->peer_lock); + mca_oob_tcp_peer_shutdown(peer); } /* delete the entry from the hash table */ opal_hash_table_set_value_uint64(&mca_oob_tcp_component.tcp_peer_names, diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index 35c20a1092..78c806c770 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -12,6 +12,7 @@ * Copyright (c) 2006-2007 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -623,6 +624,10 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer) void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) { + if( MCA_OOB_TCP_CLOSED == peer->peer_state ) { + goto close_socket; + } + /* giving up and cleanup any pending messages */ if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) { mca_oob_tcp_msg_t *msg; @@ -660,6 +665,7 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) peer->peer_state = MCA_OOB_TCP_FAILED; } + close_socket: if (peer->peer_sd >= 0) { opal_event_del(&peer->peer_recv_event); opal_event_del(&peer->peer_send_event);