Fixes the C/R Automatic Recovery feature when the HNP is also hosting processes locally.
I want to thank Hugo Meyer for reporting these bugs. Notes: * Moved over a patch from the stabilization branch that makes sure we close the peer socket in the OOB TCP component fully during shutdown (after the de-registration sync). It also ensures that we free the rml_uri only after we are done communicating with the peer (in the odls_base deregister sync operation). * When an error is detected while delivering messages, we really want to bail out of the loop, since the error manager is likely mutating the orte_local_children data structure, so it is no longer safe to iterate over it in the orte_odls_base_default_deliver_message() function. * When the HNP is hosting processes, make sure it accounts for processes that may have failed locally in the ErrMgr HNP component by decrementing the num_local_procs. This makes it match the orted ErrMgr component accounting. This is what was causing the modex to fail (the number of participants was wrong on a rolling recovery). * The crmig and autor features of the hnp ErrMgr component now check for the jobid from both the 'job' parameter and from the process name (since one may be there and not the other). This caused some additional error messages during startup. * If we fail to migrate (e.g., due to invalid node specification), print only the error message, not the error and success messages. Printing both can be misleading. This commit was SVN r24317.
Этот коммит содержится в:
родитель
5bc2ad2b44
Коммит
8ec85c6b8f
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -855,6 +855,7 @@ static void failed_start(orte_job_t *jdata)
|
||||
/* remove the child from our list */
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
OBJ_RELEASE(child);
|
||||
jobdat->num_local_procs--;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -907,6 +908,7 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
OBJ_RELEASE(child);
|
||||
jdata->num_terminated++;
|
||||
jobdat->num_local_procs--;
|
||||
} else if (ORTE_PROC_STATE_RUNNING) {
|
||||
jdata->num_launched++;
|
||||
} else if (ORTE_PROC_STATE_REGISTERED == state) {
|
||||
@ -931,8 +933,23 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
||||
opal_list_item_t *item, *next;
|
||||
orte_odls_child_t *child;
|
||||
orte_proc_t *proct;
|
||||
orte_odls_job_t *jobdat, *jdat;
|
||||
int i;
|
||||
|
||||
|
||||
jobdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jdat = (orte_odls_job_t*)item;
|
||||
if (jdat->jobid == jdata->jobid) {
|
||||
jobdat = jdat;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jobdat) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
}
|
||||
|
||||
/*** UPDATE LOCAL CHILD ***/
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
@ -956,6 +973,9 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
||||
if (!jdata->enable_recovery) {
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
OBJ_RELEASE(child);
|
||||
if (NULL != jobdat) {
|
||||
jobdat->num_local_procs--;
|
||||
}
|
||||
}
|
||||
jdata->num_terminated++;
|
||||
} else if (ORTE_PROC_STATE_RUNNING == state) {
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -281,8 +282,19 @@ int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
/*
|
||||
* Get the job data object for this process
|
||||
*/
|
||||
if( NULL != proc_name ) { /* Get job from proc's jobid */
|
||||
jdata = orte_get_job_data_object(proc_name->jobid);
|
||||
} else { /* Get from the general job */
|
||||
jdata = orte_get_job_data_object(job);
|
||||
}
|
||||
if( NULL == jdata ) {
|
||||
opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) );
|
||||
ret = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -251,14 +252,16 @@ int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list,
|
||||
/************************
|
||||
* Set up the Command Line listener again
|
||||
*************************/
|
||||
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
if( ORTE_ERRMGR_MIGRATE_STATE_ERROR != current_migration_status ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_NONE)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true);
|
||||
opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true);
|
||||
}
|
||||
current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
@ -281,8 +284,19 @@ int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
/*
|
||||
* Get the job data object for this process
|
||||
*/
|
||||
if( NULL != proc_name ) { /* Get job from proc's jobid */
|
||||
jdata = orte_get_job_data_object(proc_name->jobid);
|
||||
} else { /* Get from the general job */
|
||||
jdata = orte_get_job_data_object(job);
|
||||
}
|
||||
if( NULL == jdata ) {
|
||||
opal_output(0, "%s errmgr:hnp(crmig):update_state() Error: Cannot find job %s for Process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
(NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) );
|
||||
ret = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
|
@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -1427,7 +1428,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
char **argvsav=NULL;
|
||||
int inm;
|
||||
opal_event_t *delay;
|
||||
int num_procs_alive;
|
||||
int num_procs_alive = 0;
|
||||
orte_nid_t *nid;
|
||||
orte_node_t *node;
|
||||
|
||||
@ -2035,7 +2036,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
|
||||
int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag)
|
||||
{
|
||||
int rc;
|
||||
int rc, exit_status = ORTE_SUCCESS;
|
||||
opal_list_item_t *item;
|
||||
orte_odls_child_t *child;
|
||||
|
||||
@ -2066,15 +2067,23 @@ int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buff
|
||||
if (rc < 0 && OPAL_SOS_GET_ERROR_CODE(rc) != ORTE_ERR_ADDRESSEE_UNKNOWN) {
|
||||
/* ignore if the addressee is unknown as a race condition could
|
||||
* have allowed the child to exit before we send it a barrier
|
||||
* due to the vagaries of the event library
|
||||
* due to the vagaries of the event library.
|
||||
*
|
||||
* If we do get an error it is likely that the orte_local_children
|
||||
* has changed to reflect it, so we can no longer deliver messages.
|
||||
* So just break out and return the error code.
|
||||
*/
|
||||
ORTE_ERROR_LOG(rc);
|
||||
exit_status = rc;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
cleanup:
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
|
||||
@ -2271,8 +2280,6 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
* so free the info and set it to NULL
|
||||
*/
|
||||
if (child->init_recvd && NULL != child->rml_uri) {
|
||||
free(child->rml_uri);
|
||||
child->rml_uri = NULL;
|
||||
child->fini_recvd = true;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls: require sync deregistering child %s",
|
||||
@ -2357,9 +2364,19 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
}
|
||||
rc = ORTE_SUCCESS;
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls: Finished sending sync ack to child %s (Registering %s)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc), (registering ? "True" : "False") ));
|
||||
|
||||
/* if we are deregistering, then we are done */
|
||||
if (!registering) {
|
||||
orte_routed.delete_route(child->name);
|
||||
if( NULL != child->rml_uri ) {
|
||||
free(child->rml_uri);
|
||||
child->rml_uri = NULL;
|
||||
}
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
@ -2560,6 +2577,7 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
/* if this is a debugger daemon, then just report the state
|
||||
* and return as we aren't monitoring it
|
||||
*/
|
||||
@ -2593,6 +2611,7 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||
"%s odls:waitpid_fired child %s died by call to abort",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
child->state = ORTE_PROC_STATE_ABORTED;
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
@ -2667,9 +2686,7 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||
"%s odls:waitpid_fired child process %s terminated with signal",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name) ));
|
||||
/* JJH: Should we decrement the number of local procs on this node here?
|
||||
* jobdat->num_local_procs--;
|
||||
*/
|
||||
/* Do not decrement the number of local procs here. That is handled in the errmgr */
|
||||
}
|
||||
|
||||
MOVEON:
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2006-2010 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -1928,6 +1929,7 @@ int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri)
|
||||
/* clear any pending recvs */
|
||||
peer->peer_recv_msg = NULL;
|
||||
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
}
|
||||
/* delete the entry from the hash table */
|
||||
opal_hash_table_set_value_uint64(&mca_oob_tcp_component.tcp_peer_names,
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -623,6 +624,10 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
|
||||
void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
||||
{
|
||||
if( MCA_OOB_TCP_CLOSED == peer->peer_state ) {
|
||||
goto close_socket;
|
||||
}
|
||||
|
||||
/* giving up and cleanup any pending messages */
|
||||
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
|
||||
mca_oob_tcp_msg_t *msg;
|
||||
@ -660,6 +665,7 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
||||
peer->peer_state = MCA_OOB_TCP_FAILED;
|
||||
}
|
||||
|
||||
close_socket:
|
||||
if (peer->peer_sd >= 0) {
|
||||
opal_event_del(&peer->peer_recv_event);
|
||||
opal_event_del(&peer->peer_send_event);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user