Attempt to cleanup the race condition Rolf keeps encountering in MTT by adding some protection to ensure orted's try to terminate once their local procs die. Also, fix a problem whereby a failure to comm_spawn would result in a hang of the parent process.
cmr=v1.8.2:reviewer=rhc:subject=cache termination cleanups This commit was SVN r32008.
Этот коммит содержится в:
родитель
84193fff6d
Коммит
ab52f16100
@ -132,6 +132,8 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
orte_job_state_t jobstate;
|
||||
orte_exit_code_t sts;
|
||||
orte_proc_t *aborted_proc;
|
||||
opal_buffer_t *answer;
|
||||
int32_t rc, ret;
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
@ -164,7 +166,10 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
|
||||
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
|
||||
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
|
||||
orte_never_launched = true;
|
||||
if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
|
||||
/* this is the primary job */
|
||||
orte_never_launched = true;
|
||||
}
|
||||
/* disable routing as we may not have performed the daemon
|
||||
* wireup - e.g., in a managed environment, all the daemons
|
||||
* "phone home", but don't actually wireup into the routed
|
||||
@ -173,6 +178,35 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
orte_routing_is_enabled = false;
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
/* if it was a dynamic spawn, then we better tell them this didn't work */
|
||||
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
|
||||
rc = jobstate;
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s errmgr:hnp sending dyn error release of job %s to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
ORTE_NAME_PRINT(&jdata->originator)));
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
|
||||
ORTE_RML_TAG_PLM_PROXY,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
@ -318,22 +352,22 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
orte_routed.route_lost(proc);
|
||||
/* if all my routes and local children are gone, then terminate ourselves */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
|
||||
/* at least one is still alive */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: at least one proc (%s) still alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proct->name)));
|
||||
goto cleanup;
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
|
||||
/* at least one is still alive */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: at least one proc (%s) still alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proct->name)));
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* call our appropriate exit procedure */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr_hnp: all routes and children gone - ordering exit",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
/* call our appropriate exit procedure */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr_hnp: all routes and children gone - ordering exit",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: %d routes remain alive",
|
||||
@ -372,6 +406,28 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
*/
|
||||
cleanup_local_proc(jdata, proc);
|
||||
|
||||
/* if we were ordered to terminate, mark this proc as dead and see if
|
||||
* any of our routes or local children remain alive - if not, then
|
||||
* terminate ourselves. */
|
||||
if (orte_orteds_term_ordered) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
|
||||
goto keep_going;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if all my routes and children are gone, then terminate
|
||||
ourselves nicely (i.e., this is a normal termination) */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default:hnp all routes gone - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
keep_going:
|
||||
/* ensure we record the failed proc properly so we can report
|
||||
* the error once we terminate
|
||||
*/
|
||||
|
@ -415,6 +415,34 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_TERMINATED < state) {
|
||||
/* if we were ordered to terminate, mark this proc as dead and see if
|
||||
* any of our routes or local children remain alive - if not, then
|
||||
* terminate ourselves. */
|
||||
if (orte_orteds_term_ordered) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
if (child->name.jobid == proc->jobid &&
|
||||
child->name.vpid == proc->vpid) {
|
||||
child->state = state;
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
continue;
|
||||
}
|
||||
if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
|
||||
goto keep_going;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if all my routes and children are gone, then terminate
|
||||
ourselves nicely (i.e., this is a normal termination) */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default:orted all routes gone - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
keep_going:
|
||||
/* if the job hasn't completed and the state is abnormally
|
||||
* terminated, then we need to alert the HNP right away
|
||||
*/
|
||||
|
@ -462,6 +462,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
||||
orte_proc_state_t state = caddy->proc_state;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *pdata;
|
||||
int i;
|
||||
|
||||
opal_output_verbose(5, orte_state_base_framework.framework_output,
|
||||
"%s state:base:track_procs called for proc %s state %s",
|
||||
@ -527,6 +528,26 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
||||
*/
|
||||
orte_session_dir_finalize(proc);
|
||||
}
|
||||
/* if we are trying to terminate and our routes are
|
||||
* gone, then terminate ourselves IF no local procs
|
||||
* remain (might be some from another job)
|
||||
*/
|
||||
if (orte_orteds_term_ordered &&
|
||||
0 == orte_routed.num_routes()) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
|
||||
/* at least one is still alive */
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
/* call our appropriate exit procedure */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
||||
"%s state:base all routes and children gone - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
goto cleanup;
|
||||
}
|
||||
/* return the allocated slot for reuse */
|
||||
cleanup_node(pdata);
|
||||
/* track job status */
|
||||
|
@ -409,7 +409,7 @@ static void track_procs(int fd, short argc, void *cbdata)
|
||||
if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE)) {
|
||||
/* at least one is still alive */
|
||||
goto moveon;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
/* call our appropriate exit procedure */
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user