1
1

Attempt to clean up the race condition Rolf keeps encountering in MTT by adding some protection to ensure orteds try to terminate once their local procs die. Also, fix a problem whereby a failure to comm_spawn would result in a hang of the parent process.

cmr=v1.8.2:reviewer=rhc:subject=cache termination cleanups

This commit was SVN r32008.
Этот коммит содержится в:
Ralph Castain 2014-06-16 20:46:35 +00:00
родитель 84193fff6d
Коммит ab52f16100
4 изменённых файла: 122 добавления и 17 удалений

Просмотреть файл

@ -132,6 +132,8 @@ static void job_errors(int fd, short args, void *cbdata)
orte_job_state_t jobstate;
orte_exit_code_t sts;
orte_proc_t *aborted_proc;
opal_buffer_t *answer;
int32_t rc, ret;
/*
* if orte is trying to shutdown, just let it
@ -164,7 +166,10 @@ static void job_errors(int fd, short args, void *cbdata)
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
orte_never_launched = true;
if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
/* this is the primary job */
orte_never_launched = true;
}
/* disable routing as we may not have performed the daemon
* wireup - e.g., in a managed environment, all the daemons
* "phone home", but don't actually wireup into the routed
@ -173,6 +178,35 @@ static void job_errors(int fd, short args, void *cbdata)
orte_routing_is_enabled = false;
jdata->num_terminated = jdata->num_procs;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
/* if it was a dynamic spawn, then we better tell them this didn't work */
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
rc = jobstate;
answer = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
return;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
return;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s errmgr:hnp sending dyn error release of job %s to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&jdata->originator)));
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
ORTE_RML_TAG_PLM_PROXY,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
}
OBJ_RELEASE(caddy);
return;
}
@ -318,22 +352,22 @@ static void proc_errors(int fd, short args, void *cbdata)
orte_routed.route_lost(proc);
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
/* at least one is still alive */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: at least one proc (%s) still alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proct->name)));
goto cleanup;
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
/* at least one is still alive */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: at least one proc (%s) still alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proct->name)));
goto cleanup;
}
}
}
/* call our appropriate exit procedure */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr_hnp: all routes and children gone - ordering exit",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
/* call our appropriate exit procedure */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr_hnp: all routes and children gone - ordering exit",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
} else {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: %d routes remain alive",
@ -372,6 +406,28 @@ static void proc_errors(int fd, short args, void *cbdata)
*/
cleanup_local_proc(jdata, proc);
/* if we were ordered to terminate, mark this proc as dead and see if
* any of our routes or local children remain alive - if not, then
* terminate ourselves. */
if (orte_orteds_term_ordered) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
goto keep_going;
}
}
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes()) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:hnp all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
}
keep_going:
/* ensure we record the failed proc properly so we can report
* the error once we terminate
*/

Просмотреть файл

@ -415,6 +415,34 @@ static void proc_errors(int fd, short args, void *cbdata)
}
if (ORTE_PROC_STATE_TERMINATED < state) {
/* if we were ordered to terminate, mark this proc as dead and see if
* any of our routes or local children remain alive - if not, then
* terminate ourselves. */
if (orte_orteds_term_ordered) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
if (child->name.jobid == proc->jobid &&
child->name.vpid == proc->vpid) {
child->state = state;
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
continue;
}
if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
goto keep_going;
}
}
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes()) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:orted all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
}
keep_going:
/* if the job hasn't completed and the state is abnormally
* terminated, then we need to alert the HNP right away
*/

Просмотреть файл

@ -462,6 +462,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
orte_proc_state_t state = caddy->proc_state;
orte_job_t *jdata;
orte_proc_t *pdata;
int i;
opal_output_verbose(5, orte_state_base_framework.framework_output,
"%s state:base:track_procs called for proc %s state %s",
@ -527,6 +528,26 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
*/
orte_session_dir_finalize(proc);
}
/* if we are trying to terminate and our routes are
* gone, then terminate ourselves IF no local procs
* remain (might be some from another job)
*/
if (orte_orteds_term_ordered &&
0 == orte_routed.num_routes()) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
/* at least one is still alive */
goto cleanup;
}
}
/* call our appropriate exit procedure */
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s state:base all routes and children gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
goto cleanup;
}
/* return the allocated slot for reuse */
cleanup_node(pdata);
/* track job status */

Просмотреть файл

@ -409,7 +409,7 @@ static void track_procs(int fd, short argc, void *cbdata)
if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE)) {
/* at least one is still alive */
goto moveon;
goto cleanup;
}
}
/* call our appropriate exit procedure */