Enable the system to keep functioning even when multiple launches are occurring simultaneously.
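This is a bit of a hack, but it does seem to allow the system to work: cancelling the lingering app-launch-callback receive no longer treats ORTE_ERR_NOT_FOUND as an error, and orte_plm_base_reset_job now resets the job's num_launched and num_reported counters to zero, since every daemon will be reporting status for every proc. A better solution is being discussed. This commit was SVN r21705.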
Этот коммит содержится в:
родитель
03a0b04ab8
Коммит
4c1eb040b0
@ -980,7 +980,8 @@ int orte_plm_base_report_launched(orte_jobid_t job)
|
||||
ORTE_PROGRESSED_WAIT(app_launch_failed, jdata->num_launched, jdata->num_procs);
|
||||
|
||||
/* cancel the lingering recv */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_APP_LAUNCH_CALLBACK))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_APP_LAUNCH_CALLBACK)) &&
|
||||
ORTE_ERR_NOT_FOUND != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -381,6 +381,7 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
||||
NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1394,8 +1394,6 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
|
||||
}
|
||||
/* adjust job accounting */
|
||||
jdata->num_terminated--;
|
||||
jdata->num_launched--;
|
||||
jdata->num_reported--;
|
||||
}
|
||||
}
|
||||
/* clear the info on who aborted */
|
||||
@ -1404,4 +1402,7 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
|
||||
OBJ_RELEASE(jdata->aborted_proc); /* maintain reference count */
|
||||
jdata->aborted_proc = NULL;
|
||||
}
|
||||
/* since every daemon will be reporting status for every proc, reset these to zero */
|
||||
jdata->num_launched = 0;
|
||||
jdata->num_reported = 0;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user