Use a condition-variable wait to serialize launches when they come from multiple sources (e.g., an ORTE application that spawns multiple jobs).
This commit was SVN r21718.
Этот коммит содержится в:
родитель
1a5f7245c8
Коммит
6c85d954f3
@ -61,6 +61,10 @@ int orte_plm_base_close(void)
|
||||
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_lock);
|
||||
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_cond);
|
||||
|
||||
/* clear out the spawn lock and condition variable */
|
||||
OBJ_DESTRUCT(&orte_plm_globals.spawn_lock);
|
||||
OBJ_DESTRUCT(&orte_plm_globals.spawn_cond);
|
||||
|
||||
#ifndef __WINDOWS__
|
||||
/* clearout the rsh support */
|
||||
orte_plm_base_local_slave_finalize();
|
||||
|
@ -252,7 +252,8 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
/* bad jobid */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto WAKEUP;
|
||||
}
|
||||
|
||||
/* setup the buffer */
|
||||
@ -263,13 +264,13 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buffer);
|
||||
return rc;
|
||||
goto WAKEUP;
|
||||
}
|
||||
|
||||
/* get the local launcher's required data */
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(buffer, job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
goto WAKEUP;
|
||||
}
|
||||
|
||||
/* if we are timing, record the time we send this message */
|
||||
@ -282,7 +283,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
buffer, ORTE_RML_TAG_DAEMON))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buffer);
|
||||
return rc;
|
||||
goto WAKEUP;
|
||||
}
|
||||
OBJ_RELEASE(buffer);
|
||||
|
||||
@ -292,7 +293,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
"%s plm:base:launch failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), ORTE_ERROR_NAME(rc)));
|
||||
return rc;
|
||||
goto WAKEUP;
|
||||
}
|
||||
|
||||
if (orte_timing) {
|
||||
@ -316,13 +317,20 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
goto WAKEUP;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:launch completed for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job)));
|
||||
|
||||
WAKEUP:
|
||||
/* wakeup anyone waiting for this */
|
||||
orte_plm_globals.spawn_complete = true;
|
||||
orte_plm_globals.spawn_status = rc;
|
||||
opal_condition_broadcast(&orte_plm_globals.spawn_cond);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -136,6 +136,13 @@ int orte_plm_base_open(void)
|
||||
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_cond, opal_condition_t);
|
||||
|
||||
/* initialize the condition variables for spawn */
|
||||
OBJ_CONSTRUCT(&orte_plm_globals.spawn_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&orte_plm_globals.spawn_cond, opal_condition_t);
|
||||
OBJ_CONSTRUCT(&orte_plm_globals.spawn_in_progress_cond, opal_condition_t);
|
||||
orte_plm_globals.spawn_complete = false;
|
||||
orte_plm_globals.spawn_in_progress = false;
|
||||
|
||||
/* init the next jobid */
|
||||
orte_plm_globals.next_jobid = 0;
|
||||
|
||||
|
@ -77,6 +77,18 @@ typedef struct {
|
||||
orte_jobid_t local_slaves;
|
||||
/* list of local slave files */
|
||||
opal_list_t slave_files;
|
||||
/* spawn lock */
|
||||
opal_mutex_t spawn_lock;
|
||||
/* spawn cond */
|
||||
opal_condition_t spawn_cond;
|
||||
/* spawn status */
|
||||
int spawn_status;
|
||||
/* completion flag */
|
||||
bool spawn_complete;
|
||||
/* spawn in progress cond */
|
||||
opal_condition_t spawn_in_progress_cond;
|
||||
/* flag: true while a spawn is in progress */
|
||||
bool spawn_in_progress;
|
||||
} orte_plm_globals_t;
|
||||
/**
|
||||
* Global instance of PLM framework data
|
||||
|
@ -969,6 +969,17 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
||||
orte_jobid_t failed_job;
|
||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||
|
||||
/* wait for any spawn already in progress to finish before starting this one */
|
||||
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
|
||||
while (orte_plm_globals.spawn_in_progress) {
|
||||
opal_condition_wait(&orte_plm_globals.spawn_in_progress_cond, &orte_plm_globals.spawn_lock);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "released to spawn"));
|
||||
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
||||
|
||||
orte_plm_globals.spawn_in_progress = true;
|
||||
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
* then we will not be launching an orted - we will
|
||||
@ -977,7 +988,9 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
||||
* provide all the info required to launch the job,
|
||||
* including the target hosts
|
||||
*/
|
||||
return orte_plm_base_local_slave_launch(jdata);
|
||||
rc = orte_plm_base_local_slave_launch(jdata);
|
||||
orte_plm_globals.spawn_in_progress = false;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we are timing, record the start time */
|
||||
@ -1272,6 +1285,17 @@ launch_apps:
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* wait for the launch to complete */
|
||||
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
|
||||
while (!orte_plm_globals.spawn_complete) {
|
||||
opal_condition_wait(&orte_plm_globals.spawn_cond, &orte_plm_globals.spawn_lock);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"completed spawn for job %s", ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
orte_plm_globals.spawn_in_progress = false;
|
||||
opal_condition_broadcast(&orte_plm_globals.spawn_in_progress_cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
||||
|
||||
/* get here if launch went okay */
|
||||
failed_launch = false;
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user