1
1

Use a conditioned wait to serialize launches when they come from multiple sources (e.g., an orte application that spawns multiple jobs).

This commit was SVN r21718.
Этот коммит содержится в:
Ralph Castain 2009-07-20 01:51:29 +00:00
родитель 1a5f7245c8
Коммит 6c85d954f3
5 изменённых файлов: 62 добавлений и 7 удалений

Просмотреть файл

@ -61,6 +61,10 @@ int orte_plm_base_close(void)
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_lock);
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_cond);
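/* clear out the spawn lock and condition (NOTE(review): spawn_in_progress_cond is constructed in orte_plm_base_open but does not appear to be destructed here - confirm) */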
OBJ_DESTRUCT(&orte_plm_globals.spawn_lock);
OBJ_DESTRUCT(&orte_plm_globals.spawn_cond);
#ifndef __WINDOWS__
/* clearout the rsh support */
orte_plm_base_local_slave_finalize();

Просмотреть файл

@ -252,7 +252,8 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
if (NULL == (jdata = orte_get_job_data_object(job))) {
/* bad jobid */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
rc = ORTE_ERR_BAD_PARAM;
goto WAKEUP;
}
/* setup the buffer */
@ -263,13 +264,13 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
goto WAKEUP;
}
/* get the local launcher's required data */
if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(buffer, job))) {
ORTE_ERROR_LOG(rc);
return rc;
goto WAKEUP;
}
/* if we are timing, record the time we send this message */
@ -282,7 +283,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
buffer, ORTE_RML_TAG_DAEMON))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
goto WAKEUP;
}
OBJ_RELEASE(buffer);
@ -292,7 +293,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
"%s plm:base:launch failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job), ORTE_ERROR_NAME(rc)));
return rc;
goto WAKEUP;
}
if (orte_timing) {
@ -316,13 +317,20 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
ORTE_ERROR_LOG(rc);
return rc;
goto WAKEUP;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:launch completed for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job)));
WAKEUP:
/* wake up anyone waiting for this launch to complete */
orte_plm_globals.spawn_complete = true;
orte_plm_globals.spawn_status = rc;
opal_condition_broadcast(&orte_plm_globals.spawn_cond);
return rc;
}

Просмотреть файл

@ -136,6 +136,13 @@ int orte_plm_base_open(void)
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_lock, opal_mutex_t);
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_cond, opal_condition_t);
/* initialize the condition variables for spawn */
OBJ_CONSTRUCT(&orte_plm_globals.spawn_lock, opal_mutex_t);
OBJ_CONSTRUCT(&orte_plm_globals.spawn_cond, opal_condition_t);
OBJ_CONSTRUCT(&orte_plm_globals.spawn_in_progress_cond, opal_condition_t);
orte_plm_globals.spawn_complete = false;
orte_plm_globals.spawn_in_progress = false;
/* init the next jobid */
orte_plm_globals.next_jobid = 0;

Просмотреть файл

@ -77,6 +77,18 @@ typedef struct {
orte_jobid_t local_slaves;
/* list of local slave files */
opal_list_t slave_files;
/* spawn lock */
opal_mutex_t spawn_lock;
/* spawn cond */
opal_condition_t spawn_cond;
/* spawn status */
int spawn_status;
/* completion flag */
bool spawn_complete;
/* spawn in progress cond */
opal_condition_t spawn_in_progress_cond;
/* spawn in progress flag */
bool spawn_in_progress;
} orte_plm_globals_t;
/**
* Global instance of PLM framework data

Просмотреть файл

@ -969,6 +969,17 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
/* wait for any spawn already in progress to complete before starting this one */
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
while (orte_plm_globals.spawn_in_progress) {
opal_condition_wait(&orte_plm_globals.spawn_in_progress_cond, &orte_plm_globals.spawn_lock);
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "released to spawn"));
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
orte_plm_globals.spawn_in_progress = true;
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave,
* then we will not be launching an orted - we will
@ -977,7 +988,9 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
* provide all the info required to launch the job,
* including the target hosts
*/
return orte_plm_base_local_slave_launch(jdata);
rc = orte_plm_base_local_slave_launch(jdata);
orte_plm_globals.spawn_in_progress = false;
return rc;
}
/* if we are timing, record the start time */
@ -1272,6 +1285,17 @@ launch_apps:
goto cleanup;
}
/* wait for the launch to complete */
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
while (!orte_plm_globals.spawn_complete) {
opal_condition_wait(&orte_plm_globals.spawn_cond, &orte_plm_globals.spawn_lock);
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"completed spawn for job %s", ORTE_JOBID_PRINT(jdata->jobid)));
orte_plm_globals.spawn_in_progress = false;
opal_condition_broadcast(&orte_plm_globals.spawn_in_progress_cond);
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
/* get here if launch went okay */
failed_launch = false;