Use a condition-variable wait to serialize launches when they come from multiple sources (e.g., an ORTE application that spawns multiple jobs).
This commit was SVN r21718.
Этот коммит содержится в:
родитель
1a5f7245c8
Коммит
6c85d954f3
@ -61,6 +61,10 @@ int orte_plm_base_close(void)
|
|||||||
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_lock);
|
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_lock);
|
||||||
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_cond);
|
OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_cond);
|
||||||
|
|
||||||
|
/* clearout the spawn locks */
|
||||||
|
OBJ_DESTRUCT(&orte_plm_globals.spawn_lock);
|
||||||
|
OBJ_DESTRUCT(&orte_plm_globals.spawn_cond);
|
||||||
|
|
||||||
#ifndef __WINDOWS__
|
#ifndef __WINDOWS__
|
||||||
/* clearout the rsh support */
|
/* clearout the rsh support */
|
||||||
orte_plm_base_local_slave_finalize();
|
orte_plm_base_local_slave_finalize();
|
||||||
|
@ -252,7 +252,8 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
|||||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||||
/* bad jobid */
|
/* bad jobid */
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||||
return ORTE_ERR_BAD_PARAM;
|
rc = ORTE_ERR_BAD_PARAM;
|
||||||
|
goto WAKEUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup the buffer */
|
/* setup the buffer */
|
||||||
@ -263,13 +264,13 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
|||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
OBJ_RELEASE(buffer);
|
OBJ_RELEASE(buffer);
|
||||||
return rc;
|
goto WAKEUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get the local launcher's required data */
|
/* get the local launcher's required data */
|
||||||
if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(buffer, job))) {
|
if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(buffer, job))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
goto WAKEUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if we are timing, record the time we send this message */
|
/* if we are timing, record the time we send this message */
|
||||||
@ -282,7 +283,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
|||||||
buffer, ORTE_RML_TAG_DAEMON))) {
|
buffer, ORTE_RML_TAG_DAEMON))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
OBJ_RELEASE(buffer);
|
OBJ_RELEASE(buffer);
|
||||||
return rc;
|
goto WAKEUP;
|
||||||
}
|
}
|
||||||
OBJ_RELEASE(buffer);
|
OBJ_RELEASE(buffer);
|
||||||
|
|
||||||
@ -292,7 +293,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
|||||||
"%s plm:base:launch failed for job %s on error %s",
|
"%s plm:base:launch failed for job %s on error %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_JOBID_PRINT(job), ORTE_ERROR_NAME(rc)));
|
ORTE_JOBID_PRINT(job), ORTE_ERROR_NAME(rc)));
|
||||||
return rc;
|
goto WAKEUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (orte_timing) {
|
if (orte_timing) {
|
||||||
@ -316,13 +317,20 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
|||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
|
if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
goto WAKEUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
"%s plm:base:launch completed for job %s",
|
"%s plm:base:launch completed for job %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_JOBID_PRINT(job)));
|
ORTE_JOBID_PRINT(job)));
|
||||||
|
|
||||||
|
WAKEUP:
|
||||||
|
/* wakeup anyone waiting for this */
|
||||||
|
orte_plm_globals.spawn_complete = true;
|
||||||
|
orte_plm_globals.spawn_status = rc;
|
||||||
|
opal_condition_broadcast(&orte_plm_globals.spawn_cond);
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -136,6 +136,13 @@ int orte_plm_base_open(void)
|
|||||||
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_lock, opal_mutex_t);
|
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_lock, opal_mutex_t);
|
||||||
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_cond, opal_condition_t);
|
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_cond, opal_condition_t);
|
||||||
|
|
||||||
|
/* initialize the condition variables for spawn */
|
||||||
|
OBJ_CONSTRUCT(&orte_plm_globals.spawn_lock, opal_mutex_t);
|
||||||
|
OBJ_CONSTRUCT(&orte_plm_globals.spawn_cond, opal_condition_t);
|
||||||
|
OBJ_CONSTRUCT(&orte_plm_globals.spawn_in_progress_cond, opal_condition_t);
|
||||||
|
orte_plm_globals.spawn_complete = false;
|
||||||
|
orte_plm_globals.spawn_in_progress = false;
|
||||||
|
|
||||||
/* init the next jobid */
|
/* init the next jobid */
|
||||||
orte_plm_globals.next_jobid = 0;
|
orte_plm_globals.next_jobid = 0;
|
||||||
|
|
||||||
|
@ -77,6 +77,18 @@ typedef struct {
|
|||||||
orte_jobid_t local_slaves;
|
orte_jobid_t local_slaves;
|
||||||
/* list of local slave files */
|
/* list of local slave files */
|
||||||
opal_list_t slave_files;
|
opal_list_t slave_files;
|
||||||
|
/* spawn lock */
|
||||||
|
opal_mutex_t spawn_lock;
|
||||||
|
/* spawn cond */
|
||||||
|
opal_condition_t spawn_cond;
|
||||||
|
/* spawn status */
|
||||||
|
int spawn_status;
|
||||||
|
/* completion flag */
|
||||||
|
bool spawn_complete;
|
||||||
|
/* spawn in progress cond */
|
||||||
|
opal_condition_t spawn_in_progress_cond;
|
||||||
|
/* flag */
|
||||||
|
bool spawn_in_progress;
|
||||||
} orte_plm_globals_t;
|
} orte_plm_globals_t;
|
||||||
/**
|
/**
|
||||||
* Global instance of PLM framework data
|
* Global instance of PLM framework data
|
||||||
|
@ -969,6 +969,17 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
|||||||
orte_jobid_t failed_job;
|
orte_jobid_t failed_job;
|
||||||
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
|
||||||
|
|
||||||
|
/* wait for the launch to complete */
|
||||||
|
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
|
||||||
|
while (orte_plm_globals.spawn_in_progress) {
|
||||||
|
opal_condition_wait(&orte_plm_globals.spawn_in_progress_cond, &orte_plm_globals.spawn_lock);
|
||||||
|
}
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "released to spawn"));
|
||||||
|
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
||||||
|
|
||||||
|
orte_plm_globals.spawn_in_progress = true;
|
||||||
|
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
|
||||||
|
|
||||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||||
/* if this is a request to launch a local slave,
|
/* if this is a request to launch a local slave,
|
||||||
* then we will not be launching an orted - we will
|
* then we will not be launching an orted - we will
|
||||||
@ -977,7 +988,9 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
|||||||
* provide all the info required to launch the job,
|
* provide all the info required to launch the job,
|
||||||
* including the target hosts
|
* including the target hosts
|
||||||
*/
|
*/
|
||||||
return orte_plm_base_local_slave_launch(jdata);
|
rc = orte_plm_base_local_slave_launch(jdata);
|
||||||
|
orte_plm_globals.spawn_in_progress = false;
|
||||||
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if we are timing, record the start time */
|
/* if we are timing, record the start time */
|
||||||
@ -1272,6 +1285,17 @@ launch_apps:
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* wait for the launch to complete */
|
||||||
|
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
|
||||||
|
while (!orte_plm_globals.spawn_complete) {
|
||||||
|
opal_condition_wait(&orte_plm_globals.spawn_cond, &orte_plm_globals.spawn_lock);
|
||||||
|
}
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||||
|
"completed spawn for job %s", ORTE_JOBID_PRINT(jdata->jobid)));
|
||||||
|
orte_plm_globals.spawn_in_progress = false;
|
||||||
|
opal_condition_broadcast(&orte_plm_globals.spawn_in_progress_cond);
|
||||||
|
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
||||||
|
|
||||||
/* get here if launch went okay */
|
/* get here if launch went okay */
|
||||||
failed_launch = false;
|
failed_launch = false;
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user