Fix a situation where we were unlocking a thread that isn't locked during the main launch - the lock is only used for dynamic spawns.
This commit was SVN r23682.
Этот коммит содержится в:
родитель
c0685fc673
Коммит
554aede041
@ -878,10 +878,11 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
|
|||||||
jdata->num_launched++;
|
jdata->num_launched++;
|
||||||
} else if (ORTE_PROC_STATE_REGISTERED == state) {
|
} else if (ORTE_PROC_STATE_REGISTERED == state) {
|
||||||
jdata->num_reported++;
|
jdata->num_reported++;
|
||||||
if (jdata->num_reported == jdata->num_procs) {
|
if (jdata->dyn_spawn_active &&
|
||||||
OPAL_RELEASE_THREAD(&jdata->reported_lock,
|
jdata->num_reported == jdata->num_procs) {
|
||||||
&jdata->reported_cond,
|
OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock,
|
||||||
&jdata->not_reported);
|
&jdata->dyn_spawn_cond,
|
||||||
|
&jdata->dyn_spawn_active);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -931,10 +932,11 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
|||||||
}
|
}
|
||||||
} else if (ORTE_PROC_STATE_REGISTERED == state) {
|
} else if (ORTE_PROC_STATE_REGISTERED == state) {
|
||||||
jdata->num_reported++;
|
jdata->num_reported++;
|
||||||
if (jdata->num_reported == jdata->num_procs) {
|
if (jdata->dyn_spawn_active &&
|
||||||
OPAL_RELEASE_THREAD(&jdata->reported_lock,
|
jdata->num_reported == jdata->num_procs) {
|
||||||
&jdata->reported_cond,
|
OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock,
|
||||||
&jdata->not_reported);
|
&jdata->dyn_spawn_cond,
|
||||||
|
&jdata->dyn_spawn_active);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
@ -958,10 +960,11 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
|||||||
proct->exit_code = exit_code;
|
proct->exit_code = exit_code;
|
||||||
if (ORTE_PROC_STATE_REGISTERED == state) {
|
if (ORTE_PROC_STATE_REGISTERED == state) {
|
||||||
jdata->num_reported++;
|
jdata->num_reported++;
|
||||||
if (jdata->num_reported == jdata->num_procs) {
|
if (jdata->dyn_spawn_active &&
|
||||||
OPAL_RELEASE_THREAD(&jdata->reported_lock,
|
jdata->num_reported == jdata->num_procs) {
|
||||||
&jdata->reported_cond,
|
OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock,
|
||||||
&jdata->not_reported);
|
&jdata->dyn_spawn_cond,
|
||||||
|
&jdata->dyn_spawn_active);
|
||||||
}
|
}
|
||||||
} else if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
} else if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
||||||
/* update the counter so we can terminate */
|
/* update the counter so we can terminate */
|
||||||
|
@ -195,7 +195,10 @@ static void process_msg(int fd, short event, void *data)
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto ANSWER_LAUNCH;
|
goto ANSWER_LAUNCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* flag that this is a dynamic spawn */
|
||||||
|
jdata->dyn_spawn_active = true;
|
||||||
|
|
||||||
/* if is a LOCAL slave cmd */
|
/* if is a LOCAL slave cmd */
|
||||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
@ -299,10 +302,10 @@ static void process_msg(int fd, short event, void *data)
|
|||||||
/* we will wait here until the thread is released,
|
/* we will wait here until the thread is released,
|
||||||
* indicating that all procs have reported
|
* indicating that all procs have reported
|
||||||
*/
|
*/
|
||||||
OPAL_ACQUIRE_THREAD(&jdata->reported_lock,
|
OPAL_ACQUIRE_THREAD(&jdata->dyn_spawn_lock,
|
||||||
&jdata->reported_cond,
|
&jdata->dyn_spawn_cond,
|
||||||
&jdata->not_reported);
|
&jdata->dyn_spawn_active);
|
||||||
OPAL_THREAD_UNLOCK(&jdata->reported_lock);
|
OPAL_THREAD_UNLOCK(&jdata->dyn_spawn_lock);
|
||||||
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
|
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -655,9 +655,9 @@ static void orte_job_construct(orte_job_t* job)
|
|||||||
job->abort = false;
|
job->abort = false;
|
||||||
job->aborted_proc = NULL;
|
job->aborted_proc = NULL;
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&job->reported_lock, opal_mutex_t);
|
OBJ_CONSTRUCT(&job->dyn_spawn_lock, opal_mutex_t);
|
||||||
OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
|
OBJ_CONSTRUCT(&job->dyn_spawn_cond, opal_condition_t);
|
||||||
job->not_reported = true;
|
job->dyn_spawn_active = false;
|
||||||
|
|
||||||
job->enable_recovery = false;
|
job->enable_recovery = false;
|
||||||
|
|
||||||
@ -719,8 +719,8 @@ static void orte_job_destruct(orte_job_t* job)
|
|||||||
}
|
}
|
||||||
OBJ_RELEASE(job->procs);
|
OBJ_RELEASE(job->procs);
|
||||||
|
|
||||||
OBJ_DESTRUCT(&job->reported_lock);
|
OBJ_DESTRUCT(&job->dyn_spawn_lock);
|
||||||
OBJ_DESTRUCT(&job->reported_cond);
|
OBJ_DESTRUCT(&job->dyn_spawn_cond);
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT_CR == 1
|
#if OPAL_ENABLE_FT_CR == 1
|
||||||
if (NULL != job->ckpt_snapshot_ref) {
|
if (NULL != job->ckpt_snapshot_ref) {
|
||||||
|
@ -414,10 +414,10 @@ typedef struct {
|
|||||||
orte_vpid_t num_terminated;
|
orte_vpid_t num_terminated;
|
||||||
/* number of daemons reported launched so we can track progress */
|
/* number of daemons reported launched so we can track progress */
|
||||||
orte_vpid_t num_daemons_reported;
|
orte_vpid_t num_daemons_reported;
|
||||||
/* lock/cond/flag for tracking when all procs reported */
|
/* lock/cond/flag for tracking when all procs reported on dynamic spawn */
|
||||||
opal_mutex_t reported_lock;
|
opal_mutex_t dyn_spawn_lock;
|
||||||
opal_condition_t reported_cond;
|
opal_condition_t dyn_spawn_cond;
|
||||||
bool not_reported;
|
bool dyn_spawn_active;
|
||||||
/* did this job abort? */
|
/* did this job abort? */
|
||||||
bool abort;
|
bool abort;
|
||||||
/* proc that caused that to happen */
|
/* proc that caused that to happen */
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user