1
1

Fix a situation where we were releasing a thread lock that is not held during the main launch - the lock is only used for dynamic spawns.

This commit was SVN r23682.
Этот коммит содержится в:
Ralph Castain 2010-08-28 14:03:17 +00:00
родитель c0685fc673
Коммит 554aede041
4 изменённых файла: 32 добавления и 26 удалений

Просмотреть файл

@@ -878,10 +878,11 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
 jdata->num_launched++;
 } else if (ORTE_PROC_STATE_REGISTERED == state) {
 jdata->num_reported++;
-if (jdata->num_reported == jdata->num_procs) {
-OPAL_RELEASE_THREAD(&jdata->reported_lock,
-&jdata->reported_cond,
-&jdata->not_reported);
+if (jdata->dyn_spawn_active &&
+jdata->num_reported == jdata->num_procs) {
+OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock,
+&jdata->dyn_spawn_cond,
+&jdata->dyn_spawn_active);
 }
 }
 }
@@ -931,10 +932,11 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
 }
 } else if (ORTE_PROC_STATE_REGISTERED == state) {
 jdata->num_reported++;
-if (jdata->num_reported == jdata->num_procs) {
-OPAL_RELEASE_THREAD(&jdata->reported_lock,
-&jdata->reported_cond,
-&jdata->not_reported);
+if (jdata->dyn_spawn_active &&
+jdata->num_reported == jdata->num_procs) {
+OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock,
+&jdata->dyn_spawn_cond,
+&jdata->dyn_spawn_active);
 }
 }
 return;
@@ -958,10 +960,11 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
 proct->exit_code = exit_code;
 if (ORTE_PROC_STATE_REGISTERED == state) {
 jdata->num_reported++;
-if (jdata->num_reported == jdata->num_procs) {
-OPAL_RELEASE_THREAD(&jdata->reported_lock,
-&jdata->reported_cond,
-&jdata->not_reported);
+if (jdata->dyn_spawn_active &&
+jdata->num_reported == jdata->num_procs) {
+OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock,
+&jdata->dyn_spawn_cond,
+&jdata->dyn_spawn_active);
 }
 } else if (ORTE_PROC_STATE_UNTERMINATED < state) {
 /* update the counter so we can terminate */

Просмотреть файл

@@ -196,6 +196,9 @@ static void process_msg(int fd, short event, void *data)
 goto ANSWER_LAUNCH;
 }
+/* flag that this is a dynamic spawn */
+jdata->dyn_spawn_active = true;
 /* if is a LOCAL slave cmd */
 if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
 OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
@@ -299,10 +302,10 @@ static void process_msg(int fd, short event, void *data)
 /* we will wait here until the thread is released,
 * indicating that all procs have reported
 */
-OPAL_ACQUIRE_THREAD(&jdata->reported_lock,
-&jdata->reported_cond,
-&jdata->not_reported);
-OPAL_THREAD_UNLOCK(&jdata->reported_lock);
+OPAL_ACQUIRE_THREAD(&jdata->dyn_spawn_lock,
+&jdata->dyn_spawn_cond,
+&jdata->dyn_spawn_active);
+OPAL_THREAD_UNLOCK(&jdata->dyn_spawn_lock);
 OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
 }

Просмотреть файл

@@ -655,9 +655,9 @@ static void orte_job_construct(orte_job_t* job)
 job->abort = false;
 job->aborted_proc = NULL;
-OBJ_CONSTRUCT(&job->reported_lock, opal_mutex_t);
-OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
-job->not_reported = true;
+OBJ_CONSTRUCT(&job->dyn_spawn_lock, opal_mutex_t);
+OBJ_CONSTRUCT(&job->dyn_spawn_cond, opal_condition_t);
+job->dyn_spawn_active = false;
 job->enable_recovery = false;
@@ -719,8 +719,8 @@ static void orte_job_destruct(orte_job_t* job)
 }
 OBJ_RELEASE(job->procs);
-OBJ_DESTRUCT(&job->reported_lock);
-OBJ_DESTRUCT(&job->reported_cond);
+OBJ_DESTRUCT(&job->dyn_spawn_lock);
+OBJ_DESTRUCT(&job->dyn_spawn_cond);
 #if OPAL_ENABLE_FT_CR == 1
 if (NULL != job->ckpt_snapshot_ref) {

Просмотреть файл

@@ -414,10 +414,10 @@ typedef struct {
 orte_vpid_t num_terminated;
 /* number of daemons reported launched so we can track progress */
 orte_vpid_t num_daemons_reported;
-/* lock/cond/flag for tracking when all procs reported */
-opal_mutex_t reported_lock;
-opal_condition_t reported_cond;
-bool not_reported;
+/* lock/cond/flag for tracking when all procs reported on dynamic spawn */
+opal_mutex_t dyn_spawn_lock;
+opal_condition_t dyn_spawn_cond;
+bool dyn_spawn_active;
 /* did this job abort? */
 bool abort;
 /* proc that caused that to happen */