1
1

I believe this should fix the race condition that Terry is seeing in the MTT
tests. It appears that nothing in the errmgr was using the mutexes to protect
the odls child list.

This commit was SVN r25062.
Этот коммит содержится в:
Wesley Bland 2011-08-18 14:52:30 +00:00
родитель 08bb7f562e
Коммит a2a20c3766
2 изменённых файла: 26 добавлений и 0 удалений

Просмотреть файл

@ -33,6 +33,7 @@
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h" #include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/plm.h" #include "orte/mca/plm/plm.h"
#include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rmaps/rmaps_types.h"
@ -1066,6 +1067,8 @@ static void failed_start(orte_job_t *jdata)
} }
jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
for (item = opal_list_get_first(&orte_local_children); for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children); item != opal_list_get_end(&orte_local_children);
item = next) { item = next) {
@ -1088,6 +1091,9 @@ static void failed_start(orte_job_t *jdata)
} }
} }
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported incomplete start", "%s errmgr:hnp: job %s reported incomplete start",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1120,6 +1126,9 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
} }
jobdat->state = jobstate; jobdat->state = jobstate;
jdata->state = jobstate; jdata->state = jobstate;
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
for (item = opal_list_get_first(&orte_local_children); for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children); item != opal_list_get_end(&orte_local_children);
item = next) { item = next) {
@ -1150,6 +1159,10 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
} }
} }
} }
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
} }
void orte_errmgr_hnp_update_proc(orte_job_t *jdata, void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
@ -1178,6 +1191,8 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
} }
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
/*** UPDATE LOCAL CHILD ***/ /*** UPDATE LOCAL CHILD ***/
for (item = opal_list_get_first(&orte_local_children); for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children); item != opal_list_get_end(&orte_local_children);
@ -1225,6 +1240,9 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
} }
} }
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
/*** UPDATE REMOTE CHILD ***/ /*** UPDATE REMOTE CHILD ***/
for (i=0; i < jdata->procs->size; i++) { for (i=0; i < jdata->procs->size; i++) {
if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {

Просмотреть файл

@ -37,6 +37,7 @@
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h" #include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/plm/plm_types.h" #include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h" #include "orte/mca/routed/routed.h"
#include "orte/mca/sensor/sensor.h" #include "orte/mca/sensor/sensor.h"
@ -422,6 +423,7 @@ REPORT_ABORT:
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
/* find this proc in the local children */ /* find this proc in the local children */
for (item = opal_list_get_first(&orte_local_children); for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children); item != opal_list_get_end(&orte_local_children);
@ -455,6 +457,7 @@ REPORT_ABORT:
break; break;
} }
} }
/* send it */ /* send it */
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -676,6 +679,8 @@ static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED);
orte_util_set_epoch(name_item, name_item->epoch + 1); orte_util_set_epoch(name_item, name_item->epoch + 1);
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
/* Remove the dead process from my list of children if applicable */ /* Remove the dead process from my list of children if applicable */
for (item = opal_list_get_first(&orte_local_children); for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children); item != opal_list_get_end(&orte_local_children);
@ -690,6 +695,9 @@ static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
} }
} }
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
/* Remove the route from the routing layer */ /* Remove the route from the routing layer */
orte_routed.delete_route(name_item); orte_routed.delete_route(name_item);
} }