In an application process, the RML posted recvs are controlled by the async progress thread. The call to finalize and close the RML, however, is made from the main thread, so destructing the posted recv list there races with the async thread's accesses to that list. To avoid this race condition, we shift the actual destruct of the posted recv list into the async thread.
Thanks to Gilles for providing the required debug info.
Этот коммит содержится в:
родитель
ec3a38384f
Коммит
4853457b93
@ -26,6 +26,7 @@
|
|||||||
|
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
|
||||||
#include "orte/mca/rml/base/base.h"
|
#include "orte/mca/rml/base/base.h"
|
||||||
@ -74,14 +75,36 @@ static int orte_rml_base_register(mca_base_register_flag_t flags)
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Destruct the RML posted-recv list and optionally signal completion.
 *
 * The signature matches an event callback: orte_rml_base_close() registers
 * this function on orte_event_base (with a pointer to its "active" flag as
 * cbdata) when ORTE_PROC_IS_APP is true, and otherwise calls it directly
 * as cleanup(0, 0, NULL).
 *
 * sd     - not used by this function
 * args   - not used by this function
 * cbdata - NULL, or a pointer to a bool that is set to false once the
 *          list has been destructed
 */
static void cleanup(int sd, short args, void *cbdata)
|
||||||
|
{
|
||||||
|
bool *active = (bool*)cbdata; /* completion flag supplied by the caller; may be NULL */
|
||||||
|
|
||||||
|
/* tear down the posted recvs list itself */
OPAL_LIST_DESTRUCT(&orte_rml_base.posted_recvs);
|
||||||
|
/* a direct call passes NULL, in which case there is nothing to signal */
if (NULL != active) {
|
||||||
|
*active = false; /* clear the flag the caller waits on (ORTE_WAIT_FOR_COMPLETION) */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static int orte_rml_base_close(void)
|
static int orte_rml_base_close(void)
|
||||||
{
|
{
|
||||||
opal_list_item_t *item;
|
bool active;
|
||||||
|
|
||||||
while (NULL != (item = opal_list_remove_first(&orte_rml_base.posted_recvs))) {
|
/* because the RML posted recvs list is in a separate
|
||||||
OBJ_RELEASE(item);
|
* async thread for apps, we can't just destruct it here.
|
||||||
}
|
* Instead, we push it into that event thread and destruct
|
||||||
OBJ_DESTRUCT(&orte_rml_base.posted_recvs);
|
* it there */
|
||||||
|
if (ORTE_PROC_IS_APP) {
|
||||||
|
opal_event_t ev;
|
||||||
|
active = true;
|
||||||
|
opal_event_set(orte_event_base, &ev, -1,
|
||||||
|
OPAL_EV_WRITE, cleanup, &active);
|
||||||
|
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
|
||||||
|
opal_event_active(&ev, OPAL_EV_WRITE, 1);
|
||||||
|
ORTE_WAIT_FOR_COMPLETION(active);
|
||||||
|
} else {
|
||||||
|
/* we can call the destruct directly */
|
||||||
|
cleanup(0, 0, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
OPAL_TIMING_REPORT(orte_rml_base.timing, &tm_rml);
|
OPAL_TIMING_REPORT(orte_rml_base.timing, &tm_rml);
|
||||||
OBJ_DESTRUCT(&orte_rml_base.open_channels);
|
OBJ_DESTRUCT(&orte_rml_base.open_channels);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user