osc/rdma: fix data race on teardown
The osc/rdma module did not wait for all pending atomics to complete before tearing down. This could lead to weird issues as the target location may no longer be registered or allocated. This commit also fixes an offset calculation issue in ompi_osc_get_data_blocking (). Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
c9e58cedc1
Коммит
e4989714c2
@ -269,6 +269,9 @@ struct ompi_osc_rdma_module_t {
|
||||
|
||||
/** number of time a get had to be retried */
|
||||
unsigned long get_retry_count;
|
||||
|
||||
/** outstanding atomic operations */
|
||||
volatile int32_t pending_ops;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
|
||||
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
|
||||
|
@ -56,6 +56,7 @@ static void ompi_osc_rdma_pending_op_construct (ompi_osc_rdma_pending_op_t *pend
|
||||
pending_op->op_result = NULL;
|
||||
pending_op->op_complete = false;
|
||||
pending_op->cbfunc = NULL;
|
||||
pending_op->module = NULL;
|
||||
}
|
||||
|
||||
static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pending_op)
|
||||
@ -64,6 +65,10 @@ static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pendi
|
||||
ompi_osc_rdma_frag_complete (pending_op->op_frag);
|
||||
}
|
||||
|
||||
if (NULL != pending_op->module) {
|
||||
(void) opal_atomic_fetch_add_32 (&pending_op->module->pending_ops, -1);
|
||||
}
|
||||
|
||||
ompi_osc_rdma_pending_op_construct (pending_op);
|
||||
}
|
||||
|
||||
|
@ -63,7 +63,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b
|
||||
ompi_osc_rdma_frag_t *frag = NULL;
|
||||
volatile bool read_complete = false;
|
||||
size_t aligned_len, offset;
|
||||
uint64_t aligned_addr = (source_address + btl_alignment_mask) & ~btl_alignment_mask;
|
||||
uint64_t aligned_addr = source_address & ~btl_alignment_mask;
|
||||
char *ptr = data;
|
||||
int ret;
|
||||
|
||||
|
@ -47,6 +47,10 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, struct
|
||||
|
||||
if (wait_for_completion) {
|
||||
OBJ_RETAIN(pending_op);
|
||||
} else {
|
||||
/* NTH: need to keep track of pending ops to avoid a potential teardown problem */
|
||||
pending_op->module = module;
|
||||
(void) opal_atomic_fetch_add_32 (&module->pending_ops, 1);
|
||||
}
|
||||
|
||||
pending_op->op_result = (void *) result;
|
||||
@ -130,6 +134,12 @@ static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, struct m
|
||||
pending_op->cbcontext = cbcontext;
|
||||
}
|
||||
|
||||
if (!wait_for_completion) {
|
||||
/* NTH: need to keep track of pending ops to avoid a potential teardown problem */
|
||||
pending_op->module = module;
|
||||
(void) opal_atomic_fetch_add_32 (&module->pending_ops, 1);
|
||||
}
|
||||
|
||||
/* spin until the btl has accepted the operation */
|
||||
do {
|
||||
ret = module->selected_btl->btl_atomic_op (module->selected_btl, endpoint, (intptr_t) address, address_handle,
|
||||
|
@ -51,6 +51,10 @@ int ompi_osc_rdma_free(ompi_win_t *win)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
while (module->pending_ops) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
}
|
||||
|
||||
if (NULL != module->comm) {
|
||||
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
|
||||
"rdma component destroying window with id %d",
|
||||
|
@ -209,6 +209,7 @@ typedef void (*ompi_osc_rdma_pending_op_cb_fn_t) (void *, void *, int);
|
||||
|
||||
struct ompi_osc_rdma_pending_op_t {
|
||||
opal_list_item_t super;
|
||||
struct ompi_osc_rdma_module_t *module;
|
||||
struct ompi_osc_rdma_frag_t *op_frag;
|
||||
void *op_buffer;
|
||||
void *op_result;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user