1
1

osc/rdma: fix data race on teardown

The osc/rdma module did not wait for all pending atomic operations to
complete before tearing down. An operation completing after teardown
could then access a target location that is no longer registered or
allocated, leading to hard-to-diagnose failures.

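This commit also fixes an offset calculation issue in
ompi_osc_get_data_blocking (): the aligned source address is now rounded
down to the BTL alignment boundary instead of being rounded up.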

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2018-06-25 09:47:44 -06:00 коммит произвёл Nathan Hjelm
родитель c9e58cedc1
Коммит e4989714c2
6 изменённых файлов: 24 добавлений и 1 удалений

Просмотреть файл

@ -269,6 +269,9 @@ struct ompi_osc_rdma_module_t {
/** number of times a get had to be retried */
unsigned long get_retry_count;
/** outstanding atomic operations */
volatile int32_t pending_ops;
};
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;

Просмотреть файл

@ -56,6 +56,7 @@ static void ompi_osc_rdma_pending_op_construct (ompi_osc_rdma_pending_op_t *pend
pending_op->op_result = NULL;
pending_op->op_complete = false;
pending_op->cbfunc = NULL;
pending_op->module = NULL;
}
static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pending_op)
@ -64,6 +65,10 @@ static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pendi
ompi_osc_rdma_frag_complete (pending_op->op_frag);
}
if (NULL != pending_op->module) {
(void) opal_atomic_fetch_add_32 (&pending_op->module->pending_ops, -1);
}
ompi_osc_rdma_pending_op_construct (pending_op);
}

Просмотреть файл

@ -63,7 +63,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b
ompi_osc_rdma_frag_t *frag = NULL;
volatile bool read_complete = false;
size_t aligned_len, offset;
uint64_t aligned_addr = (source_address + btl_alignment_mask) & ~btl_alignment_mask;
uint64_t aligned_addr = source_address & ~btl_alignment_mask;
char *ptr = data;
int ret;

Просмотреть файл

@ -47,6 +47,10 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, struct
if (wait_for_completion) {
OBJ_RETAIN(pending_op);
} else {
/* NTH: need to keep track of pending ops to avoid a potential teardown problem */
pending_op->module = module;
(void) opal_atomic_fetch_add_32 (&module->pending_ops, 1);
}
pending_op->op_result = (void *) result;
@ -130,6 +134,12 @@ static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, struct m
pending_op->cbcontext = cbcontext;
}
if (!wait_for_completion) {
/* NTH: need to keep track of pending ops to avoid a potential teardown problem */
pending_op->module = module;
(void) opal_atomic_fetch_add_32 (&module->pending_ops, 1);
}
/* spin until the btl has accepted the operation */
do {
ret = module->selected_btl->btl_atomic_op (module->selected_btl, endpoint, (intptr_t) address, address_handle,

Просмотреть файл

@ -51,6 +51,10 @@ int ompi_osc_rdma_free(ompi_win_t *win)
return OMPI_SUCCESS;
}
while (module->pending_ops) {
ompi_osc_rdma_progress (module);
}
if (NULL != module->comm) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
"rdma component destroying window with id %d",

Просмотреть файл

@ -209,6 +209,7 @@ typedef void (*ompi_osc_rdma_pending_op_cb_fn_t) (void *, void *, int);
struct ompi_osc_rdma_pending_op_t {
opal_list_item_t super;
struct ompi_osc_rdma_module_t *module;
struct ompi_osc_rdma_frag_t *op_frag;
void *op_buffer;
void *op_result;