Merge pull request #5523 from tkordenbrock/topic/v4.0.x/fix.PtlMEUnlink.in.use
v4.0.x: coll-portals4: retry PtlMEUnlink() if PTL_IN_USE
Этот коммит содержится в:
Коммит
cdc315c1ac
@@ -343,15 +343,38 @@ allreduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
|
||||
static int
|
||||
allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (request->u.allreduce.is_optim) {
|
||||
PtlAtomicSync();
|
||||
|
||||
if (request->u.allreduce.child_nb) {
|
||||
PtlCTFree(request->u.allreduce.ack_ct_h);
|
||||
ret = PtlCTFree(request->u.allreduce.ack_ct_h);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||
"%s:%d: PtlCTFree failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
PtlMEUnlink(request->u.allreduce.data_me_h);
|
||||
PtlCTFree(request->u.allreduce.trig_ct_h);
|
||||
do {
|
||||
ret = PtlMEUnlink(request->u.allreduce.data_me_h);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||
"%s:%d: PtlMEUnlink failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
ret = PtlCTFree(request->u.allreduce.trig_ct_h);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||
"%s:%d: PtlCTFree failed: %d\n",
|
||||
__FILE__, __LINE__, ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
return (OMPI_SUCCESS);
|
||||
|
@@ -206,7 +206,9 @@ barrier_hypercube_bottom(ompi_coll_portals4_request_t *request)
|
||||
int ret;
|
||||
|
||||
/* cleanup */
|
||||
ret = PtlMEUnlink(request->u.barrier.data_me_h);
|
||||
do {
|
||||
ret = PtlMEUnlink(request->u.barrier.data_me_h);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||
"%s:%d: PtlMEUnlink failed: %d\n",
|
||||
|
@@ -285,7 +285,9 @@ portals4_close(void)
|
||||
mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE;
|
||||
|
||||
if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) {
|
||||
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
|
||||
do {
|
||||
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||
"%s:%d: PtlMEUnlink failed: %d\n",
|
||||
@@ -293,7 +295,9 @@ portals4_close(void)
|
||||
}
|
||||
}
|
||||
if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) {
|
||||
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
|
||||
do {
|
||||
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||
"%s:%d: PtlMEUnlink failed: %d\n",
|
||||
|
@@ -460,7 +460,9 @@ cleanup_gather_handles(ompi_coll_portals4_request_t *request)
|
||||
/**********************************/
|
||||
/* Cleanup Gather Handles */
|
||||
/**********************************/
|
||||
ret = PtlMEUnlink(request->u.gather.gather_meh);
|
||||
do {
|
||||
ret = PtlMEUnlink(request->u.gather.gather_meh);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
|
||||
ret = PtlCTFree(request->u.gather.gather_cth);
|
||||
@@ -484,7 +486,9 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
|
||||
/**********************************/
|
||||
/* Cleanup Sync Handles */
|
||||
/**********************************/
|
||||
ret = PtlMEUnlink(request->u.gather.sync_meh);
|
||||
do {
|
||||
ret = PtlMEUnlink(request->u.gather.sync_meh);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
|
||||
ret = PtlCTFree(request->u.gather.sync_cth);
|
||||
|
@@ -340,24 +340,38 @@ reduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
|
||||
static int
|
||||
reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
|
||||
{
|
||||
int ret, line;
|
||||
|
||||
if (request->u.reduce.is_optim) {
|
||||
PtlAtomicSync();
|
||||
|
||||
if (request->u.reduce.use_ack_ct_h) {
|
||||
PtlCTFree(request->u.reduce.ack_ct_h);
|
||||
ret = PtlCTFree(request->u.reduce.ack_ct_h);
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
}
|
||||
|
||||
if (request->u.reduce.child_nb) {
|
||||
PtlMEUnlink(request->u.reduce.data_me_h);
|
||||
do {
|
||||
ret = PtlMEUnlink(request->u.reduce.data_me_h);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
}
|
||||
|
||||
PtlCTFree(request->u.reduce.trig_ct_h);
|
||||
ret = PtlCTFree(request->u.reduce.trig_ct_h);
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
|
||||
if (request->u.reduce.free_buffer) {
|
||||
free(request->u.reduce.free_buffer);
|
||||
}
|
||||
}
|
||||
return (OMPI_SUCCESS);
|
||||
|
||||
err_hdlr:
|
||||
opal_output(ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d:%4d\tError occurred ret=%d",
|
||||
__FILE__, __LINE__, line, ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
@@ -253,14 +253,8 @@ cleanup_scatter_handles(ompi_coll_portals4_request_t *request)
|
||||
/**********************************/
|
||||
do {
|
||||
ret = PtlMEUnlink(request->u.scatter.scatter_meh);
|
||||
if (PTL_IN_USE == ret) {
|
||||
opal_output(ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d: scatter_meh still in use (ret=%d, rank %2d)",
|
||||
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
|
||||
continue;
|
||||
}
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
} while (ret == PTL_IN_USE);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
|
||||
ret = PtlCTFree(request->u.scatter.scatter_cth);
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
@@ -292,14 +286,8 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
|
||||
/**********************************/
|
||||
do {
|
||||
ret = PtlMEUnlink(request->u.scatter.sync_meh);
|
||||
if (PTL_IN_USE == ret) {
|
||||
opal_output(ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d: sync_meh still in use (ret=%d, rank %2d)",
|
||||
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
|
||||
continue;
|
||||
}
|
||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
} while (ret == PTL_IN_USE);
|
||||
} while (PTL_IN_USE == ret);
|
||||
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
|
||||
ret = PtlCTFree(request->u.scatter.sync_cth);
|
||||
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user