Merge pull request #5500 from tkordenbrock/topic/master/fix.PtlMEUnlink.in.use
coll-portals4: retry PtlMEUnlink() if PTL_IN_USE
Этот коммит содержится в:
Коммит
e9f378e851
@ -343,15 +343,38 @@ allreduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
|
|||||||
static int
|
static int
|
||||||
allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
|
allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
|
||||||
{
|
{
|
||||||
|
int ret;
|
||||||
|
|
||||||
if (request->u.allreduce.is_optim) {
|
if (request->u.allreduce.is_optim) {
|
||||||
PtlAtomicSync();
|
PtlAtomicSync();
|
||||||
|
|
||||||
if (request->u.allreduce.child_nb) {
|
if (request->u.allreduce.child_nb) {
|
||||||
PtlCTFree(request->u.allreduce.ack_ct_h);
|
ret = PtlCTFree(request->u.allreduce.ack_ct_h);
|
||||||
|
if (PTL_OK != ret) {
|
||||||
|
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||||
|
"%s:%d: PtlCTFree failed: %d\n",
|
||||||
|
__FILE__, __LINE__, ret);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PtlMEUnlink(request->u.allreduce.data_me_h);
|
do {
|
||||||
PtlCTFree(request->u.allreduce.trig_ct_h);
|
ret = PtlMEUnlink(request->u.allreduce.data_me_h);
|
||||||
|
} while (PTL_IN_USE == ret);
|
||||||
|
if (PTL_OK != ret) {
|
||||||
|
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||||
|
"%s:%d: PtlMEUnlink failed: %d\n",
|
||||||
|
__FILE__, __LINE__, ret);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = PtlCTFree(request->u.allreduce.trig_ct_h);
|
||||||
|
if (PTL_OK != ret) {
|
||||||
|
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||||
|
"%s:%d: PtlCTFree failed: %d\n",
|
||||||
|
__FILE__, __LINE__, ret);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return (OMPI_SUCCESS);
|
return (OMPI_SUCCESS);
|
||||||
|
@ -206,7 +206,9 @@ barrier_hypercube_bottom(ompi_coll_portals4_request_t *request)
|
|||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
ret = PtlMEUnlink(request->u.barrier.data_me_h);
|
do {
|
||||||
|
ret = PtlMEUnlink(request->u.barrier.data_me_h);
|
||||||
|
} while (PTL_IN_USE == ret);
|
||||||
if (PTL_OK != ret) {
|
if (PTL_OK != ret) {
|
||||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||||
"%s:%d: PtlMEUnlink failed: %d\n",
|
"%s:%d: PtlMEUnlink failed: %d\n",
|
||||||
|
@ -285,7 +285,9 @@ portals4_close(void)
|
|||||||
mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE;
|
mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE;
|
||||||
|
|
||||||
if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) {
|
if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) {
|
||||||
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
|
do {
|
||||||
|
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
|
||||||
|
} while (PTL_IN_USE == ret);
|
||||||
if (PTL_OK != ret) {
|
if (PTL_OK != ret) {
|
||||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||||
"%s:%d: PtlMEUnlink failed: %d\n",
|
"%s:%d: PtlMEUnlink failed: %d\n",
|
||||||
@ -293,7 +295,9 @@ portals4_close(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) {
|
if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) {
|
||||||
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
|
do {
|
||||||
|
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
|
||||||
|
} while (PTL_IN_USE == ret);
|
||||||
if (PTL_OK != ret) {
|
if (PTL_OK != ret) {
|
||||||
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
|
||||||
"%s:%d: PtlMEUnlink failed: %d\n",
|
"%s:%d: PtlMEUnlink failed: %d\n",
|
||||||
|
@ -460,7 +460,9 @@ cleanup_gather_handles(ompi_coll_portals4_request_t *request)
|
|||||||
/**********************************/
|
/**********************************/
|
||||||
/* Cleanup Gather Handles */
|
/* Cleanup Gather Handles */
|
||||||
/**********************************/
|
/**********************************/
|
||||||
ret = PtlMEUnlink(request->u.gather.gather_meh);
|
do {
|
||||||
|
ret = PtlMEUnlink(request->u.gather.gather_meh);
|
||||||
|
} while (PTL_IN_USE == ret);
|
||||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
|
|
||||||
ret = PtlCTFree(request->u.gather.gather_cth);
|
ret = PtlCTFree(request->u.gather.gather_cth);
|
||||||
@ -484,7 +486,9 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
|
|||||||
/**********************************/
|
/**********************************/
|
||||||
/* Cleanup Sync Handles */
|
/* Cleanup Sync Handles */
|
||||||
/**********************************/
|
/**********************************/
|
||||||
ret = PtlMEUnlink(request->u.gather.sync_meh);
|
do {
|
||||||
|
ret = PtlMEUnlink(request->u.gather.sync_meh);
|
||||||
|
} while (PTL_IN_USE == ret);
|
||||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
|
|
||||||
ret = PtlCTFree(request->u.gather.sync_cth);
|
ret = PtlCTFree(request->u.gather.sync_cth);
|
||||||
|
@ -340,24 +340,38 @@ reduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
|
|||||||
static int
|
static int
|
||||||
reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
|
reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
|
||||||
{
|
{
|
||||||
|
int ret, line;
|
||||||
|
|
||||||
if (request->u.reduce.is_optim) {
|
if (request->u.reduce.is_optim) {
|
||||||
PtlAtomicSync();
|
PtlAtomicSync();
|
||||||
|
|
||||||
if (request->u.reduce.use_ack_ct_h) {
|
if (request->u.reduce.use_ack_ct_h) {
|
||||||
PtlCTFree(request->u.reduce.ack_ct_h);
|
ret = PtlCTFree(request->u.reduce.ack_ct_h);
|
||||||
|
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
}
|
}
|
||||||
|
|
||||||
if (request->u.reduce.child_nb) {
|
if (request->u.reduce.child_nb) {
|
||||||
PtlMEUnlink(request->u.reduce.data_me_h);
|
do {
|
||||||
|
ret = PtlMEUnlink(request->u.reduce.data_me_h);
|
||||||
|
} while (PTL_IN_USE == ret);
|
||||||
|
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
}
|
}
|
||||||
|
|
||||||
PtlCTFree(request->u.reduce.trig_ct_h);
|
ret = PtlCTFree(request->u.reduce.trig_ct_h);
|
||||||
|
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
|
|
||||||
if (request->u.reduce.free_buffer) {
|
if (request->u.reduce.free_buffer) {
|
||||||
free(request->u.reduce.free_buffer);
|
free(request->u.reduce.free_buffer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return (OMPI_SUCCESS);
|
return (OMPI_SUCCESS);
|
||||||
|
|
||||||
|
err_hdlr:
|
||||||
|
opal_output(ompi_coll_base_framework.framework_output,
|
||||||
|
"%s:%4d:%4d\tError occurred ret=%d",
|
||||||
|
__FILE__, __LINE__, line, ret);
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -253,14 +253,8 @@ cleanup_scatter_handles(ompi_coll_portals4_request_t *request)
|
|||||||
/**********************************/
|
/**********************************/
|
||||||
do {
|
do {
|
||||||
ret = PtlMEUnlink(request->u.scatter.scatter_meh);
|
ret = PtlMEUnlink(request->u.scatter.scatter_meh);
|
||||||
if (PTL_IN_USE == ret) {
|
} while (PTL_IN_USE == ret);
|
||||||
opal_output(ompi_coll_base_framework.framework_output,
|
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
"%s:%4d: scatter_meh still in use (ret=%d, rank %2d)",
|
|
||||||
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
|
||||||
} while (ret == PTL_IN_USE);
|
|
||||||
|
|
||||||
ret = PtlCTFree(request->u.scatter.scatter_cth);
|
ret = PtlCTFree(request->u.scatter.scatter_cth);
|
||||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
@ -292,14 +286,8 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
|
|||||||
/**********************************/
|
/**********************************/
|
||||||
do {
|
do {
|
||||||
ret = PtlMEUnlink(request->u.scatter.sync_meh);
|
ret = PtlMEUnlink(request->u.scatter.sync_meh);
|
||||||
if (PTL_IN_USE == ret) {
|
} while (PTL_IN_USE == ret);
|
||||||
opal_output(ompi_coll_base_framework.framework_output,
|
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
"%s:%4d: sync_meh still in use (ret=%d, rank %2d)",
|
|
||||||
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
|
||||||
} while (ret == PTL_IN_USE);
|
|
||||||
|
|
||||||
ret = PtlCTFree(request->u.scatter.sync_cth);
|
ret = PtlCTFree(request->u.scatter.sync_cth);
|
||||||
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user