1
1

Merge pull request #5523 from tkordenbrock/topic/v4.0.x/fix.PtlMEUnlink.in.use

v4.0.x: coll-portals4: retry PtlMEUnlink() if PTL_IN_USE
Этот коммит содержится в:
Howard Pritchard 2018-08-13 14:19:10 -06:00 коммит произвёл GitHub
родитель 7b6a2da71a 36369f9133
Коммит cdc315c1ac
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 62 добавлений и 27 удалений

Просмотреть файл

@@ -343,15 +343,38 @@ allreduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
static int
allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
{
int ret;
if (request->u.allreduce.is_optim) {
PtlAtomicSync();
if (request->u.allreduce.child_nb) {
PtlCTFree(request->u.allreduce.ack_ct_h);
ret = PtlCTFree(request->u.allreduce.ack_ct_h);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlCTFree failed: %d\n",
__FILE__, __LINE__, ret);
return OMPI_ERROR;
}
}
PtlMEUnlink(request->u.allreduce.data_me_h);
PtlCTFree(request->u.allreduce.trig_ct_h);
do {
ret = PtlMEUnlink(request->u.allreduce.data_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlMEUnlink failed: %d\n",
__FILE__, __LINE__, ret);
return OMPI_ERROR;
}
ret = PtlCTFree(request->u.allreduce.trig_ct_h);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlCTFree failed: %d\n",
__FILE__, __LINE__, ret);
return OMPI_ERROR;
}
}
return (OMPI_SUCCESS);

Просмотреть файл

@@ -206,7 +206,9 @@ barrier_hypercube_bottom(ompi_coll_portals4_request_t *request)
int ret;
/* cleanup */
ret = PtlMEUnlink(request->u.barrier.data_me_h);
do {
ret = PtlMEUnlink(request->u.barrier.data_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlMEUnlink failed: %d\n",

Просмотреть файл

@@ -285,7 +285,9 @@ portals4_close(void)
mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE;
if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) {
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
do {
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlMEUnlink failed: %d\n",
@@ -293,7 +295,9 @@ portals4_close(void)
}
}
if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) {
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
do {
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlMEUnlink failed: %d\n",

Просмотреть файл

@@ -460,7 +460,9 @@ cleanup_gather_handles(ompi_coll_portals4_request_t *request)
/**********************************/
/* Cleanup Gather Handles */
/**********************************/
ret = PtlMEUnlink(request->u.gather.gather_meh);
do {
ret = PtlMEUnlink(request->u.gather.gather_meh);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
ret = PtlCTFree(request->u.gather.gather_cth);
@@ -484,7 +486,9 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
/**********************************/
/* Cleanup Sync Handles */
/**********************************/
ret = PtlMEUnlink(request->u.gather.sync_meh);
do {
ret = PtlMEUnlink(request->u.gather.sync_meh);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
ret = PtlCTFree(request->u.gather.sync_cth);

Просмотреть файл

@@ -340,24 +340,38 @@ reduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
/*
 * Release the Portals handles referenced by request->u.reduce. All of the
 * teardown is skipped unless request->u.reduce.is_optim is set.
 *
 * Returns OMPI_SUCCESS when no checked call fails. When a checked Portals
 * call returns anything other than PTL_OK, control jumps to err_hdlr, the
 * failure is logged through opal_output, and OMPI_ERROR is returned.
 *
 * NOTE(review): this listing looks like a diff hunk whose +/- markers were
 * stripped, so every unchecked Portals call below sits directly above the
 * checked call that appears to replace it. Confirm against the real source
 * file which of the two lines is live before relying on this text.
 */
static int
reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
{
/* ret: status of the last Portals call; line: __LINE__ of the failing check. */
int ret, line;
if (request->u.reduce.is_optim) {
/* NOTE(review): presumably synchronizes earlier atomic operations with
 * host memory before the handles go away - confirm in the Portals 4 spec. */
PtlAtomicSync();
if (request->u.reduce.use_ack_ct_h) {
/* NOTE(review): unchecked call on the same handle as the checked call on
 * the next line; looks like the pre-change line of the diff. */
PtlCTFree(request->u.reduce.ack_ct_h);
ret = PtlCTFree(request->u.reduce.ack_ct_h);
/* ret is overwritten with OMPI_ERROR here, so the Portals status code
 * itself never reaches the log message in err_hdlr. */
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
}
if (request->u.reduce.child_nb) {
/* NOTE(review): unchecked, non-retried unlink; looks like the pre-change
 * line superseded by the retry loop below. */
PtlMEUnlink(request->u.reduce.data_me_h);
/* Keep retrying while PtlMEUnlink reports PTL_IN_USE. The loop has no
 * retry limit and no delay between attempts. */
do {
ret = PtlMEUnlink(request->u.reduce.data_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
}
/* NOTE(review): unchecked call on the same handle as the checked call on
 * the next line; looks like the pre-change line of the diff. */
PtlCTFree(request->u.reduce.trig_ct_h);
ret = PtlCTFree(request->u.reduce.trig_ct_h);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
/* free_buffer is released only on this success path; every goto err_hdlr
 * above returns without freeing it. */
if (request->u.reduce.free_buffer) {
free(request->u.reduce.free_buffer);
}
}
return (OMPI_SUCCESS);
err_hdlr:
/* Two line numbers are printed: __LINE__ of this opal_output call, then
 * `line`, which was recorded at the check that failed. */
opal_output(ompi_coll_base_framework.framework_output,
"%s:%4d:%4d\tError occurred ret=%d",
__FILE__, __LINE__, line, ret);
return ret;
}

Просмотреть файл

@@ -253,14 +253,8 @@ cleanup_scatter_handles(ompi_coll_portals4_request_t *request)
/**********************************/
do {
ret = PtlMEUnlink(request->u.scatter.scatter_meh);
if (PTL_IN_USE == ret) {
opal_output(ompi_coll_base_framework.framework_output,
"%s:%4d: scatter_meh still in use (ret=%d, rank %2d)",
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
continue;
}
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
} while (ret == PTL_IN_USE);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
ret = PtlCTFree(request->u.scatter.scatter_cth);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
@@ -292,14 +286,8 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
/**********************************/
do {
ret = PtlMEUnlink(request->u.scatter.sync_meh);
if (PTL_IN_USE == ret) {
opal_output(ompi_coll_base_framework.framework_output,
"%s:%4d: sync_meh still in use (ret=%d, rank %2d)",
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
continue;
}
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
} while (ret == PTL_IN_USE);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
ret = PtlCTFree(request->u.scatter.sync_cth);
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }