1
1

Merge pull request #5500 from tkordenbrock/topic/master/fix.PtlMEUnlink.in.use

coll-portals4: retry PtlMEUnlink() if PTL_IN_USE
Этот коммит содержится в:
Todd Kordenbrock 2018-08-07 11:21:00 -05:00 коммит произвёл GitHub
родитель c294bbc352 f3f2a826b4
Коммит e9f378e851
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 62 добавления и 27 удалений

Просмотреть файл

@@ -343,15 +343,38 @@ allreduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
static int static int
allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request) allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
{ {
int ret;
if (request->u.allreduce.is_optim) { if (request->u.allreduce.is_optim) {
PtlAtomicSync(); PtlAtomicSync();
if (request->u.allreduce.child_nb) { if (request->u.allreduce.child_nb) {
PtlCTFree(request->u.allreduce.ack_ct_h); ret = PtlCTFree(request->u.allreduce.ack_ct_h);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlCTFree failed: %d\n",
__FILE__, __LINE__, ret);
return OMPI_ERROR;
}
} }
PtlMEUnlink(request->u.allreduce.data_me_h); do {
PtlCTFree(request->u.allreduce.trig_ct_h); ret = PtlMEUnlink(request->u.allreduce.data_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlMEUnlink failed: %d\n",
__FILE__, __LINE__, ret);
return OMPI_ERROR;
}
ret = PtlCTFree(request->u.allreduce.trig_ct_h);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlCTFree failed: %d\n",
__FILE__, __LINE__, ret);
return OMPI_ERROR;
}
} }
return (OMPI_SUCCESS); return (OMPI_SUCCESS);

Просмотреть файл

@@ -206,7 +206,9 @@ barrier_hypercube_bottom(ompi_coll_portals4_request_t *request)
int ret; int ret;
/* cleanup */ /* cleanup */
ret = PtlMEUnlink(request->u.barrier.data_me_h); do {
ret = PtlMEUnlink(request->u.barrier.data_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output, opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlMEUnlink failed: %d\n", "%s:%d: PtlMEUnlink failed: %d\n",

Просмотреть файл

@@ -285,7 +285,9 @@ portals4_close(void)
mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE; mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE;
if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) { if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) {
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h); do {
ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output, opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlMEUnlink failed: %d\n", "%s:%d: PtlMEUnlink failed: %d\n",
@@ -293,7 +295,9 @@ portals4_close(void)
} }
} }
if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) { if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) {
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h); do {
ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { if (PTL_OK != ret) {
opal_output_verbose(1, ompi_coll_base_framework.framework_output, opal_output_verbose(1, ompi_coll_base_framework.framework_output,
"%s:%d: PtlMEUnlink failed: %d\n", "%s:%d: PtlMEUnlink failed: %d\n",

Просмотреть файл

@@ -460,7 +460,9 @@ cleanup_gather_handles(ompi_coll_portals4_request_t *request)
/**********************************/ /**********************************/
/* Cleanup Gather Handles */ /* Cleanup Gather Handles */
/**********************************/ /**********************************/
ret = PtlMEUnlink(request->u.gather.gather_meh); do {
ret = PtlMEUnlink(request->u.gather.gather_meh);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
ret = PtlCTFree(request->u.gather.gather_cth); ret = PtlCTFree(request->u.gather.gather_cth);
@@ -484,7 +486,9 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
/**********************************/ /**********************************/
/* Cleanup Sync Handles */ /* Cleanup Sync Handles */
/**********************************/ /**********************************/
ret = PtlMEUnlink(request->u.gather.sync_meh); do {
ret = PtlMEUnlink(request->u.gather.sync_meh);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
ret = PtlCTFree(request->u.gather.sync_cth); ret = PtlCTFree(request->u.gather.sync_cth);

Просмотреть файл

@@ -340,24 +340,38 @@ reduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
static int static int
reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request) reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
{ {
int ret, line;
if (request->u.reduce.is_optim) { if (request->u.reduce.is_optim) {
PtlAtomicSync(); PtlAtomicSync();
if (request->u.reduce.use_ack_ct_h) { if (request->u.reduce.use_ack_ct_h) {
PtlCTFree(request->u.reduce.ack_ct_h); ret = PtlCTFree(request->u.reduce.ack_ct_h);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
} }
if (request->u.reduce.child_nb) { if (request->u.reduce.child_nb) {
PtlMEUnlink(request->u.reduce.data_me_h); do {
ret = PtlMEUnlink(request->u.reduce.data_me_h);
} while (PTL_IN_USE == ret);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
} }
PtlCTFree(request->u.reduce.trig_ct_h); ret = PtlCTFree(request->u.reduce.trig_ct_h);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
if (request->u.reduce.free_buffer) { if (request->u.reduce.free_buffer) {
free(request->u.reduce.free_buffer); free(request->u.reduce.free_buffer);
} }
} }
return (OMPI_SUCCESS); return (OMPI_SUCCESS);
err_hdlr:
opal_output(ompi_coll_base_framework.framework_output,
"%s:%4d:%4d\tError occurred ret=%d",
__FILE__, __LINE__, line, ret);
return ret;
} }

Просмотреть файл

@@ -253,14 +253,8 @@ cleanup_scatter_handles(ompi_coll_portals4_request_t *request)
/**********************************/ /**********************************/
do { do {
ret = PtlMEUnlink(request->u.scatter.scatter_meh); ret = PtlMEUnlink(request->u.scatter.scatter_meh);
if (PTL_IN_USE == ret) { } while (PTL_IN_USE == ret);
opal_output(ompi_coll_base_framework.framework_output, if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
"%s:%4d: scatter_meh still in use (ret=%d, rank %2d)",
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
continue;
}
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
} while (ret == PTL_IN_USE);
ret = PtlCTFree(request->u.scatter.scatter_cth); ret = PtlCTFree(request->u.scatter.scatter_cth);
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
@@ -292,14 +286,8 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
/**********************************/ /**********************************/
do { do {
ret = PtlMEUnlink(request->u.scatter.sync_meh); ret = PtlMEUnlink(request->u.scatter.sync_meh);
if (PTL_IN_USE == ret) { } while (PTL_IN_USE == ret);
opal_output(ompi_coll_base_framework.framework_output, if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
"%s:%4d: sync_meh still in use (ret=%d, rank %2d)",
__FILE__, __LINE__, ret, request->u.scatter.my_rank);
continue;
}
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
} while (ret == PTL_IN_USE);
ret = PtlCTFree(request->u.scatter.sync_cth); ret = PtlCTFree(request->u.scatter.sync_cth);
if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }