1
1

coll/libnbc: Fix handling of completed request

Because a persistent request does not free its `schedule` object
when the communication completes, the `NBC_Progress` function cannot
determine the completion using `schedule`.

Without this change, a hang occurs when the `NBC_Progress` function
is called recursively through the `NBC_Start_round` function.

Signed-off-by: KAWASHIMA Takahiro <t-kawashima@jp.fujitsu.com>
Этот коммит содержится в:
KAWASHIMA Takahiro 2018-02-20 09:52:25 +09:00
родитель 8e5690bf5c
Коммит 5c5de3a4fb
2 изменённых файлов: 6 добавлений и 3 удалений

Просмотреть файл

@ -117,6 +117,7 @@ struct ompi_coll_libnbc_request_t {
ompi_request_t super;
MPI_Comm comm;
long row_offset;
bool nbc_complete; /* status in libnbc level */
int tag;
volatile int req_count;
ompi_request_t **req_array;

Просмотреть файл

@ -321,8 +321,7 @@ int NBC_Progress(NBC_Handle *handle) {
int i;
ompi_status_public_t status;
/* the handle is done if there is no schedule attached */
if (NULL == handle->schedule) {
if (handle->nbc_complete) {
return NBC_OK;
}
@ -388,8 +387,9 @@ int NBC_Progress(NBC_Handle *handle) {
/* this was the last round - we're done */
NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");
handle->nbc_complete = true;
if (!handle->super.req_persistent) {
NBC_Free(handle);
NBC_Free(handle);
}
return NBC_OK;
@ -706,6 +706,7 @@ int NBC_Schedule_request(NBC_Schedule *schedule, ompi_communicator_t *comm,
handle->comm = comm;
handle->schedule = NULL;
handle->row_offset = 0;
handle->nbc_complete = persistent ? true : false;
/******************** Do the tag and shadow comm administration ... ***************/
@ -811,6 +812,7 @@ int ompi_coll_libnbc_start(size_t count, ompi_request_t ** request) {
handle->super.req_complete = REQUEST_PENDING;
handle->super.req_state = OMPI_REQUEST_ACTIVE;
handle->nbc_complete = false;
res = NBC_Start_internal(handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_DEBUG(5, " ** bad result from NBC_Start_internal **\n");