Merge pull request #331 from miked-mellanox/topic/fix_mkey_recursion_master
fix infinite recursion during mkey exchange at scale
Этот коммит содержится в:
Коммит
0e4ce91f5f
@ -340,6 +340,7 @@ static inline int my_MPI_Test(ompi_request_t ** rptr,
|
|||||||
ompi_request_t *request = *rptr;
|
ompi_request_t *request = *rptr;
|
||||||
|
|
||||||
assert(request->req_persistent);
|
assert(request->req_persistent);
|
||||||
|
assert(request->req_state != OMPI_REQUEST_INACTIVE);
|
||||||
|
|
||||||
if (request->req_complete) {
|
if (request->req_complete) {
|
||||||
int old_error;
|
int old_error;
|
||||||
@ -371,7 +372,7 @@ static int oshmem_mkey_recv_cb(void)
|
|||||||
n = 0;
|
n = 0;
|
||||||
r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
|
r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
|
||||||
assert(r);
|
assert(r);
|
||||||
while (1) {
|
while(r != opal_list_get_end(&memheap_oob.req_list)) {
|
||||||
my_MPI_Test(&r->recv_req, &flag, &status);
|
my_MPI_Test(&r->recv_req, &flag, &status);
|
||||||
if (OPAL_LIKELY(0 == flag)) {
|
if (OPAL_LIKELY(0 == flag)) {
|
||||||
return n;
|
return n;
|
||||||
@ -400,6 +401,15 @@ static int oshmem_mkey_recv_cb(void)
|
|||||||
}
|
}
|
||||||
opal_dss.load(msg, (void*)tmp_buf, size);
|
opal_dss.load(msg, (void*)tmp_buf, size);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* send reply before posting the receive request again to limit the recursion size to
|
||||||
|
* number of receive requests.
|
||||||
|
* send can call opal_progress which calls this function again. If recv req is started
|
||||||
|
* stack size will be proportional to number of job ranks.
|
||||||
|
*/
|
||||||
|
do_recv(status.MPI_SOURCE, msg);
|
||||||
|
OBJ_RELEASE(msg);
|
||||||
|
|
||||||
rc = MPI_Start(&r->recv_req);
|
rc = MPI_Start(&r->recv_req);
|
||||||
if (MPI_SUCCESS != rc) {
|
if (MPI_SUCCESS != rc) {
|
||||||
MEMHEAP_ERROR("Failed to post recv request %d", rc);
|
MEMHEAP_ERROR("Failed to post recv request %d", rc);
|
||||||
@ -408,8 +418,6 @@ static int oshmem_mkey_recv_cb(void)
|
|||||||
}
|
}
|
||||||
opal_list_append(&memheap_oob.req_list, &r->super);
|
opal_list_append(&memheap_oob.req_list, &r->super);
|
||||||
|
|
||||||
do_recv(status.MPI_SOURCE, msg);
|
|
||||||
OBJ_RELEASE(msg);
|
|
||||||
|
|
||||||
r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
|
r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
|
||||||
assert(r);
|
assert(r);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user