1
1

Merge pull request #5402 from hoopoepg/topic/common-del-procs

MCA/COMMON/UCX: del_procs calls are unified to common module
Этот коммит содержится в:
Yossi Itigin 2018-07-19 11:19:45 +03:00 коммит произвёл GitHub
родитель b6b9552ca9 920cc2e0d9
Коммит 29812494f2
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 103 добавлений и 112 удалений

Просмотреть файл

@ -368,74 +368,32 @@ static inline ucp_ep_h mca_pml_ucx_get_ep(ompi_communicator_t *comm, int rank)
return NULL; return NULL;
} }
static void mca_pml_ucx_waitall(void **reqs, int *count_p)
{
int i;
PML_UCX_VERBOSE(2, "waiting for %d disconnect requests", *count_p);
for (i = 0; i < *count_p; ++i) {
opal_common_ucx_wait_request(reqs[i], ompi_pml_ucx.ucp_worker, "ucp_disconnect_nb");
reqs[i] = NULL;
}
*count_p = 0;
}
int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
{ {
ompi_proc_t *proc; ompi_proc_t *proc;
int num_reqs; opal_common_ucx_del_proc_t *del_procs;
size_t max_reqs;
void *dreq, **dreqs;
ucp_ep_h ep;
size_t i; size_t i;
int ret;
max_reqs = ompi_pml_ucx.num_disconnect; del_procs = malloc(sizeof(*del_procs) * nprocs);
if (max_reqs > nprocs) { if (del_procs == NULL) {
max_reqs = nprocs;
}
dreqs = malloc(sizeof(*dreqs) * max_reqs);
if (dreqs == NULL) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
num_reqs = 0;
for (i = 0; i < nprocs; ++i) { for (i = 0; i < nprocs; ++i) {
proc = procs[(i + OMPI_PROC_MY_NAME->vpid) % nprocs]; proc = procs[i];
ep = proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; del_procs[i].ep = proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML];
if (ep == NULL) { del_procs[i].vpid = proc->super.proc_name.vpid;
continue;
}
/* mark peer as disconnected */
proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL; proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL;
PML_UCX_VERBOSE(2, "disconnecting from rank %d", proc->super.proc_name.vpid);
dreq = ucp_disconnect_nb(ep);
if (dreq != NULL) {
if (UCS_PTR_IS_ERR(dreq)) {
PML_UCX_ERROR("ucp_disconnect_nb(%d) failed: %s",
proc->super.proc_name.vpid,
ucs_status_string(UCS_PTR_STATUS(dreq)));
continue;
} else {
dreqs[num_reqs++] = dreq;
if (num_reqs >= ompi_pml_ucx.num_disconnect) {
mca_pml_ucx_waitall(dreqs, &num_reqs);
}
}
}
} }
/* num_reqs == 0 is processed by mca_pml_ucx_waitall routine,
* so suppress coverity warning */
/* coverity[uninit_use_in_call] */
mca_pml_ucx_waitall(dreqs, &num_reqs);
free(dreqs);
opal_common_ucx_mca_pmix_fence(ompi_pml_ucx.ucp_worker); ret = opal_common_ucx_del_procs(del_procs, nprocs, OMPI_PROC_MY_NAME->vpid,
ompi_pml_ucx.num_disconnect, ompi_pml_ucx.ucp_worker);
free(del_procs);
return OMPI_SUCCESS; return ret;
} }
int mca_pml_ucx_enable(bool enable) int mca_pml_ucx_enable(bool enable)

Просмотреть файл

@ -107,3 +107,69 @@ OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
} }
} }
static void opal_common_ucx_wait_all_requests(void **reqs, int count, ucp_worker_h worker)
{
int i;
MCA_COMMON_UCX_VERBOSE(2, "waiting for %d disconnect requests", count);
for (i = 0; i < count; ++i) {
opal_common_ucx_wait_request(reqs[i], worker, "ucp_disconnect_nb");
reqs[i] = NULL;
}
}
OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
size_t my_rank, size_t max_disconnect, ucp_worker_h worker)
{
size_t num_reqs;
size_t max_reqs;
void *dreq, **dreqs;
size_t i;
size_t n;
MCA_COMMON_UCX_ASSERT(procs || !count);
MCA_COMMON_UCX_ASSERT(max_disconnect > 0);
max_reqs = (max_disconnect > count) ? count : max_disconnect;
dreqs = malloc(sizeof(*dreqs) * max_reqs);
if (dreqs == NULL) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
num_reqs = 0;
for (i = 0; i < count; ++i) {
n = (i + my_rank) % count;
if (procs[n].ep == NULL) {
continue;
}
MCA_COMMON_UCX_VERBOSE(2, "disconnecting from rank %zu", procs[n].vpid);
dreq = ucp_disconnect_nb(procs[n].ep);
if (dreq != NULL) {
if (UCS_PTR_IS_ERR(dreq)) {
MCA_COMMON_UCX_ERROR("ucp_disconnect_nb(%zu) failed: %s", procs[n].vpid,
ucs_status_string(UCS_PTR_STATUS(dreq)));
continue;
} else {
dreqs[num_reqs++] = dreq;
if (num_reqs >= max_disconnect) {
opal_common_ucx_wait_all_requests(dreqs, num_reqs, worker);
num_reqs = 0;
}
}
}
}
/* num_reqs == 0 is processed by opal_common_ucx_wait_all_requests routine,
* so suppress coverity warning */
/* coverity[uninit_use_in_call] */
opal_common_ucx_wait_all_requests(dreqs, num_reqs, worker);
free(dreqs);
opal_common_ucx_mca_pmix_fence(worker);
return OPAL_SUCCESS;
}

Просмотреть файл

@ -60,12 +60,19 @@ typedef struct opal_common_ucx_module {
bool opal_mem_hooks; bool opal_mem_hooks;
} opal_common_ucx_module_t; } opal_common_ucx_module_t;
typedef struct opal_common_ucx_del_proc {
ucp_ep_h ep;
size_t vpid;
} opal_common_ucx_del_proc_t;
extern opal_common_ucx_module_t opal_common_ucx; extern opal_common_ucx_module_t opal_common_ucx;
OPAL_DECLSPEC void opal_common_ucx_mca_register(void); OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status); OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker); OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
size_t my_rank, size_t max_disconnect, ucp_worker_h worker);
static inline static inline
int opal_common_ucx_wait_request(ucs_status_ptr_t request, ucp_worker_h worker, int opal_common_ucx_wait_request(ucs_status_ptr_t request, ucp_worker_h worker,

Просмотреть файл

@ -103,28 +103,11 @@ int mca_spml_ucx_enable(bool enable)
return OSHMEM_SUCCESS; return OSHMEM_SUCCESS;
} }
static void mca_spml_ucx_waitall(void **reqs, int *count_p)
{
int i;
SPML_UCX_VERBOSE(10, "waiting for %d disconnect requests", *count_p);
for (i = 0; i < *count_p; ++i) {
opal_common_ucx_wait_request(reqs[i], mca_spml_ucx_ctx_default.ucp_worker, "ucp_disconnect_nb");
reqs[i] = NULL;
}
*count_p = 0;
}
int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
{ {
int my_rank = oshmem_my_proc_id(); opal_common_ucx_del_proc_t *del_procs;
int num_reqs; size_t i;
size_t max_reqs; int ret;
void *dreq, **dreqs;
ucp_ep_h ep;
size_t i, n;
oshmem_shmem_barrier(); oshmem_shmem_barrier();
@ -132,53 +115,30 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
return OSHMEM_SUCCESS; return OSHMEM_SUCCESS;
} }
max_reqs = mca_spml_ucx.num_disconnect; del_procs = malloc(sizeof(*del_procs) * nprocs);
if (max_reqs > nprocs) { if (del_procs == NULL) {
max_reqs = nprocs;
}
dreqs = malloc(sizeof(*dreqs) * max_reqs);
if (dreqs == NULL) {
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
num_reqs = 0;
for (i = 0; i < nprocs; ++i) { for (i = 0; i < nprocs; ++i) {
n = (i + my_rank) % nprocs; del_procs[i].ep = mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn;
ep = mca_spml_ucx_ctx_default.ucp_peers[n].ucp_conn; del_procs[i].vpid = i;
if (ep == NULL) {
continue;
}
mca_spml_ucx_ctx_default.ucp_peers[n].ucp_conn = NULL; /* mark peer as disconnected */
mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn = NULL;
SPML_UCX_VERBOSE(10, "disconnecting from peer %zu", n);
dreq = ucp_disconnect_nb(ep);
if (dreq != NULL) {
if (UCS_PTR_IS_ERR(dreq)) {
SPML_UCX_ERROR("ucp_disconnect_nb(%zu) failed: %s", n,
ucs_status_string(UCS_PTR_STATUS(dreq)));
continue;
} else {
dreqs[num_reqs++] = dreq;
if (num_reqs >= mca_spml_ucx.num_disconnect) {
mca_spml_ucx_waitall(dreqs, &num_reqs);
}
}
}
} }
/* num_reqs == 0 is processed by mca_pml_ucx_waitall routine,
* so suppress coverity warning */
/* coverity[uninit_use_in_call] */
mca_spml_ucx_waitall(dreqs, &num_reqs);
free(dreqs);
free(mca_spml_ucx.remote_addrs_tbl);
opal_common_ucx_mca_pmix_fence(mca_spml_ucx_ctx_default.ucp_worker); ret = opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(),
mca_spml_ucx.num_disconnect,
mca_spml_ucx_ctx_default.ucp_worker);
free(del_procs);
free(mca_spml_ucx.remote_addrs_tbl);
free(mca_spml_ucx_ctx_default.ucp_peers); free(mca_spml_ucx_ctx_default.ucp_peers);
mca_spml_ucx_ctx_default.ucp_peers = NULL; mca_spml_ucx_ctx_default.ucp_peers = NULL;
return OSHMEM_SUCCESS;
return ret;
} }
/* TODO: move func into common place, use it with rkey exchng too */ /* TODO: move func into common place, use it with rkey exchng too */