ompi/oshmem/spml/ucx: defer cleanup of shmem_ctx to shmem_finalize
Signed-off-by: Tomislav Janjusic <tomislavj@mellanox.com>
Этот коммит содержится в:
родитель
48033ac1f4
Коммит
e1c1ab0202
@ -151,6 +151,10 @@ void opal_common_ucx_mca_proc_added(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence_nb(int *fenced)
|
||||||
|
{
|
||||||
|
return opal_pmix.fence_nb(NULL, 0, opal_common_ucx_mca_fence_complete_cb, (void *)fenced);
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
|
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
|
||||||
@ -182,9 +186,8 @@ static void opal_common_ucx_wait_all_requests(void **reqs, int count, ucp_worker
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
|
OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, size_t count,
|
||||||
size_t my_rank, size_t max_disconnect, ucp_worker_h worker)
|
size_t my_rank, size_t max_disconnect, ucp_worker_h worker) {
|
||||||
{
|
|
||||||
size_t num_reqs;
|
size_t num_reqs;
|
||||||
size_t max_reqs;
|
size_t max_reqs;
|
||||||
void *dreq, **dreqs;
|
void *dreq, **dreqs;
|
||||||
@ -232,10 +235,14 @@ OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, s
|
|||||||
opal_common_ucx_wait_all_requests(dreqs, num_reqs, worker);
|
opal_common_ucx_wait_all_requests(dreqs, num_reqs, worker);
|
||||||
free(dreqs);
|
free(dreqs);
|
||||||
|
|
||||||
if (OPAL_SUCCESS != (ret = opal_common_ucx_mca_pmix_fence(worker))) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
|
||||||
|
size_t my_rank, size_t max_disconnect, ucp_worker_h worker)
|
||||||
|
{
|
||||||
|
opal_common_ucx_del_procs_nofence(procs, count, my_rank, max_disconnect, worker);
|
||||||
|
|
||||||
|
return opal_common_ucx_mca_pmix_fence(worker);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -105,8 +105,11 @@ OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
|
|||||||
OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void);
|
OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void);
|
||||||
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
|
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
|
||||||
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
||||||
|
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence_nb(int *fenced);
|
||||||
OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
|
OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, size_t count,
|
||||||
size_t my_rank, size_t max_disconnect, ucp_worker_h worker);
|
size_t my_rank, size_t max_disconnect, ucp_worker_h worker);
|
||||||
|
OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, size_t count,
|
||||||
|
size_t my_rank, size_t max_disconnect, ucp_worker_h worker);
|
||||||
OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component);
|
OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component);
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
|
@ -654,30 +654,7 @@ void mca_spml_ucx_ctx_destroy(shmem_ctx_t ctx)
|
|||||||
mca_spml_ucx_ctx_list_item_t) {
|
mca_spml_ucx_ctx_list_item_t) {
|
||||||
if ((shmem_ctx_t)(&ctx_item->ctx) == ctx) {
|
if ((shmem_ctx_t)(&ctx_item->ctx) == ctx) {
|
||||||
opal_list_remove_item(&(mca_spml_ucx.ctx_list), &ctx_item->super);
|
opal_list_remove_item(&(mca_spml_ucx.ctx_list), &ctx_item->super);
|
||||||
|
opal_list_append(&(mca_spml_ucx.idle_ctx_list), &ctx_item->super);
|
||||||
opal_common_ucx_del_proc_t *del_procs;
|
|
||||||
del_procs = malloc(sizeof(*del_procs) * nprocs);
|
|
||||||
|
|
||||||
for (i = 0; i < nprocs; ++i) {
|
|
||||||
for (j = 0; j < MCA_MEMHEAP_SEG_COUNT; j++) {
|
|
||||||
if (ctx_item->ctx.ucp_peers[i].mkeys[j].key.rkey != NULL) {
|
|
||||||
ucp_rkey_destroy(ctx_item->ctx.ucp_peers[i].mkeys[j].key.rkey);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
del_procs[i].ep = ctx_item->ctx.ucp_peers[i].ucp_conn;
|
|
||||||
del_procs[i].vpid = i;
|
|
||||||
ctx_item->ctx.ucp_peers[i].ucp_conn = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(),
|
|
||||||
mca_spml_ucx.num_disconnect,
|
|
||||||
ctx_item->ctx.ucp_worker);
|
|
||||||
free(del_procs);
|
|
||||||
free(ctx_item->ctx.ucp_peers);
|
|
||||||
|
|
||||||
ucp_worker_destroy(ctx_item->ctx.ucp_worker);
|
|
||||||
OBJ_RELEASE(ctx_item);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -91,6 +91,7 @@ struct mca_spml_ucx {
|
|||||||
mca_spml_ucx_get_mkey_slow_fn_t get_mkey_slow;
|
mca_spml_ucx_get_mkey_slow_fn_t get_mkey_slow;
|
||||||
char **remote_addrs_tbl;
|
char **remote_addrs_tbl;
|
||||||
opal_list_t ctx_list;
|
opal_list_t ctx_list;
|
||||||
|
opal_list_t idle_ctx_list;
|
||||||
int priority; /* component priority */
|
int priority; /* component priority */
|
||||||
shmem_internal_mutex_t internal_mutex;
|
shmem_internal_mutex_t internal_mutex;
|
||||||
};
|
};
|
||||||
|
@ -176,6 +176,7 @@ static int spml_ucx_init(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&(mca_spml_ucx.ctx_list), opal_list_t);
|
OBJ_CONSTRUCT(&(mca_spml_ucx.ctx_list), opal_list_t);
|
||||||
|
OBJ_CONSTRUCT(&(mca_spml_ucx.idle_ctx_list), opal_list_t);
|
||||||
SHMEM_MUTEX_INIT(mca_spml_ucx.internal_mutex);
|
SHMEM_MUTEX_INIT(mca_spml_ucx.internal_mutex);
|
||||||
|
|
||||||
wkr_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;
|
wkr_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;
|
||||||
@ -224,10 +225,37 @@ mca_spml_ucx_component_init(int* priority,
|
|||||||
return &mca_spml_ucx.super;
|
return &mca_spml_ucx.super;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void _ctx_cleanup(mca_spml_ucx_ctx_list_item_t *ctx_item)
|
||||||
|
{
|
||||||
|
int i, j, nprocs = oshmem_num_procs();
|
||||||
|
opal_common_ucx_del_proc_t *del_procs;
|
||||||
|
|
||||||
|
del_procs = malloc(sizeof(*del_procs) * nprocs);
|
||||||
|
|
||||||
|
for (i = 0; i < nprocs; ++i) {
|
||||||
|
for (j = 0; j < MCA_MEMHEAP_SEG_COUNT; j++) {
|
||||||
|
if (ctx_item->ctx.ucp_peers[i].mkeys[j].key.rkey != NULL) {
|
||||||
|
ucp_rkey_destroy(ctx_item->ctx.ucp_peers[i].mkeys[j].key.rkey);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
del_procs[i].ep = ctx_item->ctx.ucp_peers[i].ucp_conn;
|
||||||
|
del_procs[i].vpid = i;
|
||||||
|
ctx_item->ctx.ucp_peers[i].ucp_conn = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(),
|
||||||
|
mca_spml_ucx.num_disconnect,
|
||||||
|
ctx_item->ctx.ucp_worker);
|
||||||
|
free(del_procs);
|
||||||
|
free(ctx_item->ctx.ucp_peers);
|
||||||
|
}
|
||||||
|
|
||||||
static int mca_spml_ucx_component_fini(void)
|
static int mca_spml_ucx_component_fini(void)
|
||||||
{
|
{
|
||||||
mca_spml_ucx_ctx_list_item_t *ctx_item, *next;
|
mca_spml_ucx_ctx_list_item_t *ctx_item, *next;
|
||||||
size_t i, j, nprocs = oshmem_num_procs();
|
int fenced = 0;
|
||||||
|
int ret = OSHMEM_SUCCESS;
|
||||||
|
|
||||||
opal_progress_unregister(spml_ucx_progress);
|
opal_progress_unregister(spml_ucx_progress);
|
||||||
|
|
||||||
@ -235,31 +263,43 @@ static int mca_spml_ucx_component_fini(void)
|
|||||||
return OSHMEM_SUCCESS; /* never selected.. return success.. */
|
return OSHMEM_SUCCESS; /* never selected.. return success.. */
|
||||||
|
|
||||||
/* delete context objects from list */
|
/* delete context objects from list */
|
||||||
|
OPAL_LIST_FOREACH_SAFE(ctx_item, next, &(mca_spml_ucx.idle_ctx_list),
|
||||||
|
mca_spml_ucx_ctx_list_item_t) {
|
||||||
|
_ctx_cleanup(ctx_item);
|
||||||
|
}
|
||||||
|
|
||||||
|
OPAL_LIST_FOREACH_SAFE(ctx_item, next, &(mca_spml_ucx.ctx_list),
|
||||||
|
mca_spml_ucx_ctx_list_item_t) {
|
||||||
|
_ctx_cleanup(ctx_item);
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = opal_common_ucx_mca_pmix_fence_nb(&fenced);
|
||||||
|
if (OPAL_SUCCESS != ret) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!fenced) {
|
||||||
|
OPAL_LIST_FOREACH_SAFE(ctx_item, next, &(mca_spml_ucx.ctx_list),
|
||||||
|
mca_spml_ucx_ctx_list_item_t) {
|
||||||
|
ucp_worker_progress(ctx_item->ctx.ucp_worker);
|
||||||
|
}
|
||||||
|
OPAL_LIST_FOREACH_SAFE(ctx_item, next, &(mca_spml_ucx.idle_ctx_list),
|
||||||
|
mca_spml_ucx_ctx_list_item_t) {
|
||||||
|
ucp_worker_progress(ctx_item->ctx.ucp_worker);
|
||||||
|
}
|
||||||
|
ucp_worker_progress(mca_spml_ucx_ctx_default.ucp_worker);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* delete all workers */
|
||||||
|
OPAL_LIST_FOREACH_SAFE(ctx_item, next, &(mca_spml_ucx.idle_ctx_list),
|
||||||
|
mca_spml_ucx_ctx_list_item_t) {
|
||||||
|
opal_list_remove_item(&(mca_spml_ucx.idle_ctx_list), &ctx_item->super);
|
||||||
|
ucp_worker_destroy(ctx_item->ctx.ucp_worker);
|
||||||
|
OBJ_RELEASE(ctx_item);
|
||||||
|
}
|
||||||
OPAL_LIST_FOREACH_SAFE(ctx_item, next, &(mca_spml_ucx.ctx_list),
|
OPAL_LIST_FOREACH_SAFE(ctx_item, next, &(mca_spml_ucx.ctx_list),
|
||||||
mca_spml_ucx_ctx_list_item_t) {
|
mca_spml_ucx_ctx_list_item_t) {
|
||||||
opal_list_remove_item(&(mca_spml_ucx.ctx_list), &ctx_item->super);
|
opal_list_remove_item(&(mca_spml_ucx.ctx_list), &ctx_item->super);
|
||||||
|
|
||||||
opal_common_ucx_del_proc_t *del_procs;
|
|
||||||
del_procs = malloc(sizeof(*del_procs) * nprocs);
|
|
||||||
|
|
||||||
for (i = 0; i < nprocs; ++i) {
|
|
||||||
for (j = 0; j < MCA_MEMHEAP_SEG_COUNT; j++) {
|
|
||||||
if (ctx_item->ctx.ucp_peers[i].mkeys[j].key.rkey != NULL) {
|
|
||||||
ucp_rkey_destroy(ctx_item->ctx.ucp_peers[i].mkeys[j].key.rkey);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
del_procs[i].ep = ctx_item->ctx.ucp_peers[i].ucp_conn;
|
|
||||||
del_procs[i].vpid = i;
|
|
||||||
ctx_item->ctx.ucp_peers[i].ucp_conn = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(),
|
|
||||||
mca_spml_ucx.num_disconnect,
|
|
||||||
ctx_item->ctx.ucp_worker);
|
|
||||||
free(del_procs);
|
|
||||||
free(ctx_item->ctx.ucp_peers);
|
|
||||||
|
|
||||||
ucp_worker_destroy(ctx_item->ctx.ucp_worker);
|
ucp_worker_destroy(ctx_item->ctx.ucp_worker);
|
||||||
OBJ_RELEASE(ctx_item);
|
OBJ_RELEASE(ctx_item);
|
||||||
}
|
}
|
||||||
@ -271,6 +311,7 @@ static int mca_spml_ucx_component_fini(void)
|
|||||||
mca_spml_ucx.enabled = false; /* not anymore */
|
mca_spml_ucx.enabled = false; /* not anymore */
|
||||||
|
|
||||||
OBJ_DESTRUCT(&(mca_spml_ucx.ctx_list));
|
OBJ_DESTRUCT(&(mca_spml_ucx.ctx_list));
|
||||||
|
OBJ_DESTRUCT(&(mca_spml_ucx.idle_ctx_list));
|
||||||
SHMEM_MUTEX_DESTROY(mca_spml_ucx.internal_mutex);
|
SHMEM_MUTEX_DESTROY(mca_spml_ucx.internal_mutex);
|
||||||
|
|
||||||
if (mca_spml_ucx.ucp_context) {
|
if (mca_spml_ucx.ucp_context) {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user