2016-11-22 15:03:20 -08:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2015 Mellanox Technologies, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
#define _GNU_SOURCE
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
|
|
|
|
#include "oshmem_config.h"
|
|
|
|
#include "orte/util/show_help.h"
|
|
|
|
#include "shmem.h"
|
|
|
|
#include "oshmem/runtime/params.h"
|
|
|
|
#include "oshmem/mca/spml/spml.h"
|
|
|
|
#include "oshmem/mca/spml/base/base.h"
|
|
|
|
#include "spml_ucx_component.h"
|
|
|
|
#include "oshmem/mca/spml/ucx/spml_ucx.h"
|
|
|
|
|
|
|
|
#include "orte/util/show_help.h"
|
|
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
|
|
|
|
static int mca_spml_ucx_component_register(void);
|
|
|
|
static int mca_spml_ucx_component_open(void);
|
|
|
|
static int mca_spml_ucx_component_close(void);
|
|
|
|
static mca_spml_base_module_t*
|
|
|
|
mca_spml_ucx_component_init(int* priority,
|
|
|
|
bool enable_progress_threads,
|
|
|
|
bool enable_mpi_threads);
|
|
|
|
static int mca_spml_ucx_component_fini(void);
|
|
|
|
mca_spml_base_component_2_0_0_t mca_spml_ucx_component = {
|
|
|
|
|
|
|
|
/* First, the mca_base_component_t struct containing meta
|
|
|
|
information about the component itself */
|
|
|
|
|
2018-07-25 14:52:45 +03:00
|
|
|
.spmlm_version = {
|
|
|
|
MCA_SPML_BASE_VERSION_2_0_0,
|
|
|
|
|
|
|
|
.mca_component_name = "ucx",
|
|
|
|
.mca_component_major_version = OSHMEM_MAJOR_VERSION,
|
|
|
|
.mca_component_minor_version = OSHMEM_MINOR_VERSION,
|
|
|
|
.mca_component_release_version = OSHMEM_RELEASE_VERSION,
|
|
|
|
.mca_open_component = mca_spml_ucx_component_open,
|
|
|
|
.mca_close_component = mca_spml_ucx_component_close,
|
|
|
|
.mca_query_component = NULL,
|
|
|
|
.mca_register_component_params = mca_spml_ucx_component_register
|
2016-11-22 15:03:20 -08:00
|
|
|
},
|
2018-07-25 14:52:45 +03:00
|
|
|
.spmlm_data = {
|
2016-11-22 15:03:20 -08:00
|
|
|
/* The component is checkpoint ready */
|
2018-07-25 14:52:45 +03:00
|
|
|
.param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
|
2016-11-22 15:03:20 -08:00
|
|
|
},
|
|
|
|
|
2018-07-25 14:52:45 +03:00
|
|
|
.spmlm_init = mca_spml_ucx_component_init,
|
|
|
|
.spmlm_finalize = mca_spml_ucx_component_fini
|
2016-11-22 15:03:20 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
static inline void mca_spml_ucx_param_register_int(const char* param_name,
|
|
|
|
int default_value,
|
|
|
|
const char *help_msg,
|
|
|
|
int *storage)
|
|
|
|
{
|
|
|
|
*storage = default_value;
|
|
|
|
(void) mca_base_component_var_register(&mca_spml_ucx_component.spmlm_version,
|
|
|
|
param_name,
|
|
|
|
help_msg,
|
|
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
|
|
|
OPAL_INFO_LVL_9,
|
|
|
|
MCA_BASE_VAR_SCOPE_READONLY,
|
|
|
|
storage);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mca_spml_ucx_param_register_string(const char* param_name,
|
|
|
|
char* default_value,
|
|
|
|
const char *help_msg,
|
|
|
|
char **storage)
|
|
|
|
{
|
|
|
|
*storage = default_value;
|
|
|
|
(void) mca_base_component_var_register(&mca_spml_ucx_component.spmlm_version,
|
|
|
|
param_name,
|
|
|
|
help_msg,
|
|
|
|
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
|
|
|
OPAL_INFO_LVL_9,
|
|
|
|
MCA_BASE_VAR_SCOPE_READONLY,
|
|
|
|
storage);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int mca_spml_ucx_component_register(void)
|
|
|
|
{
|
2018-01-17 15:08:04 +02:00
|
|
|
mca_spml_ucx_param_register_int("priority", 21,
|
2016-11-22 15:03:20 -08:00
|
|
|
"[integer] ucx priority",
|
|
|
|
&mca_spml_ucx.priority);
|
|
|
|
|
|
|
|
mca_spml_ucx_param_register_int("num_disconnect", 1,
|
|
|
|
"How may disconnects go in parallel",
|
|
|
|
&mca_spml_ucx.num_disconnect);
|
|
|
|
|
2016-11-24 20:07:51 +02:00
|
|
|
mca_spml_ucx_param_register_int("heap_reg_nb", 0,
|
|
|
|
"Use non-blocking memory registration for shared heap",
|
|
|
|
&mca_spml_ucx.heap_reg_nb);
|
|
|
|
|
2018-08-21 15:05:12 +03:00
|
|
|
opal_common_ucx_mca_var_register(&mca_spml_ucx_component.spmlm_version);
|
|
|
|
|
2016-11-22 15:03:20 -08:00
|
|
|
return OSHMEM_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2019-03-13 04:39:26 +02:00
|
|
|
int spml_ucx_ctx_progress(void)
|
2016-11-22 15:03:20 -08:00
|
|
|
{
|
2019-03-13 04:39:26 +02:00
|
|
|
int i;
|
|
|
|
for (i = 0; i < mca_spml_ucx.active_array.ctxs_count; i++) {
|
|
|
|
ucp_worker_progress(mca_spml_ucx.active_array.ctxs[i]->ucp_worker);
|
2019-01-03 18:20:30 -08:00
|
|
|
}
|
2019-03-13 04:39:26 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int spml_ucx_default_progress(void)
|
|
|
|
{
|
|
|
|
ucp_worker_progress(mca_spml_ucx_ctx_default.ucp_worker);
|
2016-11-22 15:03:20 -08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int mca_spml_ucx_component_open(void)
|
2018-07-13 10:08:54 -07:00
|
|
|
{
|
|
|
|
return OSHMEM_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int mca_spml_ucx_component_close(void)
|
|
|
|
{
|
|
|
|
return OSHMEM_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int spml_ucx_init(void)
|
2016-11-22 15:03:20 -08:00
|
|
|
{
|
|
|
|
ucs_status_t err;
|
|
|
|
ucp_config_t *ucp_config;
|
|
|
|
ucp_params_t params;
|
2018-07-13 10:08:54 -07:00
|
|
|
ucp_context_attr_t attr;
|
|
|
|
ucp_worker_params_t wkr_params;
|
|
|
|
ucp_worker_attr_t wkr_attr;
|
2016-11-22 15:03:20 -08:00
|
|
|
|
|
|
|
err = ucp_config_read("OSHMEM", NULL, &ucp_config);
|
|
|
|
if (UCS_OK != err) {
|
|
|
|
return OSHMEM_ERROR;
|
|
|
|
}
|
|
|
|
|
2018-07-05 15:04:37 +03:00
|
|
|
opal_common_ucx_mca_register();
|
|
|
|
|
2016-11-22 15:03:20 -08:00
|
|
|
memset(¶ms, 0, sizeof(params));
|
2018-07-13 10:08:54 -07:00
|
|
|
params.field_mask = UCP_PARAM_FIELD_FEATURES|UCP_PARAM_FIELD_ESTIMATED_NUM_EPS|UCP_PARAM_FIELD_MT_WORKERS_SHARED;
|
2016-12-19 18:56:26 +02:00
|
|
|
params.features = UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64;
|
2017-09-04 14:46:00 +03:00
|
|
|
params.estimated_num_eps = ompi_proc_world_size();
|
2018-07-13 10:08:54 -07:00
|
|
|
if (oshmem_mpi_thread_requested == SHMEM_THREAD_MULTIPLE) {
|
|
|
|
params.mt_workers_shared = 1;
|
|
|
|
} else {
|
|
|
|
params.mt_workers_shared = 0;
|
|
|
|
}
|
2016-11-22 15:03:20 -08:00
|
|
|
|
|
|
|
err = ucp_init(¶ms, ucp_config, &mca_spml_ucx.ucp_context);
|
|
|
|
ucp_config_release(ucp_config);
|
|
|
|
if (UCS_OK != err) {
|
|
|
|
return OSHMEM_ERROR;
|
|
|
|
}
|
|
|
|
|
2018-07-13 10:08:54 -07:00
|
|
|
attr.field_mask = UCP_ATTR_FIELD_THREAD_MODE;
|
|
|
|
err = ucp_context_query(mca_spml_ucx.ucp_context, &attr);
|
|
|
|
if (err != UCS_OK) {
|
|
|
|
return OSHMEM_ERROR;
|
|
|
|
}
|
2016-11-22 15:03:20 -08:00
|
|
|
|
2018-07-13 10:08:54 -07:00
|
|
|
if (oshmem_mpi_thread_requested == SHMEM_THREAD_MULTIPLE &&
|
|
|
|
attr.thread_mode != UCS_THREAD_MODE_MULTI) {
|
|
|
|
oshmem_mpi_thread_provided = SHMEM_THREAD_SINGLE;
|
2016-11-22 15:03:20 -08:00
|
|
|
}
|
|
|
|
|
2019-03-13 04:39:26 +02:00
|
|
|
mca_spml_ucx.active_array.ctxs_count = mca_spml_ucx.idle_array.ctxs_count = 0;
|
|
|
|
mca_spml_ucx.active_array.ctxs_num = mca_spml_ucx.idle_array.ctxs_num = MCA_SPML_UCX_CTXS_ARRAY_SIZE;
|
|
|
|
mca_spml_ucx.active_array.ctxs = calloc(mca_spml_ucx.active_array.ctxs_num,
|
|
|
|
sizeof(mca_spml_ucx_ctx_t *));
|
|
|
|
mca_spml_ucx.idle_array.ctxs = calloc(mca_spml_ucx.idle_array.ctxs_num,
|
|
|
|
sizeof(mca_spml_ucx_ctx_t *));
|
|
|
|
|
2018-07-13 10:08:54 -07:00
|
|
|
SHMEM_MUTEX_INIT(mca_spml_ucx.internal_mutex);
|
2016-11-22 15:03:20 -08:00
|
|
|
|
2018-07-13 10:08:54 -07:00
|
|
|
wkr_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;
|
|
|
|
if (oshmem_mpi_thread_requested == SHMEM_THREAD_MULTIPLE) {
|
|
|
|
wkr_params.thread_mode = UCS_THREAD_MODE_MULTI;
|
|
|
|
} else {
|
|
|
|
wkr_params.thread_mode = UCS_THREAD_MODE_SINGLE;
|
|
|
|
}
|
2016-12-07 10:51:18 +02:00
|
|
|
|
2018-07-13 10:08:54 -07:00
|
|
|
err = ucp_worker_create(mca_spml_ucx.ucp_context, &wkr_params,
|
|
|
|
&mca_spml_ucx_ctx_default.ucp_worker);
|
2016-11-22 15:03:20 -08:00
|
|
|
if (UCS_OK != err) {
|
|
|
|
return OSHMEM_ERROR;
|
|
|
|
}
|
|
|
|
|
2018-07-13 10:08:54 -07:00
|
|
|
wkr_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE;
|
|
|
|
err = ucp_worker_query(mca_spml_ucx_ctx_default.ucp_worker, &wkr_attr);
|
|
|
|
|
|
|
|
if (oshmem_mpi_thread_requested == SHMEM_THREAD_MULTIPLE &&
|
|
|
|
wkr_attr.thread_mode != UCS_THREAD_MODE_MULTI) {
|
|
|
|
oshmem_mpi_thread_provided = SHMEM_THREAD_SINGLE;
|
|
|
|
}
|
|
|
|
|
|
|
|
oshmem_ctx_default = (shmem_ctx_t) &mca_spml_ucx_ctx_default;
|
|
|
|
|
2016-11-22 15:03:20 -08:00
|
|
|
return OSHMEM_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static mca_spml_base_module_t*
|
|
|
|
mca_spml_ucx_component_init(int* priority,
|
|
|
|
bool enable_progress_threads,
|
|
|
|
bool enable_mpi_threads)
|
|
|
|
{
|
2018-07-05 15:04:37 +03:00
|
|
|
SPML_UCX_VERBOSE( 10, "in ucx, my priority is %d\n", mca_spml_ucx.priority);
|
2016-11-22 15:03:20 -08:00
|
|
|
|
|
|
|
if ((*priority) > mca_spml_ucx.priority) {
|
|
|
|
*priority = mca_spml_ucx.priority;
|
|
|
|
return NULL ;
|
|
|
|
}
|
|
|
|
*priority = mca_spml_ucx.priority;
|
|
|
|
|
|
|
|
if (OSHMEM_SUCCESS != spml_ucx_init())
|
|
|
|
return NULL ;
|
|
|
|
|
2018-07-05 15:04:37 +03:00
|
|
|
SPML_UCX_VERBOSE(50, "*** ucx initialized ****");
|
2016-11-22 15:03:20 -08:00
|
|
|
return &mca_spml_ucx.super;
|
|
|
|
}
|
|
|
|
|
2019-03-13 04:39:26 +02:00
|
|
|
/*
 * Release the per-peer resources of one context.
 *
 * For every peer (0 .. oshmem_num_procs()-1) all non-NULL remote keys are
 * destroyed and the endpoint is collected into a temporary list that is
 * handed to opal_common_ucx_del_procs_nofence(); finally the peer table
 * itself is freed. The worker and the ctx object are NOT released here.
 *
 * If the temporary endpoint list cannot be allocated, the rkeys and the
 * peer table are still released but the explicit endpoint disconnect is
 * skipped (previously this case dereferenced a NULL pointer).
 * NOTE(review): the endpoints are then presumably reclaimed when the
 * worker is destroyed - confirm against the UCX documentation.
 */
static void _ctx_cleanup(mca_spml_ucx_ctx_t *ctx)
{
    int i, j, nprocs = oshmem_num_procs();
    opal_common_ucx_del_proc_t *del_procs;

    del_procs = malloc(sizeof(*del_procs) * nprocs);

    for (i = 0; i < nprocs; ++i) {
        for (j = 0; j < MCA_MEMHEAP_SEG_COUNT; j++) {
            if (ctx->ucp_peers[i].mkeys[j].key.rkey != NULL) {
                ucp_rkey_destroy(ctx->ucp_peers[i].mkeys[j].key.rkey);
            }
        }

        if (NULL != del_procs) {
            del_procs[i].ep   = ctx->ucp_peers[i].ucp_conn;
            del_procs[i].vpid = i;
        }
        ctx->ucp_peers[i].ucp_conn = NULL;
    }

    if (NULL != del_procs) {
        opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(),
                                          mca_spml_ucx.num_disconnect,
                                          ctx->ucp_worker);
        free(del_procs);
    } else if (nprocs > 0) {
        SPML_UCX_VERBOSE(1, "failed to allocate endpoint list, skipping disconnect");
    }

    free(ctx->ucp_peers);
    ctx->ucp_peers = NULL;   /* do not leave a dangling pointer in ctx */
}
|
|
|
|
|
2016-11-22 15:03:20 -08:00
|
|
|
static int mca_spml_ucx_component_fini(void)
|
|
|
|
{
|
2019-03-13 04:39:26 +02:00
|
|
|
int fenced = 0, i;
|
2019-03-08 06:19:39 +02:00
|
|
|
int ret = OSHMEM_SUCCESS;
|
2019-03-07 04:15:08 +02:00
|
|
|
|
2019-03-13 04:39:26 +02:00
|
|
|
opal_progress_unregister(spml_ucx_default_progress);
|
|
|
|
if (mca_spml_ucx.active_array.ctxs_count) {
|
|
|
|
opal_progress_unregister(spml_ucx_ctx_progress);
|
|
|
|
}
|
2019-03-07 04:15:08 +02:00
|
|
|
|
|
|
|
if(!mca_spml_ucx.enabled)
|
|
|
|
return OSHMEM_SUCCESS; /* never selected.. return success.. */
|
|
|
|
|
|
|
|
/* delete context objects from list */
|
2019-03-13 04:39:26 +02:00
|
|
|
for (i = 0; i < mca_spml_ucx.active_array.ctxs_count; i++) {
|
|
|
|
_ctx_cleanup(mca_spml_ucx.active_array.ctxs[i]);
|
2019-03-08 06:19:39 +02:00
|
|
|
}
|
2019-03-07 04:15:08 +02:00
|
|
|
|
2019-03-13 04:39:26 +02:00
|
|
|
for (i = 0; i < mca_spml_ucx.idle_array.ctxs_count; i++) {
|
|
|
|
_ctx_cleanup(mca_spml_ucx.idle_array.ctxs[i]);
|
2019-03-08 06:19:39 +02:00
|
|
|
}
|
2019-03-07 04:15:08 +02:00
|
|
|
|
2019-03-08 06:19:39 +02:00
|
|
|
ret = opal_common_ucx_mca_pmix_fence_nb(&fenced);
|
|
|
|
if (OPAL_SUCCESS != ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
2019-03-07 04:15:08 +02:00
|
|
|
|
2019-03-08 06:19:39 +02:00
|
|
|
while (!fenced) {
|
2019-03-13 04:39:26 +02:00
|
|
|
for (i = 0; i < mca_spml_ucx.active_array.ctxs_count; i++) {
|
|
|
|
ucp_worker_progress(mca_spml_ucx.active_array.ctxs[i]->ucp_worker);
|
2019-03-07 04:15:08 +02:00
|
|
|
}
|
2019-03-13 04:39:26 +02:00
|
|
|
|
|
|
|
for (i = 0; i < mca_spml_ucx.idle_array.ctxs_count; i++) {
|
|
|
|
ucp_worker_progress(mca_spml_ucx.idle_array.ctxs[i]->ucp_worker);
|
2019-03-08 06:19:39 +02:00
|
|
|
}
|
2019-03-13 04:39:26 +02:00
|
|
|
|
2019-03-08 06:19:39 +02:00
|
|
|
ucp_worker_progress(mca_spml_ucx_ctx_default.ucp_worker);
|
|
|
|
}
|
2019-03-07 04:15:08 +02:00
|
|
|
|
2019-03-08 06:19:39 +02:00
|
|
|
/* delete all workers */
|
2019-03-13 04:39:26 +02:00
|
|
|
for (i = 0; i < mca_spml_ucx.active_array.ctxs_count; i++) {
|
|
|
|
ucp_worker_destroy(mca_spml_ucx.active_array.ctxs[i]->ucp_worker);
|
|
|
|
free(mca_spml_ucx.active_array.ctxs[i]);
|
2019-03-08 06:19:39 +02:00
|
|
|
}
|
2019-03-13 04:39:26 +02:00
|
|
|
|
|
|
|
for (i = 0; i < mca_spml_ucx.idle_array.ctxs_count; i++) {
|
|
|
|
ucp_worker_destroy(mca_spml_ucx.idle_array.ctxs[i]->ucp_worker);
|
|
|
|
free(mca_spml_ucx.idle_array.ctxs[i]);
|
2019-03-07 04:15:08 +02:00
|
|
|
}
|
|
|
|
|
2018-07-13 10:08:54 -07:00
|
|
|
if (mca_spml_ucx_ctx_default.ucp_worker) {
|
|
|
|
ucp_worker_destroy(mca_spml_ucx_ctx_default.ucp_worker);
|
2016-11-22 15:03:20 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
mca_spml_ucx.enabled = false; /* not anymore */
|
2018-07-13 10:08:54 -07:00
|
|
|
|
2019-03-13 04:39:26 +02:00
|
|
|
free(mca_spml_ucx.active_array.ctxs);
|
|
|
|
free(mca_spml_ucx.idle_array.ctxs);
|
|
|
|
|
2018-07-13 10:08:54 -07:00
|
|
|
SHMEM_MUTEX_DESTROY(mca_spml_ucx.internal_mutex);
|
|
|
|
|
|
|
|
if (mca_spml_ucx.ucp_context) {
|
|
|
|
ucp_cleanup(mca_spml_ucx.ucp_context);
|
|
|
|
mca_spml_ucx.ucp_context = NULL;
|
|
|
|
}
|
|
|
|
|
2016-11-22 15:03:20 -08:00
|
|
|
return OSHMEM_SUCCESS;
|
|
|
|
}
|
|
|
|
|