diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c index 2395572d30..4f238feec8 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c @@ -89,7 +89,7 @@ int mca_atomic_mxm_cswap(void *target, /* mxm request init */ sreq.base.state = MXM_REQ_NEW; sreq.base.mq = mca_spml_self->mxm_mq; - sreq.base.conn = mca_spml_self->mxm_peers[pe]->mxm_conn; + sreq.base.conn = mca_spml_self->mxm_peers[pe]->mxm_hw_rdma_conn; sreq.base.completed_cb = NULL; sreq.base.data_type = MXM_REQ_DATA_BUFFER; diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c index 97dced8b75..9be7418870 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c @@ -91,7 +91,7 @@ int mca_atomic_mxm_fadd(void *target, /* mxm request init */ sreq.base.state = MXM_REQ_NEW; sreq.base.mq = mca_spml_self->mxm_mq; - sreq.base.conn = mca_spml_self->mxm_peers[pe]->mxm_conn; + sreq.base.conn = mca_spml_self->mxm_peers[pe]->mxm_hw_rdma_conn; sreq.base.completed_cb = NULL; sreq.base.data_type = MXM_REQ_DATA_BUFFER; diff --git a/oshmem/mca/spml/ikrit/help-oshmem-spml-ikrit.txt b/oshmem/mca/spml/ikrit/help-oshmem-spml-ikrit.txt index aacca833bf..6685758b82 100644 --- a/oshmem/mca/spml/ikrit/help-oshmem-spml-ikrit.txt +++ b/oshmem/mca/spml/ikrit/help-oshmem-spml-ikrit.txt @@ -35,9 +35,17 @@ Initialization of MXM library failed. 
Error: %s -[mxm tls] +[mxm shm tls] ERROR: MXM shared memory transport can not be used bacause it is not fully compliant with OSHMEM spec MXM transport setting: %s +[mxm tls] +ERROR: valid mxm transports are: +"ud" "ud,self" "rc" or "dc" + +transport setting is: %s=%s + + + diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index 8eae93b31a..0ad39f1cb1 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -358,6 +358,9 @@ int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs) if (mca_spml_ikrit.mxm_peers[i]->mxm_conn) { mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn); } + if (mca_spml_ikrit.hw_rdma_channel && mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn) { + mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn); + } destroy_ptl_idx(i); if (mca_spml_ikrit.mxm_peers[i]) { OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]); @@ -372,6 +375,7 @@ int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs) int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs) { spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL; + spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL; spml_ikrit_mxm_ep_conn_info_t my_ep_info; #if MXM_API < MXM_VERSION(2,0) mxm_conn_req_t *conn_reqs; @@ -402,6 +406,15 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs) } memset(ep_info, 0x0, sizeof(spml_ikrit_mxm_ep_conn_info_t)); + if (mca_spml_ikrit.hw_rdma_channel) { + ep_hw_rdma_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t)); + if (NULL == ep_hw_rdma_info) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + goto bail; + } + memset(ep_hw_rdma_info, 0x0, nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t)); /* zero all nprocs entries, not just the first */ + } + mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs * sizeof(*(mca_spml_ikrit.mxm_peers))); if (NULL == mca_spml_ikrit.mxm_peers) { @@ -419,6 +432,16 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs) return OSHMEM_ERROR; } #else + if
(mca_spml_ikrit.hw_rdma_channel) { + err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); + if (MXM_OK != err) { + orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true, + mxm_error_string(err)); + return OSHMEM_ERROR; + } + oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info, + sizeof(spml_ikrit_mxm_ep_conn_info_t)); + } err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); if (MXM_OK != err) { orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true, @@ -426,12 +449,11 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs) return OSHMEM_ERROR; } #endif - - opal_progress_register(spml_ikrit_progress); - oshmem_shmem_allgather(&my_ep_info, ep_info, sizeof(spml_ikrit_mxm_ep_conn_info_t)); + opal_progress_register(spml_ikrit_progress); + /* Get the EP connection requests for all the processes from modex */ for (i = 0; i < nprocs; ++i) { @@ -457,6 +479,15 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs) if (OSHMEM_SUCCESS != create_ptl_idx(i)) goto bail; mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i]->mxm_conn, mca_spml_ikrit.mxm_peers[i]); + if (mca_spml_ikrit.hw_rdma_channel) { + err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn); + if (MXM_OK != err) { + SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); + goto bail; + } + } else { + mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i]->mxm_conn; + } #endif } @@ -579,7 +610,7 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, #if MXM_API < MXM_VERSION(2,0) mkeys[i].len = 0; #else - if (mca_spml_ikrit.ud_only) { + if (mca_spml_ikrit.ud_only && !mca_spml_ikrit.hw_rdma_channel) { mkeys[i].len = 0; break; } diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h index 878f571ba1..e0ce288065 
100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.h +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -55,6 +55,7 @@ BEGIN_C_DECLS struct mxm_peer { opal_list_item_t super; mxm_conn_h mxm_conn; + mxm_conn_h mxm_hw_rdma_conn; int pe; int32_t n_active_puts; int need_fence; @@ -68,8 +69,10 @@ struct mca_spml_ikrit_t { mxm_context_opts_t *mxm_ctx_opts; mxm_ep_opts_t *mxm_ep_opts; + mxm_ep_opts_t *mxm_ep_hw_rdma_opts; mxm_h mxm_context; mxm_ep_h mxm_ep; + mxm_ep_h mxm_hw_rdma_ep; mxm_mq_h mxm_mq; mxm_peer_t **mxm_peers; @@ -92,6 +95,8 @@ struct mca_spml_ikrit_t { int ud_only; /* only ud transport is used. In this case it is possible to speedup mkey exchange and not to register memheap */ + int hw_rdma_channel; /* true if we provide separate channel that + has true one sided capability */ int np; #if MXM_API >= MXM_VERSION(2,0) int unsync_conn_max; diff --git a/oshmem/mca/spml/ikrit/spml_ikrit_component.c b/oshmem/mca/spml/ikrit/spml_ikrit_component.c index 5b479fe0c4..207964008c 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit_component.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit_component.c @@ -104,6 +104,19 @@ static inline int set_mxm_tls() } return check_mxm_tls("MXM_TLS"); } + +static inline void set_mxm_rc_tls() +{ + char *tls; + + tls = getenv("MXM_OSHMEM_HW_RDMA_TLS"); + if (NULL != tls) { + return; + } + + setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 1); + return; +} #endif static inline void mca_spml_ikrit_param_register_int(const char* param_name, @@ -156,7 +169,9 @@ static int mca_spml_ikrit_component_register(void) mca_spml_ikrit_param_register_int("priority", 20, "[integer] ikrit priority", &mca_spml_ikrit.priority); - + mca_spml_ikrit_param_register_int("hw_rdma_channel", 0, + "create separate reliable connection channel", + &mca_spml_ikrit.hw_rdma_channel); mca_spml_ikrit_param_register_string("mxm_tls", "rc,ud,self", "[string] TL channels for MXM", @@ -215,15 +230,21 @@ static int mca_spml_ikrit_component_open(void) mca_spml_ikrit.ud_only = 0; #if MXM_API < MXM_VERSION(2,1) 
+ mca_spml_ikrit.hw_rdma_channel = 0; /* no OSHMEM_HW_RDMA endpoint options are read on this API path, so force the separate channel off */ if ((MXM_OK != mxm_config_read_context_opts(&mca_spml_ikrit.mxm_ctx_opts)) || (MXM_OK != mxm_config_read_ep_opts(&mca_spml_ikrit.mxm_ep_opts))) #else if (OSHMEM_SUCCESS != set_mxm_tls()) { return OSHMEM_ERROR; } - if (MXM_OK != mxm_config_read_opts(&mca_spml_ikrit.mxm_ctx_opts, - &mca_spml_ikrit.mxm_ep_opts, - "OSHMEM", NULL, 0)) + set_mxm_rc_tls(); + + if ((mca_spml_ikrit.hw_rdma_channel && MXM_OK != mxm_config_read_opts(&mca_spml_ikrit.mxm_ctx_opts, + &mca_spml_ikrit.mxm_ep_hw_rdma_opts, + "OSHMEM_HW_RDMA", NULL, 0)) || + MXM_OK != mxm_config_read_opts(&mca_spml_ikrit.mxm_ctx_opts, + &mca_spml_ikrit.mxm_ep_opts, + "OSHMEM", NULL, 0)) #endif { SPML_ERROR("Failed to parse MXM configuration"); @@ -273,6 +294,8 @@ static int mca_spml_ikrit_component_close(void) #else mxm_config_free_ep_opts(mca_spml_ikrit.mxm_ep_opts); mxm_config_free_context_opts(mca_spml_ikrit.mxm_ctx_opts); + if (mca_spml_ikrit.hw_rdma_channel) + mxm_config_free_ep_opts(mca_spml_ikrit.mxm_ep_hw_rdma_opts); #endif } mca_spml_ikrit.mxm_context = NULL; @@ -302,6 +325,20 @@ static int spml_ikrit_mxm_init(void) mxm_error_string(err)); return OSHMEM_ERROR; } + if (mca_spml_ikrit.hw_rdma_channel) { + err = mxm_ep_create(mca_spml_ikrit.mxm_context, + mca_spml_ikrit.mxm_ep_hw_rdma_opts, + &mca_spml_ikrit.mxm_hw_rdma_ep); + if (MXM_OK != err) { + orte_show_help("help-oshmem-spml-ikrit.txt", + "unable to create endpoint", + true, + mxm_error_string(err)); + return OSHMEM_ERROR; + } + } else { + mca_spml_ikrit.mxm_hw_rdma_ep = mca_spml_ikrit.mxm_ep; + } return OSHMEM_SUCCESS; } @@ -335,6 +372,9 @@ static int mca_spml_ikrit_component_fini(void) if (NULL != mca_spml_ikrit.mxm_ep) { mxm_ep_destroy(mca_spml_ikrit.mxm_ep); } + if (mca_spml_ikrit.hw_rdma_channel && NULL != mca_spml_ikrit.mxm_hw_rdma_ep) { /* mirror the NULL guard used for mxm_ep above */ + mxm_ep_destroy(mca_spml_ikrit.mxm_hw_rdma_ep); + } if(!mca_spml_ikrit.enabled) return OSHMEM_SUCCESS; /* never selected.. return success.. */