1
1

Merge branch 'topic/oshmem_spml_ikrit_hw_rdma_channel-fix1'

Этот коммит содержится в:
Alex Mikheev 2014-11-04 12:03:06 +02:00
родитель a4c0019153 1f2ab43ba9
Коммит a2c85c6d16
2 изменённых файлов: 52 добавлений и 31 удалений

Просмотреть файл

@ -7,45 +7,39 @@
# #
# $HEADER$ # $HEADER$
# #
[unable to create endpoint] [unable to create endpoint]
MXM was unable to create an endpoint. Please make sure that the network link is MXM was unable to create an endpoint. Please make sure that the network link is
active on the node and the hardware is functioning. active on the node and the hardware is functioning.
Error: %s Error: %s
#
[unable to get endpoint address] [unable to get endpoint address]
MXM was unable to get endpoint address MXM was unable to get endpoint address
Error: %s Error: %s
#
[mxm mq create] [mxm mq create]
Failed to create MQ for endpoint Failed to create MQ for endpoint
Error: %s Error: %s
#
[errors during mxm_progress] [errors during mxm_progress]
Error %s occurred in attempting to make network progress (mxm_progress). Error %s occurred in attempting to make network progress (mxm_progress).
#
[mxm init] [mxm init]
Initialization of MXM library failed. Initialization of MXM library failed.
Error: %s Error: %s
#
[mxm shm tls] [mxm shm tls]
ERROR: MXM shared memory transport can not be used ERROR: MXM shared memory transport can not be used
because it is not fully compliant with OSHMEM spec because it is not fully compliant with OSHMEM spec
MXM transport setting: %s MXM transport setting: %s
#
[mxm tls] [mxm tls]
ERROR: valid mxm transports are: ERROR: valid mxm transports are:
"ud" "ud,self" "rc" or "dc" "ud" "ud,self" "rc" or "dc"
transport setting is: %s=%s transport setting is: %s=%s
#

Просмотреть файл

@ -74,7 +74,7 @@ static inline int check_mxm_tls(char *var)
"%s=%s", "%s=%s",
var, getenv(var) var, getenv(var)
)) { )) {
orte_show_help("help-oshmem-spml-ikrit.txt", "mxm tls", true, orte_show_help("help-oshmem-spml-ikrit.txt", "mxm shm tls", true,
str); str);
free(str); free(str);
} }
@ -108,21 +108,40 @@ static inline int set_mxm_tls()
setenv("MXM_OSHMEM_TLS", mca_spml_ikrit.mxm_tls, 1); setenv("MXM_OSHMEM_TLS", mca_spml_ikrit.mxm_tls, 1);
return OSHMEM_SUCCESS; return OSHMEM_SUCCESS;
} }
return check_mxm_tls("MXM_TLS"); if (OSHMEM_SUCCESS == check_mxm_tls("MXM_TLS")) {
} setenv("MXM_OSHMEM_TLS", tls, 1);
return OSHMEM_SUCCESS;
static inline void set_mxm_rc_tls()
{
char *tls;
tls = getenv("MXM_OSHMEM_HW_RDMA_TLS");
if (NULL != tls) {
return;
} }
return OSHMEM_ERROR;
setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 1);
return;
} }
/*
 * Check that a transport list is acceptable.
 *
 * v   - name of the environment variable the list was read from; it is
 *       only used when reporting an error.
 * tls - transport list to validate. May be NULL: callers pass the result
 *       of getenv() directly.
 *
 * A list is accepted when it is exactly "rc" or exactly "dc", or when it
 * contains "ud" and contains none of "rc", "dc" or "shm" (substring match).
 *
 * Returns OSHMEM_SUCCESS when the list is accepted. Otherwise the
 * "mxm tls" help topic is shown and OSHMEM_ERROR is returned.
 */
static inline int check_mxm_hw_tls(char *v, char *tls)
{
    if (NULL == tls) {
        /* strcmp()/strstr() on NULL is undefined; report the unset
         * variable through the same help topic instead of crashing. */
        orte_show_help("help-oshmem-spml-ikrit.txt", "mxm tls", true,
                       v, "(not set)");
        return OSHMEM_ERROR;
    }

    /* A single reliable transport on its own is fine. */
    if (0 == strcmp(tls, "rc") || 0 == strcmp(tls, "dc")) {
        return OSHMEM_SUCCESS;
    }

    /* A "ud" based list is fine as long as it does not mix in
     * "rc", "dc" or "shm". */
    if (NULL != strstr(tls, "ud") &&
        NULL == strstr(tls, "rc") &&
        NULL == strstr(tls, "dc") &&
        NULL == strstr(tls, "shm")) {
        return OSHMEM_SUCCESS;
    }

    orte_show_help("help-oshmem-spml-ikrit.txt", "mxm tls", true,
                   v, tls);
    return OSHMEM_ERROR;
}
/*
 * Pick the environment variable that holds the transport list to be
 * validated and run check_mxm_hw_tls() on its current value.
 *
 * When mca_spml_ikrit.hw_rdma_channel is zero, MXM_OSHMEM_TLS is checked
 * as is. Otherwise MXM_OSHMEM_HW_RDMA_RC_QP_LIMIT and
 * MXM_OSHMEM_HW_RDMA_TLS are first given defaults ("-1" and "rc") and
 * MXM_OSHMEM_HW_RDMA_TLS is checked.
 *
 * Returns whatever check_mxm_hw_tls() returns.
 */
static inline int set_mxm_hw_rdma_tls()
{
    char *env_name;

    if (mca_spml_ikrit.hw_rdma_channel) {
        /* Overwrite flag is 0: values already present in the
         * environment are left untouched. */
        setenv("MXM_OSHMEM_HW_RDMA_RC_QP_LIMIT", "-1", 0);
        setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 0);
        env_name = "MXM_OSHMEM_HW_RDMA_TLS";
    } else {
        env_name = "MXM_OSHMEM_TLS";
    }

    return check_mxm_hw_tls(env_name, getenv(env_name));
}
#endif #endif
static inline void mca_spml_ikrit_param_register_int(const char* param_name, static inline void mca_spml_ikrit_param_register_int(const char* param_name,
@ -157,6 +176,8 @@ static inline void mca_spml_ikrit_param_register_string(const char* param_name,
static int mca_spml_ikrit_component_register(void) static int mca_spml_ikrit_component_register(void)
{ {
char *v;
mca_spml_ikrit_param_register_int("free_list_num", 1024, mca_spml_ikrit_param_register_int("free_list_num", 1024,
0, 0,
&mca_spml_ikrit.free_list_num); &mca_spml_ikrit.free_list_num);
@ -178,8 +199,13 @@ static int mca_spml_ikrit_component_register(void)
mca_spml_ikrit_param_register_int("hw_rdma_channel", 0, mca_spml_ikrit_param_register_int("hw_rdma_channel", 0,
"create separate reliable connection channel", "create separate reliable connection channel",
&mca_spml_ikrit.hw_rdma_channel); &mca_spml_ikrit.hw_rdma_channel);
if (!mca_spml_ikrit.hw_rdma_channel)
v = "ud,self";
else
v = "rc,ud,self";
mca_spml_ikrit_param_register_string("mxm_tls", mca_spml_ikrit_param_register_string("mxm_tls",
"rc,ud,self", v,
"[string] TL channels for MXM", "[string] TL channels for MXM",
&mca_spml_ikrit.mxm_tls); &mca_spml_ikrit.mxm_tls);
@ -236,15 +262,16 @@ static int mca_spml_ikrit_component_open(void)
mca_spml_ikrit.ud_only = 0; mca_spml_ikrit.ud_only = 0;
#if MXM_API < MXM_VERSION(2,1) #if MXM_API < MXM_VERSION(2,1)
mca_spml_ikrit.rc_channel = 0; mca_spml_ikrit.hw_rdma_channel = 0;
if ((MXM_OK != mxm_config_read_context_opts(&mca_spml_ikrit.mxm_ctx_opts)) || if ((MXM_OK != mxm_config_read_context_opts(&mca_spml_ikrit.mxm_ctx_opts)) ||
(MXM_OK != mxm_config_read_ep_opts(&mca_spml_ikrit.mxm_ep_opts))) (MXM_OK != mxm_config_read_ep_opts(&mca_spml_ikrit.mxm_ep_opts)))
#else #else
if (OSHMEM_SUCCESS != set_mxm_tls()) { if (OSHMEM_SUCCESS != set_mxm_tls()) {
return OSHMEM_ERROR; return OSHMEM_ERROR;
} }
set_mxm_rc_tls(); if (OSHMEM_SUCCESS != set_mxm_hw_rdma_tls()) {
return OSHMEM_ERROR;
}
if ((mca_spml_ikrit.hw_rdma_channel && MXM_OK != mxm_config_read_opts(&mca_spml_ikrit.mxm_ctx_opts, if ((mca_spml_ikrit.hw_rdma_channel && MXM_OK != mxm_config_read_opts(&mca_spml_ikrit.mxm_ctx_opts,
&mca_spml_ikrit.mxm_ep_hw_rdma_opts, &mca_spml_ikrit.mxm_ep_hw_rdma_opts,
"OSHMEM_HW_RDMA", NULL, 0)) || "OSHMEM_HW_RDMA", NULL, 0)) ||