1
1

Merge pull request #3648 from rhc54/topic/ofi

Clean up the conduit open code so we return detectable errors when co…
Этот коммит содержится в:
Ralph Castain 2017-06-02 18:08:55 -07:00 коммит произвёл GitHub
родитель 68a22689c4 e884cbf5f5
Коммит 51b4078b70
9 изменённых файлов: 81 добавлений и 65 удалений

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science * Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved. * Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -142,20 +142,12 @@ enum {
ORTE_ERR_ALLOCATION_PENDING = (ORTE_ERR_BASE - 43), ORTE_ERR_ALLOCATION_PENDING = (ORTE_ERR_BASE - 43),
ORTE_ERR_NO_PATH_TO_TARGET = (ORTE_ERR_BASE - 44), ORTE_ERR_NO_PATH_TO_TARGET = (ORTE_ERR_BASE - 44),
ORTE_ERR_OP_IN_PROGRESS = (ORTE_ERR_BASE - 45), ORTE_ERR_OP_IN_PROGRESS = (ORTE_ERR_BASE - 45),
ORTE_ERR_OPEN_CHANNEL_PEER_FAIL = (ORTE_ERR_BASE - 46), ORTE_ERR_OPEN_CONDUIT_FAIL = (ORTE_ERR_BASE - 46),
ORTE_ERR_OPEN_CHANNEL_PEER_REJECT = (ORTE_ERR_BASE - 47), ORTE_ERR_DUPLICATE_MSG = (ORTE_ERR_BASE - 47),
ORTE_ERR_QOS_TYPE_UNSUPPORTED = (ORTE_ERR_BASE - 48), ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 48),
ORTE_ERR_QOS_ACK_WINDOW_FULL = (ORTE_ERR_BASE - 49), ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 49),
ORTE_ERR_ACK_TIMEOUT_SENDER = (ORTE_ERR_BASE - 50), ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 50),
ORTE_ERR_ACK_TIMEOUT_RECEIVER = (ORTE_ERR_BASE - 51), ORTE_ERR_CONDUIT_SEND_FAIL = (ORTE_ERR_BASE - 51)
ORTE_ERR_LOST_MSG_IN_WINDOW = (ORTE_ERR_BASE - 52),
ORTE_ERR_CHANNEL_BUSY = (ORTE_ERR_BASE - 53),
ORTE_ERR_DUPLICATE_MSG = (ORTE_ERR_BASE - 54),
ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 55),
ORTE_ERR_OPEN_CHANNEL_DUPLICATE = (ORTE_ERR_BASE - 56),
ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 57),
ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 58),
ORTE_ERR_CONDUIT_SEND_FAIL = (ORTE_ERR_BASE - 59)
}; };
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100) #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)
@ -163,4 +155,3 @@ enum {
END_C_DECLS END_C_DECLS
#endif /* ORTE_CONSTANTS_H */ #endif /* ORTE_CONSTANTS_H */

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science * Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@ -223,13 +223,21 @@ int orte_ess_base_app_setup(bool db_restrict_local)
OBJ_CONSTRUCT(&transports, opal_list_t); OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING); ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
orte_mgmt_conduit = orte_rml.open_conduit(&transports); if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
error = "orte_rml_open_mgmt_conduit";
goto error;
}
OPAL_LIST_DESTRUCT(&transports); OPAL_LIST_DESTRUCT(&transports);
OBJ_CONSTRUCT(&transports, opal_list_t); OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING); ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
orte_coll_conduit = orte_rml.open_conduit(&transports); if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
error = "orte_rml_open_coll_conduit";
goto error;
}
OPAL_LIST_DESTRUCT(&transports); OPAL_LIST_DESTRUCT(&transports);
/* /*

Просмотреть файл

@ -424,13 +424,21 @@ int orte_ess_base_orted_setup(void)
OBJ_CONSTRUCT(&transports, opal_list_t); OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING); ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
orte_mgmt_conduit = orte_rml.open_conduit(&transports); if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
error = "orte_rml_open_mgmt_conduit";
goto error;
}
OPAL_LIST_DESTRUCT(&transports); OPAL_LIST_DESTRUCT(&transports);
OBJ_CONSTRUCT(&transports, opal_list_t); OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING); ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
orte_coll_conduit = orte_rml.open_conduit(&transports); if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
error = "orte_rml_open_coll_conduit";
goto error;
}
OPAL_LIST_DESTRUCT(&transports); OPAL_LIST_DESTRUCT(&transports);
/* add our contact info to our proc object */ /* add our contact info to our proc object */

Просмотреть файл

@ -355,13 +355,21 @@ static int rte_init(void)
OBJ_CONSTRUCT(&transports, opal_list_t); OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING); ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
orte_mgmt_conduit = orte_rml.open_conduit(&transports); if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
error = "orte_rml_open_mgmt_conduit";
goto error;
}
OPAL_LIST_DESTRUCT(&transports); OPAL_LIST_DESTRUCT(&transports);
OBJ_CONSTRUCT(&transports, opal_list_t); OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING); ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
orte_coll_conduit = orte_rml.open_conduit(&transports); if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
error = "orte_rml_open_coll_conduit";
goto error;
}
OPAL_LIST_DESTRUCT(&transports); OPAL_LIST_DESTRUCT(&transports);
/* /*

Просмотреть файл

@ -146,7 +146,7 @@ static int orte_rml_base_open(mca_base_open_flag_t flags)
OBJ_CONSTRUCT(&orte_rml_base.posted_recvs, opal_list_t); OBJ_CONSTRUCT(&orte_rml_base.posted_recvs, opal_list_t);
OBJ_CONSTRUCT(&orte_rml_base.unmatched_msgs, opal_list_t); OBJ_CONSTRUCT(&orte_rml_base.unmatched_msgs, opal_list_t);
OBJ_CONSTRUCT(&orte_rml_base.conduits, opal_pointer_array_t); OBJ_CONSTRUCT(&orte_rml_base.conduits, opal_pointer_array_t);
opal_pointer_array_init(&orte_rml_base.conduits,1,INT_MAX,1); opal_pointer_array_init(&orte_rml_base.conduits,1,INT16_MAX,1);
/* Open up all available components */ /* Open up all available components */
return mca_base_framework_components_open(&orte_rml_base_framework, flags); return mca_base_framework_components_open(&orte_rml_base_framework, flags);

Просмотреть файл

@ -5,7 +5,7 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science * Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -82,10 +82,14 @@ orte_rml_conduit_t orte_rml_API_open_conduit(opal_list_t *attributes)
if (NULL != ourmod) { if (NULL != ourmod) {
/* we got an answer - store this conduit in our array */ /* we got an answer - store this conduit in our array */
rc = opal_pointer_array_add(&orte_rml_base.conduits, ourmod); rc = opal_pointer_array_add(&orte_rml_base.conduits, ourmod);
if (rc < 0) {
return ORTE_RML_CONDUIT_INVALID;
}
return rc; return rc;
} }
/* we get here if nobody could support it */ /* we get here if nobody could support it */
return ORTE_ERR_NOT_SUPPORTED; ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
return ORTE_RML_CONDUIT_INVALID;
} }

Просмотреть файл

@ -32,6 +32,8 @@
static int rml_ofi_component_open(void); static int rml_ofi_component_open(void);
static int rml_ofi_component_close(void); static int rml_ofi_component_close(void);
static int rml_ofi_component_register(void);
static int rml_ofi_component_init(void); static int rml_ofi_component_init(void);
static orte_rml_base_module_t* open_conduit(opal_list_t *attributes); static orte_rml_base_module_t* open_conduit(opal_list_t *attributes);
static orte_rml_pathway_t* query_transports(void); static orte_rml_pathway_t* query_transports(void);
@ -55,6 +57,7 @@ orte_rml_component_t mca_rml_ofi_component = {
ORTE_RELEASE_VERSION), ORTE_RELEASE_VERSION),
.mca_open_component = rml_ofi_component_open, .mca_open_component = rml_ofi_component_open,
.mca_close_component = rml_ofi_component_close, .mca_close_component = rml_ofi_component_close,
.mca_register_component_params = rml_ofi_component_register
}, },
.data = { .data = {
/* The component is checkpoint ready */ /* The component is checkpoint ready */
@ -81,6 +84,7 @@ orte_rml_ofi_module_t orte_rml_ofi = {
/* Local variables */ /* Local variables */
static bool init_done = false; static bool init_done = false;
static char *ofi_transports_supported = NULL;
static int static int
rml_ofi_component_open(void) rml_ofi_component_open(void)
@ -227,6 +231,21 @@ rml_ofi_component_close(void)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/**
 * Register the MCA parameters of the OFI RML component.
 *
 * Registers a single string variable, "transports", backed by the
 * file-scope pointer ofi_transports_supported and seeded with the
 * default "fabric,ethernet".
 *
 * @return ORTE_SUCCESS in all cases; the result of the variable
 *         registration call is not inspected.
 */
static int rml_ofi_component_register(void)
{
    mca_base_component_t *component = &mca_rml_ofi_component.base;

    /* NOTE(review): presumably the MCA variable system keeps its own copy of
     * a string default and overwrites the storage pointer; if so, this
     * strdup'd buffer is leaked - confirm against mca_base_var. */
    ofi_transports_supported = strdup("fabric,ethernet");
    /* NOTE(review): the return value of the registration is ignored, so a
     * failed registration goes unreported. */
    mca_base_component_var_register(component, "transports",
                                    "Comma-delimited list of transports to support (default=\"fabric,ethernet\")",
                                    MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                    OPAL_INFO_LVL_2,
                                    MCA_BASE_VAR_SCOPE_LOCAL,
                                    &ofi_transports_supported);
    /* The pointer may be NULL here (strdup failure, or the variable was
     * cleared), and passing NULL for %s is undefined behavior - substitute a
     * printable placeholder.
     * NOTE(review): this is written to output stream 0 unconditionally, which
     * looks like leftover debug output - consider opal_output_verbose. */
    opal_output(0, "OFI TRANSPORTS %s",
                (NULL != ofi_transports_supported) ? ofi_transports_supported : "NULL");
    return ORTE_SUCCESS;
}
void print_provider_info (struct fi_info *cur_fi ) void print_provider_info (struct fi_info *cur_fi )
{ {
//Display all the details in the fi_info structure //Display all the details in the fi_info structure
@ -279,8 +298,7 @@ static orte_rml_pathway_t* query_transports(void)
/** /**
ofi_prov [in]: the ofi ofi_prov_id that triggered the progress fn ofi_prov [in]: the ofi ofi_prov_id that triggered the progress fn
**/ **/
__opal_attribute_always_inline__ static inline int static int orte_rml_ofi_progress(ofi_transport_ofi_prov_t* prov)
orte_rml_ofi_progress(ofi_transport_ofi_prov_t* prov)
{ {
ssize_t ret; ssize_t ret;
int count=0; /* number of messages read and processed */ int count=0; /* number of messages read and processed */
@ -933,7 +951,16 @@ static orte_rml_base_module_t* make_module( int ofi_prov_id)
memcpy(mod, &orte_rml_ofi, sizeof(orte_rml_ofi_module_t)); memcpy(mod, &orte_rml_ofi, sizeof(orte_rml_ofi_module_t));
/* setup the remaining data locations in mod, associate conduit with ofi provider selected*/ /* setup the remaining data locations in mod, associate conduit with ofi provider selected*/
mod->cur_transport_id = ofi_prov_id; mod->cur_transport_id = ofi_prov_id;
/* we always go direct to our target peer, so set the routed to "direct" */
mod->api.routed = orte_routed.assign_module("direct");
if (NULL == mod->api.routed) {
/* we can't work */
opal_output_verbose(20,orte_rml_base_framework.framework_output,
"%s - Failed to get direct routed support, returning NULL ",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
free(mod);
return NULL;
}
return (orte_rml_base_module_t*)mod; return (orte_rml_base_module_t*)mod;
} }
@ -997,19 +1024,15 @@ static orte_rml_base_module_t* open_conduit(opal_list_t *attributes)
} }
} }
} }
/*[Debug] to check for daemon commn over ofi-ethernet, enable the default conduit ORTE_MGMT_CONDUIT over ofi */
if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) && if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) &&
NULL != comp_attrib) { NULL != comp_attrib) {
opal_output_verbose(20,orte_rml_base_framework.framework_output, opal_output_verbose(20,orte_rml_base_framework.framework_output,
"%s - ORTE_RML_TRANSPORT_TYPE = %s ", "%s - ORTE_RML_TRANSPORT_TYPE = %s ",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp_attrib); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp_attrib);
comps = opal_argv_split(comp_attrib, ','); comps = opal_argv_split(comp_attrib, ',');
for (i=0; NULL != comps[i]; i++) { for (i=0; 0 == i; i++) {
/* changing below to check for oob, as trying to use ofi for only mgmt conduit */ if (NULL != strstr(ofi_transports_supported, comps[i])) {
if (0 == strcmp(comps[i], "oob")) {
/* changing below to check for fabric, as trying to use ofi for only coll conduit
if (0 == strcmp(comps[i], "fabric")) { */
/*if (0 == strcmp(comps[i], "ethernet")) { */
/* we are a candidate, */ /* we are a candidate, */
opal_output_verbose(20,orte_rml_base_framework.framework_output, opal_output_verbose(20,orte_rml_base_framework.framework_output,
"%s - Forcibly returning ofi socket provider for ethernet transport request", "%s - Forcibly returning ofi socket provider for ethernet transport request",

Просмотреть файл

@ -198,6 +198,7 @@ typedef uint32_t orte_rml_tag_t;
/* Conduit ID */ /* Conduit ID */
typedef uint16_t orte_rml_conduit_t; typedef uint16_t orte_rml_conduit_t;
/* Sentinel conduit ID meaning "no conduit could be opened".
 * Conduit IDs are 16-bit unsigned indices into the RML conduits array, so
 * the sentinel is the largest uint16_t value: a small value such as 0xff
 * (255) would collide with a legitimate index once enough conduits exist. */
#define ORTE_RML_CONDUIT_INVALID 0xffff
/* define an object for reporting transports */ /* define an object for reporting transports */
typedef struct { typedef struct {

Просмотреть файл

@ -195,39 +195,12 @@ int orte_err2str(int errnum, const char **errmsg)
case ORTE_ERR_OP_IN_PROGRESS: case ORTE_ERR_OP_IN_PROGRESS:
retval = "Operation in progress"; retval = "Operation in progress";
break; break;
case ORTE_ERR_OPEN_CHANNEL_PEER_FAIL: case ORTE_ERR_OPEN_CONDUIT_FAIL:
retval = "Open channel to peer failed"; retval = "Open messaging conduit failed";
break;
case ORTE_ERR_OPEN_CHANNEL_PEER_REJECT:
retval = "Open channel to peer was rejected";
break;
case ORTE_ERR_QOS_TYPE_UNSUPPORTED:
retval = "QoS type unsupported";
break;
case ORTE_ERR_QOS_ACK_WINDOW_FULL:
retval = "QoS ack window full";
break;
case ORTE_ERR_ACK_TIMEOUT_SENDER:
retval = "Send ack timed out";
break;
case ORTE_ERR_ACK_TIMEOUT_RECEIVER:
retval = "Recv ack timed out";
break;
case ORTE_ERR_LOST_MSG_IN_WINDOW:
retval = "Msg lost in window";
break;
case ORTE_ERR_CHANNEL_BUSY:
retval = "Channel busy";
break;
case ORTE_ERR_DUPLICATE_MSG:
retval = "Duplicate message";
break; break;
case ORTE_ERR_OUT_OF_ORDER_MSG: case ORTE_ERR_OUT_OF_ORDER_MSG:
retval = "Out of order message"; retval = "Out of order message";
break; break;
case ORTE_ERR_OPEN_CHANNEL_DUPLICATE:
retval = "Duplicate channel open request";
break;
case ORTE_ERR_FORCE_SELECT: case ORTE_ERR_FORCE_SELECT:
retval = "Force select"; retval = "Force select";
break; break;