diff --git a/orte/include/orte/constants.h b/orte/include/orte/constants.h
index 89b23e86fb..de6c3cbb21 100644
--- a/orte/include/orte/constants.h
+++ b/orte/include/orte/constants.h
@@ -11,7 +11,7 @@
  *                         All rights reserved.
  * Copyright (c) 2014      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
- * Copyright (c) 2015      Intel, Inc. All rights reserved.
+ * Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -142,20 +142,12 @@ enum {
     ORTE_ERR_ALLOCATION_PENDING             = (ORTE_ERR_BASE - 43),
     ORTE_ERR_NO_PATH_TO_TARGET              = (ORTE_ERR_BASE - 44),
     ORTE_ERR_OP_IN_PROGRESS                 = (ORTE_ERR_BASE - 45),
-    ORTE_ERR_OPEN_CHANNEL_PEER_FAIL         = (ORTE_ERR_BASE - 46),
-    ORTE_ERR_OPEN_CHANNEL_PEER_REJECT       = (ORTE_ERR_BASE - 47),
-    ORTE_ERR_QOS_TYPE_UNSUPPORTED           = (ORTE_ERR_BASE - 48),
-    ORTE_ERR_QOS_ACK_WINDOW_FULL            = (ORTE_ERR_BASE - 49),
-    ORTE_ERR_ACK_TIMEOUT_SENDER             = (ORTE_ERR_BASE - 50),
-    ORTE_ERR_ACK_TIMEOUT_RECEIVER           = (ORTE_ERR_BASE - 51),
-    ORTE_ERR_LOST_MSG_IN_WINDOW             = (ORTE_ERR_BASE - 52),
-    ORTE_ERR_CHANNEL_BUSY                   = (ORTE_ERR_BASE - 53),
-    ORTE_ERR_DUPLICATE_MSG                  = (ORTE_ERR_BASE - 54),
-    ORTE_ERR_OUT_OF_ORDER_MSG               = (ORTE_ERR_BASE - 55),
-    ORTE_ERR_OPEN_CHANNEL_DUPLICATE         = (ORTE_ERR_BASE - 56),
-    ORTE_ERR_FORCE_SELECT                   = (ORTE_ERR_BASE - 57),
-    ORTE_ERR_JOB_CANCELLED                  = (ORTE_ERR_BASE - 58),
-    ORTE_ERR_CONDUIT_SEND_FAIL              = (ORTE_ERR_BASE - 59)
+    ORTE_ERR_OPEN_CONDUIT_FAIL              = (ORTE_ERR_BASE - 46),
+    ORTE_ERR_DUPLICATE_MSG                  = (ORTE_ERR_BASE - 47),
+    ORTE_ERR_OUT_OF_ORDER_MSG               = (ORTE_ERR_BASE - 48),
+    ORTE_ERR_FORCE_SELECT                   = (ORTE_ERR_BASE - 49),
+    ORTE_ERR_JOB_CANCELLED                  = (ORTE_ERR_BASE - 50),
+    ORTE_ERR_CONDUIT_SEND_FAIL              = (ORTE_ERR_BASE - 51)
 };
 
 #define ORTE_ERR_MAX                    (ORTE_ERR_BASE - 100)
@@ -163,4 +155,3 @@ enum {
 END_C_DECLS
 
 #endif /* ORTE_CONSTANTS_H */
-
diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c
index 5fff0ce3d0..79e3a1fe48 100644
--- a/orte/mca/ess/base/ess_base_std_app.c
+++ b/orte/mca/ess/base/ess_base_std_app.c
@@ -12,7 +12,7 @@
  * Copyright (c) 2010-2012 Oak Ridge National Labs.  All rights reserved.
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.  All rights
  *                         reserved.
- * Copyright (c) 2013-2016 Intel, Inc.  All rights reserved.
+ * Copyright (c) 2013-2017 Intel, Inc.  All rights reserved.
  * Copyright (c) 2014-2016 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2015      Cisco Systems, Inc.  All rights reserved.
@@ -223,13 +223,25 @@ int orte_ess_base_app_setup(bool db_restrict_local)
     OBJ_CONSTRUCT(&transports, opal_list_t);
     orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                        ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
-    orte_mgmt_conduit = orte_rml.open_conduit(&transports);
+    if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
+        /* release the attribute list before bailing out */
+        OPAL_LIST_DESTRUCT(&transports);
+        ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
+        error = "orte_rml_open_mgmt_conduit";
+        goto error;
+    }
     OPAL_LIST_DESTRUCT(&transports);
 
     OBJ_CONSTRUCT(&transports, opal_list_t);
     orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                        ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
-    orte_coll_conduit = orte_rml.open_conduit(&transports);
+    if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
+        /* release the attribute list before bailing out */
+        OPAL_LIST_DESTRUCT(&transports);
+        ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
+        error = "orte_rml_open_coll_conduit";
+        goto error;
+    }
     OPAL_LIST_DESTRUCT(&transports);
 
     /*
diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c
index a3e3e2d44f..167c308ae1 100644
--- a/orte/mca/ess/base/ess_base_std_orted.c
+++ b/orte/mca/ess/base/ess_base_std_orted.c
@@ -424,13 +424,25 @@ int orte_ess_base_orted_setup(void)
     OBJ_CONSTRUCT(&transports, opal_list_t);
     orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                        ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
-    orte_mgmt_conduit = orte_rml.open_conduit(&transports);
+    if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
+        /* release the attribute list before bailing out */
+        OPAL_LIST_DESTRUCT(&transports);
+        ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
+        error = "orte_rml_open_mgmt_conduit";
+        goto error;
+    }
     OPAL_LIST_DESTRUCT(&transports);
 
     OBJ_CONSTRUCT(&transports, opal_list_t);
     orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                        ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
-    orte_coll_conduit = orte_rml.open_conduit(&transports);
+    if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
+        /* release the attribute list before bailing out */
+        OPAL_LIST_DESTRUCT(&transports);
+        ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
+        error = "orte_rml_open_coll_conduit";
+        goto error;
+    }
     OPAL_LIST_DESTRUCT(&transports);
 
     /* add our contact info to our proc object */
diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c
index d9cc5503cd..f240daaa38 100644
--- a/orte/mca/ess/hnp/ess_hnp_module.c
+++ b/orte/mca/ess/hnp/ess_hnp_module.c
@@ -355,13 +355,25 @@ static int rte_init(void)
     OBJ_CONSTRUCT(&transports, opal_list_t);
     orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                        ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
-    orte_mgmt_conduit = orte_rml.open_conduit(&transports);
+    if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
+        /* release the attribute list before bailing out */
+        OPAL_LIST_DESTRUCT(&transports);
+        ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
+        error = "orte_rml_open_mgmt_conduit";
+        goto error;
+    }
     OPAL_LIST_DESTRUCT(&transports);
 
     OBJ_CONSTRUCT(&transports, opal_list_t);
     orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                        ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
-    orte_coll_conduit = orte_rml.open_conduit(&transports);
+    if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
+        /* release the attribute list before bailing out */
+        OPAL_LIST_DESTRUCT(&transports);
+        ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
+        error = "orte_rml_open_coll_conduit";
+        goto error;
+    }
     OPAL_LIST_DESTRUCT(&transports);
 
     /*
diff --git a/orte/mca/rml/base/rml_base_frame.c b/orte/mca/rml/base/rml_base_frame.c
index 803bf2db97..f0916b7bb2 100644
--- a/orte/mca/rml/base/rml_base_frame.c
+++ b/orte/mca/rml/base/rml_base_frame.c
@@ -146,7 +146,7 @@ static int orte_rml_base_open(mca_base_open_flag_t flags)
     OBJ_CONSTRUCT(&orte_rml_base.posted_recvs, opal_list_t);
     OBJ_CONSTRUCT(&orte_rml_base.unmatched_msgs, opal_list_t);
     OBJ_CONSTRUCT(&orte_rml_base.conduits, opal_pointer_array_t);
-    opal_pointer_array_init(&orte_rml_base.conduits,1,INT_MAX,1);
+    opal_pointer_array_init(&orte_rml_base.conduits,1,INT16_MAX,1);
 
     /* Open up all available components */
     return mca_base_framework_components_open(&orte_rml_base_framework, flags);
diff --git a/orte/mca/rml/base/rml_base_stubs.c b/orte/mca/rml/base/rml_base_stubs.c
index 9197e10423..25fcef516d 100644
--- a/orte/mca/rml/base/rml_base_stubs.c
+++ b/orte/mca/rml/base/rml_base_stubs.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.  All rights
  *                         reserved.
  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2014-2016 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
  * Copyright (c) 2015-2017 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
@@ -82,10 +82,15 @@ orte_rml_conduit_t orte_rml_API_open_conduit(opal_list_t *attributes)
     if (NULL != ourmod) {
         /* we got an answer - store this conduit in our array */
         rc = opal_pointer_array_add(&orte_rml_base.conduits, ourmod);
+        if (rc < 0) {
+            ORTE_ERROR_LOG(rc);
+            return ORTE_RML_CONDUIT_INVALID;
+        }
         return rc;
     }
     /* we get here if nobody could support it */
-    return ORTE_ERR_NOT_SUPPORTED;
+    ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
+    return ORTE_RML_CONDUIT_INVALID;
 }
 
 
diff --git a/orte/mca/rml/ofi/rml_ofi_component.c b/orte/mca/rml/ofi/rml_ofi_component.c
index 2e0213e495..99cc420d8d 100644
--- a/orte/mca/rml/ofi/rml_ofi_component.c
+++ b/orte/mca/rml/ofi/rml_ofi_component.c
@@ -32,6 +32,8 @@
 
 static int rml_ofi_component_open(void);
 static int rml_ofi_component_close(void);
+static int rml_ofi_component_register(void);
+
 static int rml_ofi_component_init(void);
 static orte_rml_base_module_t* open_conduit(opal_list_t *attributes);
 static orte_rml_pathway_t* query_transports(void);
@@ -55,6 +57,7 @@ orte_rml_component_t mca_rml_ofi_component = {
                                   ORTE_RELEASE_VERSION),
         .mca_open_component = rml_ofi_component_open,
         .mca_close_component = rml_ofi_component_close,
+        .mca_register_component_params = rml_ofi_component_register
     },
     .data = {
         /* The component is checkpoint ready */
@@ -81,6 +84,7 @@ orte_rml_ofi_module_t orte_rml_ofi = {
 
 /* Local variables */
 static bool init_done = false;
+static char *ofi_transports_supported = NULL;
 
 static int
 rml_ofi_component_open(void)
@@ -227,6 +231,26 @@ rml_ofi_component_close(void)
     return ORTE_SUCCESS;
 }
 
+/*
+ * Register the "transports" MCA parameter, a comma-delimited list of the
+ * transport types this component will accept in open_conduit().
+ */
+static int rml_ofi_component_register(void)
+{
+    mca_base_component_t *component = &mca_rml_ofi_component.base;
+
+    ofi_transports_supported = strdup("fabric,ethernet");
+    mca_base_component_var_register(component, "transports",
+                                    "Comma-delimited list of transports to support (default=\"fabric,ethernet\")",
+                                    MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
+                                    OPAL_INFO_LVL_2,
+                                    MCA_BASE_VAR_SCOPE_LOCAL,
+                                    &ofi_transports_supported);
+    opal_output_verbose(20, orte_rml_base_framework.framework_output,
+                        "OFI TRANSPORTS %s", ofi_transports_supported);
+    return ORTE_SUCCESS;
+}
+
 void print_provider_info (struct fi_info *cur_fi )
 {
     //Display all the details in the fi_info structure
@@ -279,8 +303,7 @@ static orte_rml_pathway_t* query_transports(void)
 /**
     ofi_prov [in]: the ofi ofi_prov_id that triggered the progress fn
  **/
-__opal_attribute_always_inline__ static inline int
-orte_rml_ofi_progress(ofi_transport_ofi_prov_t* prov)
+static int orte_rml_ofi_progress(ofi_transport_ofi_prov_t* prov)
 {
     ssize_t ret;
     int count=0;    /* number of messages read and processed */
@@ -933,7 +956,16 @@ static orte_rml_base_module_t* make_module( int ofi_prov_id)
     memcpy(mod, &orte_rml_ofi, sizeof(orte_rml_ofi_module_t));
     /* setup the remaining data locations in mod, associate conduit with ofi provider selected*/
     mod->cur_transport_id = ofi_prov_id;
-
+    /* we always go direct to our target peer, so set the routed to "direct" */
+    mod->api.routed = orte_routed.assign_module("direct");
+    if (NULL == mod->api.routed) {
+        /* we can't work */
+        opal_output_verbose(20,orte_rml_base_framework.framework_output,
+                            "%s - Failed to get direct routed support, returning NULL ",
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+        free(mod);
+        return NULL;
+    }
     return (orte_rml_base_module_t*)mod;
 }
 
@@ -997,19 +1029,16 @@ static orte_rml_base_module_t* open_conduit(opal_list_t *attributes)
             }
         }
     }
-    /*[Debug] to check for daemon commn over ofi-ethernet, enable the default conduit ORTE_MGMT_CONDUIT over ofi */
+
     if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) &&
         NULL != comp_attrib) {
         opal_output_verbose(20,orte_rml_base_framework.framework_output,
                             "%s - ORTE_RML_TRANSPORT_TYPE = %s ",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp_attrib);
         comps = opal_argv_split(comp_attrib, ',');
-        for (i=0; NULL != comps[i]; i++) {
-            /* changing below to check for oob, as trying to use ofi for only mgmt conduit */
-            if (0 == strcmp(comps[i], "oob")) {
-            /* changing below to check for fabric, as trying to use ofi for only coll conduit
-            if (0 == strcmp(comps[i], "fabric")) { */
-            /*if (0 == strcmp(comps[i], "ethernet")) { */
+        for (i=0; 0 == i; i++) {
+            /* only the first requested transport is examined; skip if no supported list is set */
+            if (NULL != ofi_transports_supported && NULL != strstr(ofi_transports_supported, comps[i])) {
                 /* we are a candidate, */
                 opal_output_verbose(20,orte_rml_base_framework.framework_output,
                                     "%s - Forcibly returning ofi socket provider for ethernet transport request",
diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h
index 9efe841641..5cfbb07072 100644
--- a/orte/mca/rml/rml_types.h
+++ b/orte/mca/rml/rml_types.h
@@ -198,6 +198,8 @@ typedef uint32_t orte_rml_tag_t;
 
 /* Conduit ID */
 typedef uint16_t orte_rml_conduit_t;
+/* reserved value: the conduit array is capped at INT16_MAX entries, so this is never a valid index */
+#define ORTE_RML_CONDUIT_INVALID 0xffff
 
 /* define an object for reporting transports */
 typedef struct {
diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c
index 801373cb66..30fc3c5182 100644
--- a/orte/util/error_strings.c
+++ b/orte/util/error_strings.c
@@ -195,39 +195,15 @@ int orte_err2str(int errnum, const char **errmsg)
     case ORTE_ERR_OP_IN_PROGRESS:
         retval = "Operation in progress";
         break;
-    case ORTE_ERR_OPEN_CHANNEL_PEER_FAIL:
-        retval = "Open channel to peer failed";
-        break;
-    case ORTE_ERR_OPEN_CHANNEL_PEER_REJECT:
-        retval = "Open channel to peer was rejected";
-        break;
-    case ORTE_ERR_QOS_TYPE_UNSUPPORTED:
-        retval = "QoS type unsupported";
-        break;
-    case ORTE_ERR_QOS_ACK_WINDOW_FULL:
-        retval = "QoS ack window full";
-        break;
-    case ORTE_ERR_ACK_TIMEOUT_SENDER:
-        retval = "Send ack timed out";
-        break;
-    case ORTE_ERR_ACK_TIMEOUT_RECEIVER:
-        retval = "Recv ack timed out";
-        break;
-    case ORTE_ERR_LOST_MSG_IN_WINDOW:
-        retval = "Msg lost in window";
-        break;
-    case ORTE_ERR_CHANNEL_BUSY:
-        retval = "Channel busy";
-        break;
+    case ORTE_ERR_OPEN_CONDUIT_FAIL:
+        retval = "Open messaging conduit failed";
+        break;
     case ORTE_ERR_DUPLICATE_MSG:
         retval = "Duplicate message";
         break;
     case ORTE_ERR_OUT_OF_ORDER_MSG:
         retval = "Out of order message";
         break;
-    case ORTE_ERR_OPEN_CHANNEL_DUPLICATE:
-        retval = "Duplicate channel open request";
-        break;
     case ORTE_ERR_FORCE_SELECT:
         retval = "Force select";
         break;