diff --git a/orte/include/orte/constants.h b/orte/include/orte/constants.h index 89b23e86fb..de6c3cbb21 100644 --- a/orte/include/orte/constants.h +++ b/orte/include/orte/constants.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -142,20 +142,12 @@ enum { ORTE_ERR_ALLOCATION_PENDING = (ORTE_ERR_BASE - 43), ORTE_ERR_NO_PATH_TO_TARGET = (ORTE_ERR_BASE - 44), ORTE_ERR_OP_IN_PROGRESS = (ORTE_ERR_BASE - 45), - ORTE_ERR_OPEN_CHANNEL_PEER_FAIL = (ORTE_ERR_BASE - 46), - ORTE_ERR_OPEN_CHANNEL_PEER_REJECT = (ORTE_ERR_BASE - 47), - ORTE_ERR_QOS_TYPE_UNSUPPORTED = (ORTE_ERR_BASE - 48), - ORTE_ERR_QOS_ACK_WINDOW_FULL = (ORTE_ERR_BASE - 49), - ORTE_ERR_ACK_TIMEOUT_SENDER = (ORTE_ERR_BASE - 50), - ORTE_ERR_ACK_TIMEOUT_RECEIVER = (ORTE_ERR_BASE - 51), - ORTE_ERR_LOST_MSG_IN_WINDOW = (ORTE_ERR_BASE - 52), - ORTE_ERR_CHANNEL_BUSY = (ORTE_ERR_BASE - 53), - ORTE_ERR_DUPLICATE_MSG = (ORTE_ERR_BASE - 54), - ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 55), - ORTE_ERR_OPEN_CHANNEL_DUPLICATE = (ORTE_ERR_BASE - 56), - ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 57), - ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 58), - ORTE_ERR_CONDUIT_SEND_FAIL = (ORTE_ERR_BASE - 59) + ORTE_ERR_OPEN_CONDUIT_FAIL = (ORTE_ERR_BASE - 46), + ORTE_ERR_DUPLICATE_MSG = (ORTE_ERR_BASE - 47), + ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 48), + ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 49), + ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 50), + ORTE_ERR_CONDUIT_SEND_FAIL = (ORTE_ERR_BASE - 51) }; #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100) @@ -163,4 +155,3 @@ enum { END_C_DECLS #endif /* ORTE_CONSTANTS_H */ - diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 5fff0ce3d0..79e3a1fe48 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -12,7 +12,7 @@ * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. @@ -223,13 +223,21 @@ int orte_ess_base_app_setup(bool db_restrict_local) OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING); - orte_mgmt_conduit = orte_rml.open_conduit(&transports); + if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) { + ret = ORTE_ERR_OPEN_CONDUIT_FAIL; + error = "orte_rml_open_mgmt_conduit"; + goto error; + } OPAL_LIST_DESTRUCT(&transports); OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING); - orte_coll_conduit = orte_rml.open_conduit(&transports); + if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) { + ret = ORTE_ERR_OPEN_CONDUIT_FAIL; + error = "orte_rml_open_coll_conduit"; + goto error; + } OPAL_LIST_DESTRUCT(&transports); /* diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index a3e3e2d44f..167c308ae1 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -424,13 +424,21 @@ int orte_ess_base_orted_setup(void) OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING); - orte_mgmt_conduit = orte_rml.open_conduit(&transports); + if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) { + ret = ORTE_ERR_OPEN_CONDUIT_FAIL; + error = "orte_rml_open_mgmt_conduit"; + goto error; + } OPAL_LIST_DESTRUCT(&transports); OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING); - orte_coll_conduit = orte_rml.open_conduit(&transports); + if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) { + ret = ORTE_ERR_OPEN_CONDUIT_FAIL; + error = "orte_rml_open_coll_conduit"; + goto error; + } OPAL_LIST_DESTRUCT(&transports); /* add our contact info to our proc object */ diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 6e5b221d4e..27443e0ff3 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -355,13 +355,21 @@ static int rte_init(void) OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING); - orte_mgmt_conduit = orte_rml.open_conduit(&transports); + if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) { + ret = ORTE_ERR_OPEN_CONDUIT_FAIL; + error = "orte_rml_open_mgmt_conduit"; + goto error; + } OPAL_LIST_DESTRUCT(&transports); OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING); - orte_coll_conduit = orte_rml.open_conduit(&transports); + if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) { + ret = ORTE_ERR_OPEN_CONDUIT_FAIL; + error = "orte_rml_open_coll_conduit"; + goto error; + } OPAL_LIST_DESTRUCT(&transports); /* diff --git a/orte/mca/rml/base/rml_base_frame.c b/orte/mca/rml/base/rml_base_frame.c index 803bf2db97..f0916b7bb2 100644 --- a/orte/mca/rml/base/rml_base_frame.c +++ b/orte/mca/rml/base/rml_base_frame.c @@ -146,7 +146,7 @@ static int orte_rml_base_open(mca_base_open_flag_t flags) OBJ_CONSTRUCT(&orte_rml_base.posted_recvs, opal_list_t); OBJ_CONSTRUCT(&orte_rml_base.unmatched_msgs, opal_list_t); OBJ_CONSTRUCT(&orte_rml_base.conduits, opal_pointer_array_t); - opal_pointer_array_init(&orte_rml_base.conduits,1,INT_MAX,1); + opal_pointer_array_init(&orte_rml_base.conduits,1,INT16_MAX,1); /* Open up all available components */ return mca_base_framework_components_open(&orte_rml_base_framework, flags); diff --git a/orte/mca/rml/base/rml_base_stubs.c b/orte/mca/rml/base/rml_base_stubs.c index 9197e10423..25fcef516d 100644 --- a/orte/mca/rml/base/rml_base_stubs.c +++ b/orte/mca/rml/base/rml_base_stubs.c @@ -5,7 +5,7 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2016 Intel Corporation. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -82,10 +82,14 @@ orte_rml_conduit_t orte_rml_API_open_conduit(opal_list_t *attributes) if (NULL != ourmod) { /* we got an answer - store this conduit in our array */ rc = opal_pointer_array_add(&orte_rml_base.conduits, ourmod); + if (rc < 0) { + return ORTE_RML_CONDUIT_INVALID; + } return rc; } /* we get here if nobody could support it */ - return ORTE_ERR_NOT_SUPPORTED; + ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); + return ORTE_RML_CONDUIT_INVALID; } diff --git a/orte/mca/rml/ofi/rml_ofi_component.c b/orte/mca/rml/ofi/rml_ofi_component.c index 2e0213e495..3a34b4171a 100644 --- a/orte/mca/rml/ofi/rml_ofi_component.c +++ b/orte/mca/rml/ofi/rml_ofi_component.c @@ -1006,10 +1006,8 @@ static orte_rml_base_module_t* open_conduit(opal_list_t *attributes) comps = opal_argv_split(comp_attrib, ','); for (i=0; NULL != comps[i]; i++) { /* changing below to check for oob, as trying to use ofi for only mgmt conduit */ - if (0 == strcmp(comps[i], "oob")) { - /* changing below to check for fabric, as trying to use ofi for only coll conduit - if (0 == strcmp(comps[i], "fabric")) { */ - /*if (0 == strcmp(comps[i], "ethernet")) { */ + if (0 == strcasecmp(comps[i], "fabric") || + 0 == strcasecmp(comps[i], "ethernet")) { /* we are a candidate, */ opal_output_verbose(20,orte_rml_base_framework.framework_output, "%s - Forcibly returning ofi socket provider for ethernet transport request", diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index 9efe841641..5cfbb07072 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -198,6 +198,7 @@ typedef uint32_t orte_rml_tag_t; /* Conduit ID */ typedef uint16_t orte_rml_conduit_t; +#define ORTE_RML_CONDUIT_INVALID 0xff /* define an object for reporting transports */ typedef struct { diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index 801373cb66..30fc3c5182 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -195,39 +195,12 @@ int orte_err2str(int errnum, const char **errmsg) case ORTE_ERR_OP_IN_PROGRESS: retval = "Operation in progress"; break; - case ORTE_ERR_OPEN_CHANNEL_PEER_FAIL: - retval = "Open channel to peer failed"; - break; - case ORTE_ERR_OPEN_CHANNEL_PEER_REJECT: - retval = "Open channel to peer was rejected"; - break; - case ORTE_ERR_QOS_TYPE_UNSUPPORTED: - retval = "QoS type unsupported"; - break; - case ORTE_ERR_QOS_ACK_WINDOW_FULL: - retval = "QoS ack window full"; - break; - case ORTE_ERR_ACK_TIMEOUT_SENDER: - retval = "Send ack timed out"; - break; - case ORTE_ERR_ACK_TIMEOUT_RECEIVER: - retval = "Recv ack timed out"; - break; - case ORTE_ERR_LOST_MSG_IN_WINDOW: - retval = "Msg lost in window"; - break; - case ORTE_ERR_CHANNEL_BUSY: - retval = "Channel busy"; - break; - case ORTE_ERR_DUPLICATE_MSG: - retval = "Duplicate message"; + case ORTE_ERR_OPEN_CONDUIT_FAIL: + retval = "Open messaging conduit failed"; break; case ORTE_ERR_OUT_OF_ORDER_MSG: retval = "Out of order message"; break; - case ORTE_ERR_OPEN_CHANNEL_DUPLICATE: - retval = "Duplicate channel open request"; - break; case ORTE_ERR_FORCE_SELECT: retval = "Force select"; break;