diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index e793bf86ae..5fff0ce3d0 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -54,6 +54,7 @@ #include "orte/mca/grpcomm/base/base.h" #include "orte/mca/oob/base/base.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/odls/odls_types.h" #include "orte/mca/filem/base/base.h" #include "orte/mca/errmgr/base/base.h" @@ -148,6 +149,17 @@ int orte_ess_base_app_setup(bool db_restrict_local) "output-", NULL, NULL); } /* Setup the communication infrastructure */ + /* Routed system */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = "orte_routed_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_routed_base_select"; + goto error; + } /* * OOB Layer */ @@ -172,29 +184,40 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_rml_base_select"; goto error; } + /* if we have info on the HNP and local daemon, process it */ + if (NULL != orte_process_info.my_hnp_uri) { + /* we have to set the HNP's name, even though we won't route messages directly + * to it. This is required to ensure that we -do- send messages to the correct + * HNP name + */ + if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, + ORTE_PROC_MY_HNP, NULL))) { + ORTE_ERROR_LOG(ret); + error = "orte_rml_parse_HNP"; + goto error; + } + } + if (NULL != orte_process_info.my_daemon_uri) { + /* extract the daemon's name so we can update the routing table */ + if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, + ORTE_PROC_MY_DAEMON, NULL))) { + ORTE_ERROR_LOG(ret); + error = "orte_rml_parse_daemon"; + goto error; + } + /* Set the contact info in the RML - this won't actually establish + * the connection, but just tells the RML how to reach the daemon + * if/when we attempt to send to it + */ + orte_rml.set_contact_info(orte_process_info.my_daemon_uri); + } + /* setup the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } - /* Routed system */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_select"; - goto error; - } - /* setup the routed info */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed.init_routes"; - goto error; - } /* get a conduit for our use - we never route IO over fabric */ OBJ_CONSTRUCT(&transports, opal_list_t); diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 3fa0a02078..e016b87e54 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -47,6 +47,7 @@ #include "orte/mca/rtc/base/base.h" #include "orte/mca/rml/base/base.h" +#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/routed/base/base.h" #include "orte/mca/routed/routed.h" #include "orte/mca/oob/base/base.h" @@ -386,6 +387,17 @@ int orte_ess_base_orted_setup(char **hosts) } /* Setup the communication infrastructure */ + /* Routed system */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = "orte_routed_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_routed_base_select"; + goto error; + } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_open"; @@ -407,6 +419,21 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } + if (NULL != orte_process_info.my_hnp_uri) { + /* extract the HNP's name so we can update the routing table */ + if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, + ORTE_PROC_MY_HNP, NULL))) { + ORTE_ERROR_LOG(ret); + error = "orte_rml_parse_HNP"; + goto error; + } + /* Set the contact info in the RML - this won't actually establish + * the connection, but just tells the RML how to reach the HNP + * if/when we attempt to send to it + */ + orte_rml.set_contact_info(orte_process_info.my_hnp_uri); + } + /* setup the PMIx server */ if (ORTE_SUCCESS != (ret = pmix_server_init())) { ORTE_ERROR_LOG(ret); @@ -420,23 +447,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_errmgr_base_select"; goto error; } - /* Routed system */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_rml_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_select"; - goto error; - } - /* setup the routed info */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed.init_routes"; - goto error; - } /* get a conduit for our use - we never route IO over fabric */ OBJ_CONSTRUCT(&transports, opal_list_t); @@ -533,12 +543,6 @@ int orte_ess_base_orted_setup(char **hosts) } } - /* setup the routed info */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed.init_routes"; - goto error; - } /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); diff --git a/orte/mca/ess/base/ess_base_std_tool.c b/orte/mca/ess/base/ess_base_std_tool.c index f07fb7a045..80efd37980 100644 --- a/orte/mca/ess/base/ess_base_std_tool.c +++ b/orte/mca/ess/base/ess_base_std_tool.c @@ -157,13 +157,6 @@ int orte_ess_base_tool_setup(void) goto error; } - /* setup the routed info for all components */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed.init_routes"; - goto error; - } - /* setup I/O forwarding system - must come after we init routes */ if (NULL != orte_process_info.my_hnp_uri) { /* only do this if we were given an HNP */ diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 60005d45a2..14699ae42d 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -633,13 +633,6 @@ static int rte_init(void) /* set the event base */ opal_pmix_base_set_evbase(orte_event_base); - /* setup the routed info */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed.init_routes"; - goto error; - } - /* setup the PMIx server */ if (ORTE_SUCCESS != (ret = pmix_server_init())) { /* the server code already barked, so let's be quiet */ diff --git a/orte/mca/routed/base/base.h b/orte/mca/routed/base/base.h index 9dd12cd5c1..79920f59ef 100644 --- a/orte/mca/routed/base/base.h +++ b/orte/mca/routed/base/base.h @@ -53,8 +53,6 @@ ORTE_DECLSPEC int orte_routed_base_update_route(char *module, orte_process_name_ orte_process_name_t *route); ORTE_DECLSPEC orte_process_name_t orte_routed_base_get_route(char *module, orte_process_name_t *target); -ORTE_DECLSPEC int orte_routed_base_init_routes(char *module, - orte_jobid_t job, opal_buffer_t *ndat); ORTE_DECLSPEC int orte_routed_base_route_lost(char *module, const orte_process_name_t *route); ORTE_DECLSPEC bool orte_routed_base_route_is_defined(char *module, diff --git a/orte/mca/routed/base/routed_base_fns.c b/orte/mca/routed/base/routed_base_fns.c index 906860909f..e27aba4332 100644 --- a/orte/mca/routed/base/routed_base_fns.c +++ b/orte/mca/routed/base/routed_base_fns.c @@ -125,25 +125,6 @@ orte_process_name_t orte_routed_base_get_route(char *module, orte_process_name_t return *ORTE_NAME_INVALID; } -int orte_routed_base_init_routes(char *module, - orte_jobid_t job, opal_buffer_t *ndat) -{ - orte_routed_base_active_t *active; - int rc; - - OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) { - if (NULL == module || - 0 == strcmp(module, active->component->base_version.mca_component_name)) { - if (NULL != active->module->init_routes) { - if (ORTE_SUCCESS != (rc = active->module->init_routes(job, ndat))) { - return rc; - } - } - } - } - return ORTE_SUCCESS; -} - int orte_routed_base_route_lost(char *module, const orte_process_name_t *route) { orte_routed_base_active_t *active; diff --git a/orte/mca/routed/base/routed_base_frame.c b/orte/mca/routed/base/routed_base_frame.c index 9649329c66..5f90e8f154 100644 --- a/orte/mca/routed/base/routed_base_frame.c +++ b/orte/mca/routed/base/routed_base_frame.c @@ -45,7 +45,6 @@ orte_routed_API_t orte_routed = { .delete_route = orte_routed_base_delete_route, .update_route = orte_routed_base_update_route, .get_route = orte_routed_base_get_route, - .init_routes = orte_routed_base_init_routes, .route_lost = orte_routed_base_route_lost, .route_is_defined = orte_routed_base_route_is_defined, .set_lifeline = orte_routed_base_set_lifeline, diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index 73663f634e..7762d01f2a 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -46,7 +46,6 @@ static int delete_route(orte_process_name_t *proc); static int update_route(orte_process_name_t *target, orte_process_name_t *route); static orte_process_name_t get_route(orte_process_name_t *target); -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); static void update_routing_plan(void); @@ -64,7 +63,6 @@ orte_routed_module_t orte_routed_binomial_module = { .delete_route = delete_route, .update_route = update_route, .get_route = get_route, - .init_routes = init_routes, .route_lost = route_lost, .route_is_defined = route_is_defined, .set_lifeline = set_lifeline, @@ -89,10 +87,29 @@ static int init(void) { lifeline = NULL; + if (ORTE_PROC_IS_DAEMON) { + /* if we are using static ports, set my lifeline to point at my parent */ + if (orte_static_ports) { + lifeline = ORTE_PROC_MY_PARENT; + } else { + /* set our lifeline to the HNP - we will abort if that connection is lost */ + lifeline = ORTE_PROC_MY_HNP; + } + ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; + } else if (ORTE_PROC_IS_APP) { + /* if we don't have a designated daemon, just + * disqualify ourselves */ + if (NULL == orte_process_info.my_daemon_uri) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } + /* set our lifeline to the local daemon - we will abort if this connection is lost */ + lifeline = ORTE_PROC_MY_DAEMON; + orte_routing_is_enabled = true; + } + /* setup the list of children */ OBJ_CONSTRUCT(&my_children, opal_list_t); num_children = 0; - ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; return ORTE_SUCCESS; } @@ -295,180 +312,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) return *ret; } -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) -{ - /* the binomial module routes all proc communications through - * the local daemon. Daemons must identify which of their - * daemon-peers is "hosting" the specified recipient and - * route the message to that daemon. Daemon contact info - * is handled elsewhere, so all we need to do here is - * ensure that the procs are told to route through their - * local daemon, and that daemons are told how to route - * for each proc - */ - int rc; - - /* if I am a tool, then I stand alone - there is nothing to do */ - if (ORTE_PROC_IS_TOOL) { - return ORTE_SUCCESS; - } - - /* if I am a daemon or HNP, then I have to extract the routing info for this job - * from the data sent to me for launch and update the routing tables to - * point at the daemon for each proc - */ - if (ORTE_PROC_IS_DAEMON) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_binomial: init routes for daemon job %s\n\thnp_uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri)); - - if (NULL == ndat) { - /* indicates this is being called during orte_init. - * Get the HNP's name for possible later use - */ - if (NULL == orte_process_info.my_hnp_uri) { - /* fatal error */ - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - - /* extract the hnp name and store it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* set the contact info into the hash table */ - orte_rml.set_contact_info(orte_process_info.my_hnp_uri); - - /* if we are using static ports, set my lifeline to point at my parent */ - if (orte_static_ports) { - lifeline = ORTE_PROC_MY_PARENT; - } else { - /* set our lifeline to the HNP - we will abort if that connection is lost */ - lifeline = ORTE_PROC_MY_HNP; - } - - /* daemons will send their contact info back to the HNP as - * part of the message confirming they are read to go. HNP's - * load their contact info during orte_init - */ - } else { - /* ndat != NULL means we are getting an update of RML info - * for the daemons - so update our contact info and routes - */ - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, - "%s routed_binomial: completed init routes", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; - } - - - if (ORTE_PROC_IS_HNP) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_binomial: init routes for HNP job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - if (NULL == ndat) { - /* the HNP has no lifeline */ - lifeline = NULL; - } else { - /* if this is for my own jobid, then I am getting an update of RML info - * for the daemons - so update our contact info and routes - */ - if (ORTE_PROC_MY_NAME->jobid == job) { - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } else { - /* if not, then I need to process the callback */ - if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - - return ORTE_SUCCESS; - } - - { /* MUST BE A PROC */ - /* if ndat=NULL, then we are being called during orte_init. In this - * case, we need to setup a few critical pieces of info - */ - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_binomial: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri, - (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri)); - - /* if we are a singleton and we have not spawned our - * supporting HNP, then we don't route and don't need - * the corresponding URIs - */ - if ((orte_process_info.proc_type & ORTE_PROC_SINGLETON) && - !orte_routing_is_enabled) { - return ORTE_SUCCESS; - } - - if (NULL == orte_process_info.my_daemon_uri) { - /* in this module, we absolutely MUST have this information - if - * we didn't get it, then error out - */ - opal_output(0, "%s ERROR: Failed to identify the local daemon's URI", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: This is a fatal condition when the binomial router", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: has been selected - either select the unity router", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: or ensure that the local daemon info is provided", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return ORTE_ERR_FATAL; - } - - /* we have to set the HNP's name, even though we won't route messages directly - * to it. This is required to ensure that we -do- send messages to the correct - * HNP name - */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* extract the daemon's name so we can update the routing table */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, - ORTE_PROC_MY_DAEMON, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* Set the contact info in the RML - this won't actually establish - * the connection, but just tells the RML how to reach the daemon - * if/when we attempt to send to it - */ - orte_rml.set_contact_info(orte_process_info.my_daemon_uri); - - /* set our lifeline to the local daemon - we will abort if this connection is lost */ - lifeline = ORTE_PROC_MY_DAEMON; - orte_routing_is_enabled = true; - return ORTE_SUCCESS; - } -} - static int route_lost(const orte_process_name_t *route) { opal_list_item_t *item; @@ -700,16 +543,6 @@ static int binomial_ft_event(int state) /******** Continue Recovery ********/ else if (OPAL_CRS_CONTINUE == state ) { } - /******** Restart Recovery ********/ - else if (OPAL_CRS_RESTART == state ) { - /* - * Re-exchange the routes - */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { - exit_status = ret; - goto cleanup; - } - } else if (OPAL_CRS_TERM == state ) { /* Nothing */ } diff --git a/orte/mca/routed/debruijn/routed_debruijn.c b/orte/mca/routed/debruijn/routed_debruijn.c index 1c3df2e543..4545fcae77 100644 --- a/orte/mca/routed/debruijn/routed_debruijn.c +++ b/orte/mca/routed/debruijn/routed_debruijn.c @@ -45,7 +45,6 @@ static int delete_route(orte_process_name_t *proc); static int update_route(orte_process_name_t *target, orte_process_name_t *route); static orte_process_name_t get_route(orte_process_name_t *target); -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); static void update_routing_plan(void); @@ -63,7 +62,6 @@ orte_routed_module_t orte_routed_debruijn_module = { .delete_route = delete_route, .update_route = update_route, .get_route = get_route, - .init_routes = init_routes, .route_lost = route_lost, .route_is_defined = route_is_defined, .set_lifeline = set_lifeline, @@ -90,9 +88,28 @@ static int init(void) { lifeline = NULL; + if (ORTE_PROC_IS_DAEMON) { + /* if we are using static ports, set my lifeline to point at my parent */ + if (orte_static_ports) { + lifeline = ORTE_PROC_MY_PARENT; + } else { + /* set our lifeline to the HNP - we will abort if that connection is lost */ + lifeline = ORTE_PROC_MY_HNP; + } + ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; + } else if (ORTE_PROC_IS_APP) { + /* if we don't have a designated daemon, just + * disqualify ourselves */ + if (NULL == orte_process_info.my_daemon_uri) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } + /* set our lifeline to the local daemon - we will abort if this connection is lost */ + lifeline = ORTE_PROC_MY_DAEMON; + orte_routing_is_enabled = true; + } + /* setup the list of children */ OBJ_CONSTRUCT(&my_children, opal_list_t); - ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; return ORTE_SUCCESS; } @@ -283,171 +300,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) return ret; } -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) -{ - /* the debruijn module routes all proc communications through - * the local daemon. Daemons must identify which of their - * daemon-peers is "hosting" the specified recipient and - * route the message to that daemon. Daemon contact info - * is handled elsewhere, so all we need to do here is - * ensure that the procs are told to route through their - * local daemon, and that daemons are told how to route - * for each proc - */ - int rc; - - /* if I am a tool, then I stand alone - there is nothing to do */ - if (ORTE_PROC_IS_TOOL) { - return ORTE_SUCCESS; - } - - /* if I am a daemon or HNP, then I have to extract the routing info for this job - * from the data sent to me for launch and update the routing tables to - * point at the daemon for each proc - */ - if (ORTE_PROC_IS_DAEMON) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_debruijn: init routes for daemon job %s\n\thnp_uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri)); - - if (NULL == ndat) { - /* indicates this is being called during orte_init. - * Get the HNP's name for possible later use - */ - if (NULL == orte_process_info.my_hnp_uri) { - /* fatal error */ - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - - /* extract the hnp name and store it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* set the contact info into the hash table */ - orte_rml.set_contact_info(orte_process_info.my_hnp_uri); - - /* if we are using static ports, set my lifeline to point at my parent */ - if (orte_static_ports) { - lifeline = ORTE_PROC_MY_PARENT; - } else { - /* set our lifeline to the HNP - we will abort if that connection is lost */ - lifeline = ORTE_PROC_MY_HNP; - } - - /* daemons will send their contact info back to the HNP as - * part of the message confirming they are read to go. HNP's - * load their contact info during orte_init - */ - } else { - /* ndat != NULL means we are getting an update of RML info - * for the daemons - so update our contact info and routes - */ - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, - "%s routed_debruijn: completed init routes", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; - } - - - if (ORTE_PROC_IS_HNP) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_debruijn: init routes for HNP job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - if (NULL == ndat) { - /* the HNP has no lifeline */ - lifeline = NULL; - } else { - /* if this is for my own jobid, then I am getting an update of RML info - * for the daemons - so update our contact info and routes - */ - if (ORTE_PROC_MY_NAME->jobid == job) { - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } else { - /* if not, then I need to process the callback */ - if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - - return ORTE_SUCCESS; - } - - { /* MUST BE A PROC */ - /* if ndat=NULL, then we are being called during orte_init. In this - * case, we need to setup a few critical pieces of info - */ - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_debruijn: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri, - (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri)); - - if (NULL == orte_process_info.my_daemon_uri) { - /* in this module, we absolutely MUST have this information - if - * we didn't get it, then error out - */ - opal_output(0, "%s ERROR: Failed to identify the local daemon's URI", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: This is a fatal condition when the debruijn router", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: has been selected - either select the unity router", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: or ensure that the local daemon info is provided", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return ORTE_ERR_FATAL; - } - - /* we have to set the HNP's name, even though we won't route messages directly - * to it. This is required to ensure that we -do- send messages to the correct - * HNP name - */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* extract the daemon's name so we can update the routing table */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, - ORTE_PROC_MY_DAEMON, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* Set the contact info in the RML - this won't actually establish - * the connection, but just tells the RML how to reach the daemon - * if/when we attempt to send to it - */ - orte_rml.set_contact_info(orte_process_info.my_daemon_uri); - - /* set our lifeline to the local daemon - we will abort if this connection is lost */ - lifeline = ORTE_PROC_MY_DAEMON; - - return ORTE_SUCCESS; - } -} - static int route_lost(const orte_process_name_t *route) { opal_list_item_t *item; @@ -615,16 +467,6 @@ static int debruijn_ft_event(int state) /******** Continue Recovery ********/ else if (OPAL_CRS_CONTINUE == state ) { } - /******** Restart Recovery ********/ - else if (OPAL_CRS_RESTART == state ) { - /* - * Re-exchange the routes - */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { - exit_status = ret; - goto cleanup; - } - } else if (OPAL_CRS_TERM == state ) { /* Nothing */ } diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c index 20b010ed98..9024f62f07 100644 --- a/orte/mca/routed/direct/routed_direct.c +++ b/orte/mca/routed/direct/routed_direct.c @@ -37,7 +37,6 @@ static int delete_route(orte_process_name_t *proc); static int update_route(orte_process_name_t *target, orte_process_name_t *route); static orte_process_name_t get_route(orte_process_name_t *target); -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); static void update_routing_plan(void); @@ -55,7 +54,6 @@ orte_routed_module_t orte_routed_direct_module = { .delete_route = delete_route, .update_route = update_route, .get_route = get_route, - .init_routes = init_routes, .route_lost = route_lost, .route_is_defined = route_is_defined, .set_lifeline = set_lifeline, @@ -75,7 +73,31 @@ static opal_list_t my_children; static int init(void) { + lifeline = NULL; + + if (ORTE_PROC_IS_DAEMON) { + /* if we are using static ports, set my lifeline to point at my parent */ + if (orte_static_ports) { + lifeline = ORTE_PROC_MY_PARENT; + } else { + /* set our lifeline to the HNP - we will abort if that connection is lost */ + lifeline = ORTE_PROC_MY_HNP; + } + ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; + } else if (ORTE_PROC_IS_APP) { + /* if we don't have a designated daemon, just + * disqualify ourselves */ + if (NULL == orte_process_info.my_daemon_uri) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } + /* set our lifeline to the local daemon - we will abort if this connection is lost */ + lifeline = ORTE_PROC_MY_DAEMON; + orte_routing_is_enabled = true; + } + + /* setup the list of children */ OBJ_CONSTRUCT(&my_children, opal_list_t); + return ORTE_SUCCESS; } @@ -189,122 +211,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) return *ret; } - -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) -{ - int rc; - - /* if I am a tool, then I stand alone - there is nothing to do */ - if (ORTE_PROC_IS_TOOL) { - return ORTE_SUCCESS; - } - - /* if I am a daemon or HNP, then I have to extract the routing info for this job - * from the data sent to me for launch and update the routing tables to - * point at the daemon for each proc - */ - if (ORTE_PROC_IS_DAEMON) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s direct: init routes for daemon job %s\n\thnp_uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri)); - - if (NULL == ndat) { - /* indicates this is being called during orte_init. - * Get the HNP's name for possible later use - */ - if (NULL == orte_process_info.my_hnp_uri) { - /* fatal error */ - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - - /* extract the hnp name and store it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* set the contact info into the hash table */ - orte_rml.set_contact_info(orte_process_info.my_hnp_uri); - /* the HNP is my lifeline */ - lifeline = ORTE_PROC_MY_HNP; - - /* daemons will send their contact info back to the HNP as - * part of the message confirming they are read to go. HNP's - * load their contact info during orte_init - */ - } else { - /* ndat != NULL means we are getting an update of RML info - * for the daemons - so update our contact info and routes - */ - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, - "%s routed_direct: completed init routes", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; - } - - - if (ORTE_PROC_IS_HNP) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_direct: init routes for HNP job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - if (NULL != ndat) { - /* if this is for my own jobid, then I am getting an update of RML info - * for the daemons - so update our contact info and routes - */ - if (ORTE_PROC_MY_NAME->jobid == job) { - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - - return ORTE_SUCCESS; - } - - /*** MUST BE A PROC ***/ - /* if we were direct launched, there is nothing we need to do. If we - * were launched by mpirun, then we need to set the HNP and daemon info */ - if (NULL != orte_process_info.my_hnp_uri) { - /* extract the hnp name and store it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* we don't set the HNP's contact info as we don't need it - we - * only contact our local daemon, which might be the HNP (in which - * case it will have also been passed as our daemon uri) */ - } - - if (NULL != orte_process_info.my_daemon_uri) { - /* extract the daemon's name so we can update the routing table */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, - ORTE_PROC_MY_DAEMON, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - orte_rml.set_contact_info(orte_process_info.my_daemon_uri); - /* my daemon is my lifeline */ - lifeline = ORTE_PROC_MY_DAEMON; - } - return ORTE_SUCCESS; -} - static int route_lost(const orte_process_name_t *route) { opal_list_item_t *item; @@ -442,16 +348,6 @@ static int direct_ft_event(int state) /******** Continue Recovery ********/ else if (OPAL_CRS_CONTINUE == state ) { } - /******** Restart Recovery ********/ - else if (OPAL_CRS_RESTART == state ) { - /* - * Re-exchange the routes - */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { - exit_status = ret; - goto cleanup; - } - } else if (OPAL_CRS_TERM == state ) { /* Nothing */ } diff --git a/orte/mca/routed/radix/routed_radix.c b/orte/mca/routed/radix/routed_radix.c index 1b1e135c12..321b0482c3 100644 --- a/orte/mca/routed/radix/routed_radix.c +++ b/orte/mca/routed/radix/routed_radix.c @@ -47,7 +47,6 @@ static int delete_route(orte_process_name_t *proc); static int update_route(orte_process_name_t *target, orte_process_name_t *route); static orte_process_name_t get_route(orte_process_name_t *target); -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); static void update_routing_plan(void); @@ -65,7 +64,6 @@ orte_routed_module_t orte_routed_radix_module = { .delete_route = delete_route, .update_route = update_route, .get_route = get_route, - .init_routes = init_routes, .route_lost = route_lost, .route_is_defined = route_is_defined, .set_lifeline = set_lifeline, @@ -90,10 +88,29 @@ static int init(void) { lifeline = NULL; + if (ORTE_PROC_IS_DAEMON) { + /* if we are using static ports, set my lifeline to point at my parent */ + if (orte_static_ports) { + lifeline = ORTE_PROC_MY_PARENT; + } else { + /* set our lifeline to the HNP - we will abort if that connection is lost */ + lifeline = ORTE_PROC_MY_HNP; + } + ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; + } else if (ORTE_PROC_IS_APP) { + /* if we don't have a designated daemon, just + * disqualify ourselves */ + if (NULL == orte_process_info.my_daemon_uri) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } + /* set our lifeline to the local daemon - we will abort if this connection is lost */ + lifeline = ORTE_PROC_MY_DAEMON; + orte_routing_is_enabled = true; + } + /* setup the list of children */ OBJ_CONSTRUCT(&my_children, opal_list_t); num_children = 0; - ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; return ORTE_SUCCESS; } @@ -301,177 +318,6 @@ found: return *ret; } -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) -{ - /* the radix module routes all proc communications through - * the local daemon. Daemons must identify which of their - * daemon-peers is "hosting" the specified recipient and - * route the message to that daemon. Daemon contact info - * is handled elsewhere, so all we need to do here is - * ensure that the procs are told to route through their - * local daemon, and that daemons are told how to route - * for each proc - */ - int rc; - - /* if I am a tool, then I stand alone - there is nothing to do */ - if (ORTE_PROC_IS_TOOL) { - return ORTE_SUCCESS; - } - - /* if I am a daemon or HNP, then I have to extract the routing info for this job - * from the data sent to me for launch and update the routing tables to - * point at the daemon for each proc - */ - if (ORTE_PROC_IS_DAEMON) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_radix: init routes for daemon job %s\n\thnp_uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri)); - - if (NULL == ndat) { - /* indicates this is being called during orte_init. - * Get the HNP's name for possible later use - */ - if (NULL == orte_process_info.my_hnp_uri) { - /* fatal error */ - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - - /* extract the hnp name and store it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* set the contact info into the hash table */ - orte_rml.set_contact_info(orte_process_info.my_hnp_uri); - - /* if we are using static ports, set my lifeline to point at my parent */ - if (orte_static_ports) { - lifeline = ORTE_PROC_MY_PARENT; - } else { - /* set our lifeline to the HNP - we will abort if that connection is lost */ - lifeline = ORTE_PROC_MY_HNP; - } - - /* daemons will send their contact info back to the HNP as - * part of the message confirming they are read to go. HNP's - * load their contact info during orte_init - */ - } else { - /* ndat != NULL means we are getting an update of RML info - * for the daemons - so update our contact info and routes - */ - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, - "%s routed_radix: completed init routes", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; - } - - - if (ORTE_PROC_IS_HNP) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_radix: init routes for HNP job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - if (NULL == ndat) { - /* the HNP has no lifeline */ - lifeline = NULL; - } else { - /* if this is for my own jobid, then I am getting an update of RML info - * for the daemons - so update our contact info and routes - */ - if (ORTE_PROC_MY_NAME->jobid == job) { - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } else { - /* if not, then I need to process the callback */ - if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - - return ORTE_SUCCESS; - } - - { /* MUST BE A PROC */ - /* if we are a singleton and have not yet exec'd our HNP, then - * just return success */ - if (ORTE_PROC_IS_SINGLETON && !orte_routing_is_enabled) { - return ORTE_SUCCESS; - } - - /* if ndat=NULL, then we are being called during orte_init. In this - * case, we need to setup a few critical pieces of info - */ - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, - "%s routed_radix: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri, - (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri)); - - if (NULL == orte_process_info.my_daemon_uri) { - /* in this module, we absolutely MUST have this information - if - * we didn't get it, then error out - */ - opal_output(0, "%s ERROR: Failed to identify the local daemon's URI", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: This is a fatal condition when the radix router", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: has been selected - either select the unity router", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output(0, "%s ERROR: or ensure that the local daemon info is provided", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return ORTE_ERR_FATAL; - } - - /* we have to set the HNP's name, even though we won't route messages directly - * to it. This is required to ensure that we -do- send messages to the correct - * HNP name - */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* extract the daemon's name so we can update the routing table */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, - ORTE_PROC_MY_DAEMON, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* Set the contact info in the RML - this won't actually establish - * the connection, but just tells the RML how to reach the daemon - * if/when we attempt to send to it - */ - orte_rml.set_contact_info(orte_process_info.my_daemon_uri); - - /* set our lifeline to the local daemon - we will abort if this connection is lost */ - lifeline = ORTE_PROC_MY_DAEMON; - - return ORTE_SUCCESS; - } -} - static int route_lost(const orte_process_name_t *route) { opal_list_item_t *item; @@ -678,16 +524,6 @@ static int radix_ft_event(int state) /******** Continue Recovery ********/ else if (OPAL_CRS_CONTINUE == state ) { } - /******** Restart Recovery ********/ - else if (OPAL_CRS_RESTART == state ) { - /* - * Re-exchange the routes - */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { - exit_status = ret; - goto cleanup; - } - } else if (OPAL_CRS_TERM == state ) { /* Nothing */ } diff --git a/orte/mca/routed/routed.h b/orte/mca/routed/routed.h index adf11160a1..cc8c800619 100644 --- a/orte/mca/routed/routed.h +++ b/orte/mca/routed/routed.h @@ -122,19 +122,6 @@ typedef int (*orte_routed_module_update_route_fn_t)(orte_process_name_t *target, */ typedef orte_process_name_t (*orte_routed_module_get_route_fn_t)(orte_process_name_t *target); -/** - * Initialize the routing table - * - * Initialize the routing table for the specified job. This can be rather complex - * and depends entirely upon both the selected module AND whether the function - * is being called by the HNP, an orted, a tool, or an application proc. To - * understand what is happening, you really need to look at the specific module. - * - * Regardless, at the end of the function, the routes to any other process in the - * specified job -must- be defined (even if it is direct) - */ -typedef int (*orte_routed_module_init_routes_fn_t)(orte_jobid_t job, opal_buffer_t *ndat); - /** * Report a route as "lost" * @@ -223,7 +210,6 @@ typedef struct { orte_routed_module_delete_route_fn_t delete_route; orte_routed_module_update_route_fn_t update_route; orte_routed_module_get_route_fn_t get_route; - orte_routed_module_init_routes_fn_t init_routes; orte_routed_module_route_lost_fn_t route_lost; orte_routed_module_route_is_defined_fn_t route_is_defined; orte_routed_module_set_lifeline_fn_t set_lifeline; @@ -246,8 +232,6 @@ typedef int (*orte_routed_API_update_route_fn_t)(char *module, orte_process_name_t *route); typedef orte_process_name_t (*orte_routed_API_get_route_fn_t)(char *module, orte_process_name_t *target); -typedef int (*orte_routed_API_init_routes_fn_t)(char *module, - orte_jobid_t job, opal_buffer_t *ndat); typedef int (*orte_routed_API_route_lost_fn_t)(char *module, const orte_process_name_t *route); typedef bool (*orte_routed_API_route_is_defined_fn_t)(char *module, @@ -265,7 +249,6 @@ typedef struct { orte_routed_API_delete_route_fn_t delete_route; orte_routed_API_update_route_fn_t update_route; orte_routed_API_get_route_fn_t get_route; - orte_routed_API_init_routes_fn_t init_routes; orte_routed_API_route_lost_fn_t route_lost; orte_routed_API_route_is_defined_fn_t route_is_defined; orte_routed_API_set_lifeline_fn_t set_lifeline;