From f54fda489e444c908c232d999fdf40658f27c0ca Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 31 Oct 2008 21:10:00 +0000 Subject: [PATCH] This is a first step towards supporting fully-routed OOB communications: 1. remove direct routed module (hooray!) 2. add radix tree routed module (binomial remains default) 3. remove duplicate data storage - orteds were storing nidmap and pidmap data in odls, everyone else in ess 4. add ess APIs to update nidmap, add new pidmap - used only by orteds for MPI-2 support 5. modify code to eliminate multiple calls to orte_routed.update_route that recreated info already in ess pidmap. Add ess API to look up that info instead. Modify routed modules to utilize that capability 6. set up new ability to shut down orteds without sending back an "ack" message to mpirun - not utilized yet, will require some changes to plm terminate_orteds functions in managed environments (coming soon) Initial tests indicate that fully routing comm via defined routing trees may not actually have a significant cost for operations like IB QP setup. More tests are required to confirm. This will require an autogen... This commit was SVN r19866. 
--- orte/mca/ess/alps/ess_alps_module.c | 120 +- orte/mca/ess/base/base.h | 2 +- orte/mca/ess/base/ess_base_nidmap.c | 11 +- orte/mca/ess/base/ess_base_std_orted.c | 17 - orte/mca/ess/cnos/ess_cnos_module.c | 3 + orte/mca/ess/env/ess_env_module.c | 127 +- orte/mca/ess/ess.h | 33 + orte/mca/ess/hnp/ess_hnp_module.c | 50 + orte/mca/ess/lsf/ess_lsf_module.c | 124 +- .../portals_utcp/ess_portals_utcp_module.c | 3 + orte/mca/ess/singleton/ess_singleton_module.c | 57 +- orte/mca/ess/slurm/ess_slurm_module.c | 125 +- orte/mca/ess/tool/ess_tool_module.c | 10 +- orte/mca/odls/base/odls_base_close.c | 16 - orte/mca/odls/base/odls_base_default_fns.c | 166 ++- orte/mca/odls/base/odls_base_open.c | 8 - orte/mca/odls/base/odls_private.h | 3 +- orte/mca/odls/odls_types.h | 35 +- orte/mca/oob/tcp/oob_tcp_peer.c | 4 - orte/mca/plm/alps/plm_alps_module.c | 2 +- orte/mca/plm/base/plm_base_heartbeat.c | 2 +- orte/mca/plm/base/plm_base_launch_support.c | 73 +- orte/mca/plm/base/plm_base_orted_cmds.c | 20 +- orte/mca/plm/base/plm_private.h | 3 +- orte/mca/plm/ccp/plm_ccp_module.c | 2 +- orte/mca/plm/lsf/plm_lsf_module.c | 2 +- orte/mca/plm/process/plm_process_module.c | 2 +- orte/mca/plm/rsh/plm_rsh_module.c | 58 +- orte/mca/plm/slurm/plm_slurm_module.c | 2 +- orte/mca/plm/submit/pls_submit_module.c | 2 +- orte/mca/plm/tm/plm_tm_module.c | 2 +- orte/mca/plm/tmd/plm_tmd_module.c | 2 +- orte/mca/rml/base/rml_base_contact.c | 30 +- orte/mca/routed/base/base.h | 11 + orte/mca/routed/base/routed_base_components.c | 12 + orte/mca/routed/binomial/routed_binomial.c | 315 ++--- orte/mca/routed/direct/routed_direct.c | 988 --------------- .../routed/direct/routed_direct_component.c | 51 - orte/mca/routed/linear/routed_linear.c | 258 ++-- orte/mca/routed/{direct => radix}/Makefile.am | 20 +- .../routed/{direct => radix}/configure.params | 0 orte/mca/routed/radix/routed_radix.c | 1082 +++++++++++++++++ .../routed_direct.h => radix/routed_radix.h} | 14 +- 
.../mca/routed/radix/routed_radix_component.c | 75 ++ orte/orted/orted_comm.c | 83 +- orte/runtime/orte_globals.c | 3 +- orte/runtime/orte_globals.h | 7 +- orte/test/system/Makefile | 2 +- orte/test/system/radix.c | 139 +++ orte/util/nidmap.c | 79 +- orte/util/nidmap.h | 3 +- 51 files changed, 2319 insertions(+), 1939 deletions(-) delete mode 100644 orte/mca/routed/direct/routed_direct.c delete mode 100644 orte/mca/routed/direct/routed_direct_component.c rename orte/mca/routed/{direct => radix}/Makefile.am (57%) rename orte/mca/routed/{direct => radix}/configure.params (100%) create mode 100644 orte/mca/routed/radix/routed_radix.c rename orte/mca/routed/{direct/routed_direct.h => radix/routed_radix.h} (51%) create mode 100644 orte/mca/routed/radix/routed_radix_component.c create mode 100644 orte/test/system/radix.c diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c index e31c022797..5edd9d7dab 100644 --- a/orte/mca/ess/alps/ess_alps_module.c +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -40,28 +40,33 @@ static int alps_set_name(void); static int rte_init(char flags); static int rte_finalize(void); static bool proc_is_local(orte_process_name_t *proc); +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); static char* proc_get_hostname(orte_process_name_t *proc); static uint32_t proc_get_arch(orte_process_name_t *proc); static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); static int update_arch(orte_process_name_t *proc, uint32_t arch); +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo); +static int update_nidmap(opal_byte_object_t *bo); orte_ess_base_module_t orte_ess_alps_module = { rte_init, rte_finalize, orte_ess_base_app_abort, proc_is_local, + proc_get_daemon, proc_get_hostname, proc_get_arch, proc_get_local_rank, proc_get_node_rank, update_arch, + add_pidmap, + update_nidmap, NULL /* ft_event */ }; static 
opal_pointer_array_t nidmap; static opal_pointer_array_t jobmap; -static orte_vpid_t nprocs; static int rte_init(char flags) @@ -95,6 +100,8 @@ static int rte_init(char flags) error = "orte_ess_base_tool_setup"; goto error; } + /* as a tool, I don't need a nidmap - so just return now */ + return ORTE_SUCCESS; } else { /* otherwise, I must be an application process - use * the default procedure to finish my setup @@ -104,24 +111,25 @@ static int rte_init(char flags) error = "orte_ess_base_app_setup"; goto error; } - - /* setup the nidmap arrays */ - OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t); - opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8); - - /* setup array of jmaps */ - OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t); - opal_pointer_array_init(&jobmap, 1, INT32_MAX, 1); - jmap = OBJ_NEW(orte_jmap_t); - opal_pointer_array_add(&jobmap, jmap); - - /* if one was provided, build my nidmap */ - if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, - &nidmap, &jmap->pmap, &nprocs))) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_build_nidmap"; - goto error; - } + } + + /* setup the nidmap arrays */ + OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t); + opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8); + + /* setup array of jmaps */ + OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t); + opal_pointer_array_init(&jobmap, 1, INT32_MAX, 1); + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = ORTE_PROC_MY_NAME->jobid; + opal_pointer_array_add(&jobmap, jmap); + + /* if one was provided, build my nidmap */ + if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, + &nidmap, jmap))) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_build_nidmap"; + goto error; } return ORTE_SUCCESS; @@ -151,27 +159,30 @@ static int rte_finalize(void) if (ORTE_SUCCESS != (ret = orte_ess_base_tool_finalize())) { ORTE_ERROR_LOG(ret); } + /* as a tool, I didn't create a nidmap - so just return now */ + return ret; } else { - /* otherwise, I must be an 
application process - deconstruct - * my nidmap and jobmap arrays + /* otherwise, I must be an application process + * use the default procedure to finish */ - nids = (orte_nid_t**)nidmap.addr; - for (i=0; i < nidmap.size && NULL != nids[i]; i++) { - OBJ_RELEASE(nids[i]); - } - OBJ_DESTRUCT(&nidmap); - jmaps = (orte_jmap_t**)jobmap.addr; - for (i=0; i < jobmap.size && NULL != jmaps[i]; i++) { - OBJ_RELEASE(jmaps[i]); - } - OBJ_DESTRUCT(&jobmap); - - /* use the default procedure to finish */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { ORTE_ERROR_LOG(ret); } } + + /* deconstruct my nidmap and jobmap arrays */ + nids = (orte_nid_t**)nidmap.addr; + for (i=0; i < nidmap.size && NULL != nids[i]; i++) { + OBJ_RELEASE(nids[i]); + } + OBJ_DESTRUCT(&nidmap); + jmaps = (orte_jmap_t**)jobmap.addr; + for (i=0; i < jobmap.size && NULL != jmaps[i]; i++) { + OBJ_RELEASE(jmaps[i]); + } + OBJ_DESTRUCT(&jobmap); + return ret; } @@ -201,6 +212,23 @@ static bool proc_is_local(orte_process_name_t *proc) } +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_ess_base_lookup_nid(&nidmap, &jobmap, proc))) { + return ORTE_VPID_INVALID; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:alps: proc %s is hosted by daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + ORTE_VPID_PRINT(nid->daemon))); + + return nid->daemon; +} + static char* proc_get_hostname(orte_process_name_t *proc) { orte_nid_t *nid; @@ -293,6 +321,32 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) return pmap->node_rank; } +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo) +{ + orte_jmap_t *jmap; + int ret; + + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = job; + opal_pointer_array_add(&jobmap, jmap); + + /* build the pmap */ + if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo, &jmap->num_procs, &jmap->pmap))) { + ORTE_ERROR_LOG(ret); + } + + return ret; +} + +static int 
update_nidmap(opal_byte_object_t *bo) +{ + int rc; + /* decode the nidmap - the util will know what to do */ + if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo, &nidmap))) { + ORTE_ERROR_LOG(rc); + } + return rc; +} static int alps_set_name(void) { diff --git a/orte/mca/ess/base/base.h b/orte/mca/ess/base/base.h index 7cb7a0d205..6d2771fd45 100644 --- a/orte/mca/ess/base/base.h +++ b/orte/mca/ess/base/base.h @@ -85,7 +85,7 @@ ORTE_DECLSPEC int orte_ess_base_orted_finalize(void); */ ORTE_DECLSPEC int orte_ess_base_build_nidmap(opal_buffer_t *buffer, opal_pointer_array_t *nidmap, - opal_value_array_t *pmap, orte_vpid_t *num_procs); + orte_jmap_t *jmap); ORTE_DECLSPEC orte_pmap_t* orte_ess_base_lookup_pmap(opal_pointer_array_t *jobmap, orte_process_name_t *proc); diff --git a/orte/mca/ess/base/ess_base_nidmap.c b/orte/mca/ess/base/ess_base_nidmap.c index a1b0822b4d..8c19dc77c8 100644 --- a/orte/mca/ess/base/ess_base_nidmap.c +++ b/orte/mca/ess/base/ess_base_nidmap.c @@ -35,7 +35,7 @@ int orte_ess_base_build_nidmap(opal_buffer_t *buffer, opal_pointer_array_t *nidmap, - opal_value_array_t *pmap, orte_vpid_t *num_procs) + orte_jmap_t *jmap) { int rc; opal_byte_object_t *bo; @@ -71,8 +71,7 @@ int orte_ess_base_build_nidmap(opal_buffer_t *buffer, return rc; } /* unpack the process map */ - if (ORTE_SUCCESS != (rc = orte_util_decode_pidmap(bo, num_procs, - pmap, NULL, NULL))) { + if (ORTE_SUCCESS != (rc = orte_util_decode_pidmap(bo, &jmap->num_procs, &jmap->pmap))) { ORTE_ERROR_LOG(rc); return rc; } @@ -124,7 +123,6 @@ orte_nid_t* orte_ess_base_lookup_nid(opal_pointer_array_t *nidmap, opal_pointer_array_t *jobmap, orte_process_name_t *proc) { - orte_nid_t *nid; orte_nid_t **nids; orte_pmap_t *pmap; @@ -135,10 +133,7 @@ orte_nid_t* orte_ess_base_lookup_nid(opal_pointer_array_t *nidmap, return NULL; } /* looking for a daemon in my family */ - if (NULL == (nid = find_daemon_node(nidmap, proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - } - return nid; + return 
find_daemon_node(nidmap, proc); } /* looking for an application proc */ diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index af397b0e2e..aecbf6ab39 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -272,23 +272,6 @@ error: int orte_ess_base_orted_finalize(void) { - opal_buffer_t ack; - orte_proc_state_t state=ORTE_PROC_STATE_TERMINATED; - orte_exit_code_t exit_code=0; - orte_plm_cmd_flag_t cmd = ORTE_PLM_UPDATE_PROC_STATE; - - /* send a state update so the HNP knows we are "gone" */ - OBJ_CONSTRUCT(&ack, opal_buffer_t); - opal_dss.pack(&ack, &cmd, 1, ORTE_PLM_CMD); - opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->jobid), 1, ORTE_JOBID); - opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->vpid), 1, ORTE_VPID); - opal_dss.pack(&ack, &state, 1, ORTE_PROC_STATE); - opal_dss.pack(&ack, &exit_code, 1, ORTE_EXIT_CODE); - orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0); - OBJ_DESTRUCT(&ack); - /* progress the OOB to ensure the message gets out */ - opal_progress(); - orte_notifier_base_close(); orte_cr_finalize(); diff --git a/orte/mca/ess/cnos/ess_cnos_module.c b/orte/mca/ess/cnos/ess_cnos_module.c index 89ba23b094..898c05a092 100644 --- a/orte/mca/ess/cnos/ess_cnos_module.c +++ b/orte/mca/ess/cnos/ess_cnos_module.c @@ -50,11 +50,14 @@ orte_ess_base_module_t orte_ess_cnos_module = { rte_finalize, rte_abort, proc_is_local, + NULL, /* proc_get_daemon is only used in ORTE */ proc_get_hostname, proc_get_arch, proc_get_local_rank, proc_get_node_rank, update_arch, + NULL, /* add_pidmap is only used in ORTE */ + NULL, /* update_nidmap is only used in ORTE */ NULL /* ft_event */ }; diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index 823db3c2b5..ef0874e7bb 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -66,6 +66,7 @@ #include "orte/util/session_dir.h" #include "orte/util/hnp_contact.h" #include 
"orte/util/name_fns.h" +#include "orte/util/nidmap.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_wait.h" @@ -81,11 +82,14 @@ static int env_set_name(void); static int rte_init(char flags); static int rte_finalize(void); static bool proc_is_local(orte_process_name_t *proc); +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); static char* proc_get_hostname(orte_process_name_t *proc); static uint32_t proc_get_arch(orte_process_name_t *proc); static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); static int update_arch(orte_process_name_t *proc, uint32_t arch); +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo); +static int update_nidmap(opal_byte_object_t *bo); #if OPAL_ENABLE_FT == 1 static int rte_ft_event(int state); @@ -97,11 +101,14 @@ orte_ess_base_module_t orte_ess_env_module = { rte_finalize, orte_ess_base_app_abort, proc_is_local, + proc_get_daemon, proc_get_hostname, proc_get_arch, proc_get_local_rank, proc_get_node_rank, update_arch, + add_pidmap, + update_nidmap, #if OPAL_ENABLE_FT == 1 rte_ft_event #else @@ -111,7 +118,6 @@ orte_ess_base_module_t orte_ess_env_module = { static opal_pointer_array_t nidmap; static opal_pointer_array_t jobmap; -static orte_vpid_t nprocs; static int rte_init(char flags) { @@ -145,6 +151,8 @@ static int rte_init(char flags) error = "orte_ess_base_tool_setup"; goto error; } + /* as a tool, I don't need a nidmap - so just return now */ + return ORTE_SUCCESS; } else { /* otherwise, I must be an application process - use @@ -155,27 +163,27 @@ static int rte_init(char flags) error = "orte_ess_base_app_setup"; goto error; } - - /* setup the nidmap arrays */ - OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t); - opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8); - - /* setup array of jmaps */ - OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t); - opal_pointer_array_init(&jobmap, 1, INT32_MAX, 1); - jmap = 
OBJ_NEW(orte_jmap_t); - jmap->job = ORTE_PROC_MY_NAME->jobid; - opal_pointer_array_add(&jobmap, jmap); - - /* if one was provided, build my nidmap */ - if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, - &nidmap, &jmap->pmap, &nprocs))) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_build_nidmap"; - goto error; - } } - + + /* setup the nidmap arrays */ + OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t); + opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8); + + /* setup array of jmaps */ + OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t); + opal_pointer_array_init(&jobmap, 1, INT32_MAX, 1); + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = ORTE_PROC_MY_NAME->jobid; + opal_pointer_array_add(&jobmap, jmap); + + /* if one was provided, build my nidmap */ + if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, + &nidmap, jmap))) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_build_nidmap"; + goto error; + } + return ORTE_SUCCESS; error: @@ -203,27 +211,30 @@ static int rte_finalize(void) if (ORTE_SUCCESS != (ret = orte_ess_base_tool_finalize())) { ORTE_ERROR_LOG(ret); } + /* as a tool, I didn't create a nidmap - so just return now */ + return ret; } else { - /* otherwise, I must be an application process - deconstruct - * my nidmap and jobmap arrays + /* otherwise, I must be an application process + * use the default procedure to finish */ - nids = (orte_nid_t**)nidmap.addr; - for (i=0; i < nidmap.size && NULL != nids[i]; i++) { - OBJ_RELEASE(nids[i]); - } - OBJ_DESTRUCT(&nidmap); - jmaps = (orte_jmap_t**)jobmap.addr; - for (i=0; i < jobmap.size && NULL != jmaps[i]; i++) { - OBJ_RELEASE(jmaps[i]); - } - OBJ_DESTRUCT(&jobmap); - - /* use the default procedure to finish */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { ORTE_ERROR_LOG(ret); } } + + /* deconstruct my nidmap and jobmap arrays */ + nids = (orte_nid_t**)nidmap.addr; + for (i=0; i < nidmap.size && NULL != nids[i]; i++) { + OBJ_RELEASE(nids[i]); + 
} OBJ_DESTRUCT(&nidmap); + jmaps = (orte_jmap_t**)jobmap.addr; + for (i=0; i < jobmap.size && NULL != jmaps[i]; i++) { + OBJ_RELEASE(jmaps[i]); + } + OBJ_DESTRUCT(&jobmap); + return ret; } @@ -253,6 +264,23 @@ static bool proc_is_local(orte_process_name_t *proc) } +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_ess_base_lookup_nid(&nidmap, &jobmap, proc))) { + return ORTE_VPID_INVALID; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:env: proc %s is hosted by daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + ORTE_VPID_PRINT(nid->daemon))); + + return nid->daemon; +} + static char* proc_get_hostname(orte_process_name_t *proc) { orte_nid_t *nid; @@ -345,6 +373,33 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) return pmap->node_rank; } +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo) +{ + orte_jmap_t *jmap; + int ret; + + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = job; + opal_pointer_array_add(&jobmap, jmap); + + /* build the pmap */ + if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo, &jmap->num_procs, &jmap->pmap))) { + ORTE_ERROR_LOG(ret); + } + + return ret; +} + +static int update_nidmap(opal_byte_object_t *bo) +{ + int rc; + /* decode the nidmap - the util will know what to do */ + if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo, &nidmap))) { + ORTE_ERROR_LOG(rc); + } + return rc; +} + static int env_set_name(void) { char *jobid_str, *procid_str; @@ -602,7 +657,7 @@ static int rte_ft_event(int state) /* if one was provided, build my nidmap */ if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, - &nidmap, &jmap->pmap, &nprocs))) { + &nidmap, jmap))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; diff --git a/orte/mca/ess/ess.h b/orte/mca/ess/ess.h index 043656f6bb..7f06ea2b11 100644 --- a/orte/mca/ess/ess.h +++ b/orte/mca/ess/ess.h @@ -70,6 +70,16 @@ typedef 
void (*orte_ess_base_module_abort_fn_t)(int status, bool report) __opal_ */ typedef bool (*orte_ess_base_module_proc_is_local_fn_t)(orte_process_name_t *proc); +/** + * Get the vpid of the daemon who hosts the specified proc + * + * In order to route messages to the correct place, the RML + * and routed modules need to know the vpid of the daemon + * that hosts the intended recipient. This API accesses + * the pidmap/nidmap to retrieve that info + */ +typedef orte_vpid_t (*orte_ess_base_module_proc_get_daemon_fn_t)(orte_process_name_t *proc); + /** * Get the hostname where a proc resides * @@ -110,7 +120,27 @@ typedef orte_node_rank_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_pr */ typedef int (*orte_ess_base_module_update_arch_fn_t)(orte_process_name_t *proc, uint32_t arch); +/** + * Add a pidmap + * + * When a job is dynamically launched via comm_spawn, the pre-existing daemons need to + * update their knowledge of the process map within the job so they can properly do + * things like route messages. 
This API allows daemons - and anyone else who wants to - to + * add a pidmap for a new job + */ +typedef int (*orte_ess_base_module_add_pidmap_fn_t)(orte_jobid_t job, opal_byte_object_t *bo); +/** + * Update a nidmap + * + * When a job is dynamically launched via comm_spawn, the pre-existing daemons need to + * update their knowledge of the node map that contains info on what daemon resides + * on which nodes + */ +typedef int (*orte_ess_base_module_update_nidmap_fn_t)(opal_byte_object_t *bo); + + + /** * Handle fault tolerance updates * @@ -129,11 +159,14 @@ struct orte_ess_base_module_1_0_0_t { orte_ess_base_module_finalize_fn_t finalize; orte_ess_base_module_abort_fn_t abort; orte_ess_base_module_proc_is_local_fn_t proc_is_local; + orte_ess_base_module_proc_get_daemon_fn_t proc_get_daemon; orte_ess_base_module_proc_get_hostname_fn_t proc_get_hostname; orte_ess_base_module_proc_get_arch_fn_t proc_get_arch; orte_ess_base_module_proc_get_local_rank_fn_t get_local_rank; orte_ess_base_module_proc_get_node_rank_fn_t get_node_rank; orte_ess_base_module_update_arch_fn_t update_arch; + orte_ess_base_module_add_pidmap_fn_t add_pidmap; + orte_ess_base_module_update_nidmap_fn_t update_nidmap; orte_ess_base_module_ft_event_fn_t ft_event; }; typedef struct orte_ess_base_module_1_0_0_t orte_ess_base_module_1_0_0_t; diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 7119d3e79e..502089c005 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -74,11 +74,14 @@ static int rte_init(char flags); static int rte_finalize(void); static void rte_abort(int status, bool report) __opal_attribute_noreturn__; static bool proc_is_local(orte_process_name_t *proc); +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); static char* proc_get_hostname(orte_process_name_t *proc); static uint32_t proc_get_arch(orte_process_name_t *proc); static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); static 
orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); static int update_arch(orte_process_name_t *proc, uint32_t arch); +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo); +static int update_nidmap(opal_byte_object_t *bo); orte_ess_base_module_t orte_ess_hnp_module = { @@ -86,11 +89,14 @@ orte_ess_base_module_t orte_ess_hnp_module = { rte_finalize, rte_abort, proc_is_local, + proc_get_daemon, proc_get_hostname, proc_get_arch, proc_get_local_rank, proc_get_node_rank, update_arch, + add_pidmap, + update_nidmap, NULL /* ft_event */ }; @@ -564,6 +570,24 @@ static orte_proc_t* find_proc(orte_process_name_t *proc) } +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) +{ + orte_proc_t *pdata; + + /* get the proc data */ + if (NULL == (pdata = find_proc(proc))) { + return ORTE_VPID_INVALID; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:hnp: proc %s is hosted by daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + ORTE_VPID_PRINT(pdata->node->daemon->name.vpid))); + + return pdata->node->daemon->name.vpid; +} + static char* proc_get_hostname(orte_process_name_t *proc) { orte_proc_t *pdata; @@ -655,3 +679,29 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) return pdata->node_rank; } + +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo) +{ + /* there is nothing to do here - the HNP can resolve + * all requests directly from its internal data. However, + * we do need to free the data in the byte object to + * be consistent with other modules + */ + if (NULL != bo && NULL != bo->bytes) { + free(bo->bytes); + } + return ORTE_SUCCESS; +} + +static int update_nidmap(opal_byte_object_t *bo) +{ + /* there is nothing to do here - the HNP can resolve + * all requests directly from its internal data. 
However, + * we do need to free the data in the byte object to + * be consistent with other modules + */ + if (NULL != bo && NULL != bo->bytes) { + free(bo->bytes); + } + return ORTE_SUCCESS; +} diff --git a/orte/mca/ess/lsf/ess_lsf_module.c b/orte/mca/ess/lsf/ess_lsf_module.c index 8335ecf839..496b5a7e6c 100644 --- a/orte/mca/ess/lsf/ess_lsf_module.c +++ b/orte/mca/ess/lsf/ess_lsf_module.c @@ -50,28 +50,33 @@ static int lsf_set_name(void); static int rte_init(char flags); static int rte_finalize(void); static bool proc_is_local(orte_process_name_t *proc); +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); static char* proc_get_hostname(orte_process_name_t *proc); static uint32_t proc_get_arch(orte_process_name_t *proc); static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); static int update_arch(orte_process_name_t *proc, uint32_t arch); +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo); +static int update_nidmap(opal_byte_object_t *bo); orte_ess_base_module_t orte_ess_lsf_module = { rte_init, rte_finalize, orte_ess_base_app_abort, proc_is_local, + proc_get_daemon, proc_get_hostname, proc_get_arch, proc_get_local_rank, proc_get_node_rank, update_arch, + add_pidmap, + update_nidmap, NULL /* ft_event */ }; static opal_pointer_array_t nidmap; static opal_pointer_array_t jobmap; -static orte_vpid_t nprocs; static int rte_init(char flags) @@ -105,6 +110,9 @@ static int rte_init(char flags) error = "orte_ess_base_tool_setup"; goto error; } + /* as a tool, I don't need a nidmap - so just return now */ + return ORTE_SUCCESS; + } else { /* otherwise, I must be an application process - use * the default procedure to finish my setup @@ -114,26 +122,25 @@ static int rte_init(char flags) error = "orte_ess_base_app_setup"; goto error; } - - /* setup the nidmap arrays */ - OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t); - opal_pointer_array_init(&nidmap, 8, 
INT32_MAX, 8); - - /* setup array of jmaps */ - OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t); - opal_pointer_array_init(&jobmap, 1, INT32_MAX, 1); - jmap = OBJ_NEW(orte_jmap_t); - jmap->job = ORTE_PROC_MY_NAME->jobid; - opal_pointer_array_add(&jobmap, jmap); - - /* if one was provided, build my nidmap */ - if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, - &nidmap, &jmap->pmap, &nprocs))) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_build_nidmap"; - goto error; - } - + } + + /* setup the nidmap arrays */ + OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t); + opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8); + + /* setup array of jmaps */ + OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t); + opal_pointer_array_init(&jobmap, 1, INT32_MAX, 1); + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = ORTE_PROC_MY_NAME->jobid; + opal_pointer_array_add(&jobmap, jmap); + + /* if one was provided, build my nidmap */ + if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, + &nidmap, jmap))) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_build_nidmap"; + goto error; } return ORTE_SUCCESS; @@ -163,27 +170,30 @@ static int rte_finalize(void) if (ORTE_SUCCESS != (ret = orte_ess_base_tool_finalize())) { ORTE_ERROR_LOG(ret); } + /* as a tool, I didn't create a nidmap - so just return now */ + return ret; } else { - /* otherwise, I must be an application process - deconstruct - * my nidmap and jobmap arrays + /* otherwise, I must be an application process + * use the default procedure to finish */ - nids = (orte_nid_t**)nidmap.addr; - for (i=0; i < nidmap.size && NULL != nids[i]; i++) { - OBJ_RELEASE(nids[i]); - } - OBJ_DESTRUCT(&nidmap); - jmaps = (orte_jmap_t**)jobmap.addr; - for (i=0; i < jobmap.size && NULL != jmaps[i]; i++) { - OBJ_RELEASE(jmaps[i]); - } - OBJ_DESTRUCT(&jobmap); - - /* use the default procedure to finish */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { ORTE_ERROR_LOG(ret); } } + + /* deconstruct 
my nidmap and jobmap arrays */ + nids = (orte_nid_t**)nidmap.addr; + for (i=0; i < nidmap.size && NULL != nids[i]; i++) { + OBJ_RELEASE(nids[i]); + } + OBJ_DESTRUCT(&nidmap); + jmaps = (orte_jmap_t**)jobmap.addr; + for (i=0; i < jobmap.size && NULL != jmaps[i]; i++) { + OBJ_RELEASE(jmaps[i]); + } + OBJ_DESTRUCT(&jobmap); + return ret; } @@ -213,6 +223,23 @@ static bool proc_is_local(orte_process_name_t *proc) } +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_ess_base_lookup_nid(&nidmap, &jobmap, proc))) { + return ORTE_VPID_INVALID; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:lsf: proc %s is hosted by daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + ORTE_VPID_PRINT(nid->daemon))); + + return nid->daemon; +} + static char* proc_get_hostname(orte_process_name_t *proc) { orte_nid_t *nid; @@ -305,6 +332,33 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) return pmap->node_rank; } +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo) +{ + orte_jmap_t *jmap; + int ret; + + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = job; + opal_pointer_array_add(&jobmap, jmap); + + /* build the pmap */ + if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo, &jmap->num_procs, &jmap->pmap))) { + ORTE_ERROR_LOG(ret); + } + + return ret; +} + +static int update_nidmap(opal_byte_object_t *bo) +{ + int rc; + /* decode the nidmap - the util will know what to do */ + if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo, &nidmap))) { + ORTE_ERROR_LOG(rc); + } + return rc; +} + static int lsf_set_name(void) { diff --git a/orte/mca/ess/portals_utcp/ess_portals_utcp_module.c b/orte/mca/ess/portals_utcp/ess_portals_utcp_module.c index 12508e2483..58a1d36712 100644 --- a/orte/mca/ess/portals_utcp/ess_portals_utcp_module.c +++ b/orte/mca/ess/portals_utcp/ess_portals_utcp_module.c @@ -50,11 +50,14 @@ orte_ess_base_module_t orte_ess_portals_utcp_module = { 
rte_finalize, rte_abort, proc_is_local, + NULL, /* proc_get_daemon is only used in ORTE */ proc_get_hostname, proc_get_arch, proc_get_local_rank, proc_get_node_rank, update_arch, + NULL, /* add_pidmap is only used in ORTE */ + NULL, /* update_nidmap is only used in ORTE */ NULL /* ft_event */ }; diff --git a/orte/mca/ess/singleton/ess_singleton_module.c b/orte/mca/ess/singleton/ess_singleton_module.c index dcc6c30417..9849f15f4e 100644 --- a/orte/mca/ess/singleton/ess_singleton_module.c +++ b/orte/mca/ess/singleton/ess_singleton_module.c @@ -45,6 +45,7 @@ #include "orte/mca/routed/routed.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" +#include "orte/util/nidmap.h" #include "orte/mca/ess/ess.h" #include "orte/mca/ess/base/base.h" @@ -68,22 +69,28 @@ static void set_handler_default(int sig) static int rte_init(char flags); static int rte_finalize(void); static bool proc_is_local(orte_process_name_t *proc); +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); static char* proc_get_hostname(orte_process_name_t *proc); static uint32_t proc_get_arch(orte_process_name_t *proc); static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); static int update_arch(orte_process_name_t *proc, uint32_t arch); +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo); +static int update_nidmap(opal_byte_object_t *bo); orte_ess_base_module_t orte_ess_singleton_module = { rte_init, rte_finalize, orte_ess_base_app_abort, proc_is_local, + proc_get_daemon, proc_get_hostname, proc_get_arch, proc_get_local_rank, proc_get_node_rank, update_arch, + add_pidmap, + update_nidmap, NULL /* ft_event */ }; @@ -181,6 +188,7 @@ static int rte_init(char flags) pmap.node_rank = 0; pmap.node = 0; opal_value_array_set_item(&jmap->pmap, 0, &pmap); + jmap->num_procs = 1; /* use the std app init to complete the procedure */ if (ORTE_SUCCESS != (rc = orte_ess_base_app_setup())) { 
@@ -430,14 +438,14 @@ static bool proc_is_local(orte_process_name_t *proc) if (nid->daemon == ORTE_PROC_MY_DAEMON->vpid) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:env: proc %s is LOCAL", + "%s ess:singleton: proc %s is LOCAL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); return true; } OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:env: proc %s is REMOTE", + "%s ess:singleton: proc %s is REMOTE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); @@ -445,6 +453,23 @@ static bool proc_is_local(orte_process_name_t *proc) } +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_ess_base_lookup_nid(&nidmap, &jobmap, proc))) { + return ORTE_VPID_INVALID; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:singleton: proc %s is hosted by daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + ORTE_VPID_PRINT(nid->daemon))); + + return nid->daemon; +} + static char* proc_get_hostname(orte_process_name_t *proc) { orte_nid_t *nid; @@ -536,3 +561,31 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) return pmap->node_rank; } + +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo) +{ + orte_jmap_t *jmap; + int ret; + + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = job; + opal_pointer_array_add(&jobmap, jmap); + + /* build the pmap */ + if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo, &jmap->num_procs, &jmap->pmap))) { + ORTE_ERROR_LOG(ret); + } + + return ret; +} + +static int update_nidmap(opal_byte_object_t *bo) +{ + int rc; + /* decode the nidmap - the util will know what to do */ + if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo, &nidmap))) { + ORTE_ERROR_LOG(rc); + } + return rc; +} + diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 4e1f7183a1..e5abbb47cc 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ 
b/orte/mca/ess/slurm/ess_slurm_module.c @@ -39,6 +39,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" +#include "orte/util/nidmap.h" #include "orte/mca/ess/ess.h" #include "orte/mca/ess/base/base.h" @@ -50,30 +51,34 @@ static int slurm_set_name(void); static int rte_init(char flags); static int rte_finalize(void); static bool proc_is_local(orte_process_name_t *proc); +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); static char* proc_get_hostname(orte_process_name_t *proc); static uint32_t proc_get_arch(orte_process_name_t *proc); static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); static int update_arch(orte_process_name_t *proc, uint32_t arch); - +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo); +static int update_nidmap(opal_byte_object_t *bo); orte_ess_base_module_t orte_ess_slurm_module = { rte_init, rte_finalize, orte_ess_base_app_abort, proc_is_local, + proc_get_daemon, proc_get_hostname, proc_get_arch, proc_get_local_rank, proc_get_node_rank, update_arch, + add_pidmap, + update_nidmap, NULL /* ft_event */ }; static opal_pointer_array_t nidmap; static opal_pointer_array_t jobmap; -static orte_vpid_t nprocs; static int rte_init(char flags) { @@ -106,6 +111,9 @@ static int rte_init(char flags) error = "orte_ess_base_tool_setup"; goto error; } + /* as a tool, I don't need a nidmap - so just return now */ + return ORTE_SUCCESS; + } else { /* otherwise, I must be an application process - use * the default procedure to finish my setup @@ -115,25 +123,25 @@ static int rte_init(char flags) error = "orte_ess_base_app_setup"; goto error; } - - /* setup the nidmap arrays */ - OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t); - opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8); - - /* setup array of jmaps */ - OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t); - opal_pointer_array_init(&jobmap, 1, 
INT32_MAX, 1); - jmap = OBJ_NEW(orte_jmap_t); - jmap->job = ORTE_PROC_MY_NAME->jobid; - opal_pointer_array_add(&jobmap, jmap); - - /* if one was provided, build my nidmap */ - if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, - &nidmap, &jmap->pmap, &nprocs))) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_build_nidmap"; - goto error; - } + } + + /* setup the nidmap arrays */ + OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t); + opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8); + + /* setup array of jmaps */ + OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t); + opal_pointer_array_init(&jobmap, 1, INT32_MAX, 1); + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = ORTE_PROC_MY_NAME->jobid; + opal_pointer_array_add(&jobmap, jmap); + + /* if one was provided, build my nidmap */ + if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf, + &nidmap, jmap))) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_build_nidmap"; + goto error; } return ORTE_SUCCESS; @@ -163,27 +171,30 @@ static int rte_finalize(void) if (ORTE_SUCCESS != (ret = orte_ess_base_tool_finalize())) { ORTE_ERROR_LOG(ret); } + /* as a tool, I didn't create a nidmap - so just return now */ + return ret; } else { - /* otherwise, I must be an application process - deconstruct - * my nidmap and jobmap arrays + /* otherwise, I must be an application process + * use the default procedure to finish */ - nids = (orte_nid_t**)nidmap.addr; - for (i=0; i < nidmap.size && NULL != nids[i]; i++) { - OBJ_RELEASE(nids[i]); - } - OBJ_DESTRUCT(&nidmap); - jmaps = (orte_jmap_t**)jobmap.addr; - for (i=0; i < jobmap.size && NULL != jmaps[i]; i++) { - OBJ_RELEASE(jmaps[i]); - } - OBJ_DESTRUCT(&jobmap); - - /* use the default procedure to finish */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { ORTE_ERROR_LOG(ret); } } + + /* deconstruct my nidmap and jobmap arrays */ + nids = (orte_nid_t**)nidmap.addr; + for (i=0; i < nidmap.size && NULL != nids[i]; i++) { + 
OBJ_RELEASE(nids[i]); + } + OBJ_DESTRUCT(&nidmap); + jmaps = (orte_jmap_t**)jobmap.addr; + for (i=0; i < jobmap.size && NULL != jmaps[i]; i++) { + OBJ_RELEASE(jmaps[i]); + } + OBJ_DESTRUCT(&jobmap); + return ret; } @@ -213,6 +224,23 @@ static bool proc_is_local(orte_process_name_t *proc) } +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_ess_base_lookup_nid(&nidmap, &jobmap, proc))) { + return ORTE_VPID_INVALID; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:slurm: proc %s is hosted by daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + ORTE_VPID_PRINT(nid->daemon))); + + return nid->daemon; +} + static char* proc_get_hostname(orte_process_name_t *proc) { orte_nid_t *nid; @@ -305,6 +333,33 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) return pmap->node_rank; } +static int add_pidmap(orte_jobid_t job, opal_byte_object_t *bo) +{ + orte_jmap_t *jmap; + int ret; + + jmap = OBJ_NEW(orte_jmap_t); + jmap->job = job; + opal_pointer_array_add(&jobmap, jmap); + + /* build the pmap */ + if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo, &jmap->num_procs, &jmap->pmap))) { + ORTE_ERROR_LOG(ret); + } + + return ret; +} + +static int update_nidmap(opal_byte_object_t *bo) +{ + int rc; + /* decode the nidmap - the util will know what to do */ + if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo, &nidmap))) { + ORTE_ERROR_LOG(rc); + } + return rc; +} + static int slurm_set_name(void) { int slurm_nodeid; diff --git a/orte/mca/ess/tool/ess_tool_module.c b/orte/mca/ess/tool/ess_tool_module.c index ed453981c1..a98e6c3247 100644 --- a/orte/mca/ess/tool/ess_tool_module.c +++ b/orte/mca/ess/tool/ess_tool_module.c @@ -44,6 +44,7 @@ static int rte_init(char flags); static void rte_abort(int status, bool report) __opal_attribute_noreturn__; +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); orte_ess_base_module_t orte_ess_tool_module = { @@ 
-51,11 +52,14 @@ orte_ess_base_module_t orte_ess_tool_module = { orte_ess_base_tool_finalize, rte_abort, NULL, /* don't need a local procs fn */ + proc_get_daemon, NULL, /* don't need a proc_get_hostname fn */ NULL, /* don't need a proc_get_arch fn */ NULL, /* don't need a proc_get_local_rank fn */ NULL, /* don't need a proc_get_node_rank fn */ - NULL, /* don't need to update_nidmap */ + NULL, /* don't need to update_arch */ + NULL, /* don't need to add_pidmap */ + NULL, /* don't need to update_nidmap */ NULL /* ft_event */ }; @@ -148,3 +152,7 @@ static void rte_abort(int status, bool report) abort(); } +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) +{ + return ORTE_VPID_INVALID; +} diff --git a/orte/mca/odls/base/odls_base_close.c b/orte/mca/odls/base/odls_base_close.c index 33e01cd081..86d69a8c52 100644 --- a/orte/mca/odls/base/odls_base_close.c +++ b/orte/mca/odls/base/odls_base_close.c @@ -21,12 +21,8 @@ #include -#include "opal/util/trace.h" #include "opal/mca/mca.h" #include "opal/mca/base/base.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/runtime/orte_globals.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" @@ -35,11 +31,6 @@ int orte_odls_base_close(void) { - int i; - char **nodes; - - OPAL_TRACE(5); - /* cleanup globals */ OBJ_DESTRUCT(&orte_odls_globals.mutex); OBJ_DESTRUCT(&orte_odls_globals.cond); @@ -49,13 +40,6 @@ int orte_odls_base_close(void) free(orte_odls_globals.dmap->bytes); free(orte_odls_globals.dmap); } - nodes = (char**)orte_daemonmap.addr; - for (i=0; i < orte_daemonmap.size; i++) { - if (NULL != nodes[i]) { - free(nodes[i]); - } - } - OBJ_DESTRUCT(&orte_daemonmap); /* if no components are available, then punt */ if (!orte_odls_base.components_available) { diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 530aa9daf9..f95df0fa92 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ 
b/orte/mca/odls/base/odls_base_default_fns.c @@ -71,9 +71,6 @@ #include "orte/mca/odls/base/odls_private.h" -static int8_t *app_idx; -static char **slot_str=NULL; - /* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN * THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW */ @@ -82,17 +79,21 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, { int rc; orte_job_t *jdata; + orte_proc_t **procs; orte_job_map_t *map; opal_buffer_t *wireup; opal_byte_object_t bo, *boptr; int32_t numbytes; int8_t flag; + int8_t *tmp; + orte_vpid_t i; /* get the job data pointer */ if (NULL == (jdata = orte_get_job_data_object(job))) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } + procs = (orte_proc_t**)jdata->procs->addr; /* get a pointer to the job map */ map = jdata->map; @@ -190,6 +191,12 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, return rc; } + /* pack the number of procs in this launch */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_procs, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the total slots allocated to us */ if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->total_slots_alloc, 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); @@ -237,6 +244,38 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, /* release the data since it has now been copied into our buffer */ free(bo.bytes); + /* transfer and pack the app_idx array for this job in one pack */ + tmp = (int8_t*)malloc(jdata->num_procs); + for (i=0; i < jdata->num_procs; i++) { + tmp[i] = procs[i]->app_idx; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, tmp, jdata->num_procs, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + free(tmp); + + /* are there cpu_list strings? 
*/ + if (jdata->map->cpu_lists) { + flag = (int8_t)true; + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + for (i=0; i < jdata->num_procs; i++) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &procs[i]->slot_list, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } else { + flag = (int8_t)false; + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + return ORTE_SUCCESS; } @@ -244,7 +283,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, orte_jobid_t *job) { int rc, ret; - orte_vpid_t j; + orte_vpid_t j, host_daemon; orte_odls_child_t *child; orte_std_cntr_t cnt; orte_process_name_t proc, daemon; @@ -252,13 +291,13 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, opal_buffer_t wireup; opal_byte_object_t *bo; int32_t numbytes; - orte_nid_t *node; - orte_pmap_t *pmap; opal_buffer_t alert; opal_list_item_t *item; orte_namelist_t *nm; opal_list_t daemon_tree; int8_t flag; + int8_t *app_idx; + char **slot_str=NULL; orte_jobid_t debugger; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, @@ -283,10 +322,14 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, /* retain a copy for downloading to child processes */ opal_dss.copy((void**)&orte_odls_globals.dmap, bo, OPAL_BYTE_OBJECT); - /* construct the daemon map, if required - the decode function + /* update our local nidmap, if required - the decode function * knows what to do - it will also free the bytes in the bo */ - if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo, &orte_daemonmap))) { + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:construct updating nidmap", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + if (ORTE_SUCCESS != (rc = orte_ess.update_nidmap(bo))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } @@ -408,6 +451,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, } /* 
UNPACK JOB-SPECIFIC DATA */ + /* unpack the number of procs in this launch */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_procs, &cnt, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } /* unpack the total slots allocated to us */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->total_slots_alloc, &cnt, ORTE_STD_CNTR))) { @@ -458,36 +507,62 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, /* retain a copy for downloading to child processes */ opal_dss.copy((void**)&jobdat->pmap, bo, OPAL_BYTE_OBJECT); /* decode the pidmap - this will also free the bytes in bo */ - if (ORTE_SUCCESS != (rc = orte_util_decode_pidmap(bo, &jobdat->num_procs, &jobdat->procmap, &app_idx, &slot_str))) { + if (ORTE_SUCCESS != (rc = orte_ess.add_pidmap(jobdat->jobid, bo))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } + /* allocate memory for app_idx */ + app_idx = (int8_t*)malloc(jobdat->num_procs); + /* unpack app_idx in one shot */ + cnt=jobdat->num_procs; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, app_idx, &cnt, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + + /* unpack flag to indicate if slot_strings are present */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + + if (flag) { + /* allocate space */ + slot_str = (char**)malloc(jobdat->num_procs * sizeof(char*)); + for (j=0; j < jobdat->num_procs; j++) { + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &slot_str[j], &cnt, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + } + } + /* get the daemon tree */ OBJ_CONSTRUCT(&daemon_tree, opal_list_t); orte_routed.get_routing_tree(ORTE_PROC_MY_NAME->jobid, &daemon_tree); /* cycle through the procs and find mine */ - proc.jobid = *job; + proc.jobid = jobdat->jobid; daemon.jobid = ORTE_PROC_MY_NAME->jobid; for (j=0; j < jobdat->num_procs; j++) { proc.vpid = j; - /* ident this proc's node 
*/ - pmap = (orte_pmap_t *) opal_value_array_get_item(&jobdat->procmap, j); - if (pmap->node < 0 || pmap->node >= orte_daemonmap.size) { - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; + /* get the vpid of the daemon that is to host this proc */ + if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + rc = ORTE_ERR_NOT_FOUND; goto REPORT_ERROR; } - node = (orte_nid_t*)orte_daemonmap.addr[pmap->node]; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:constructing child list - checking proc %s on node %d with daemon %s", + "%s odls:constructing child list - checking proc %s on daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(j), - pmap->node, ORTE_VPID_PRINT(node->daemon))); + ORTE_VPID_PRINT(host_daemon))); - /* does this data belong to us? */ - if (ORTE_PROC_MY_NAME->vpid == node->daemon) { + /* does this proc belong to us? */ + if (ORTE_PROC_MY_NAME->vpid == host_daemon) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:constructing child list - found proc %s for me!", @@ -511,13 +586,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, opal_list_append(&orte_odls_globals.children, &child->super); opal_condition_signal(&orte_odls_globals.cond); OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - /* set the routing info to be direct - we need to do this - * prior to launch as the procs may want to communicate right away - */ - if (ORTE_SUCCESS != (rc = orte_routed.update_route(&proc, &proc))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } } else { /* is this proc on a daemon in a branch of the daemon tree * that is below me? 
If so, then the daemon collective will @@ -527,7 +595,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, item != opal_list_get_end(&daemon_tree); item = opal_list_get_next(item)) { nm = (orte_namelist_t*)item; - if (orte_routed.proc_is_below(nm->name.vpid, node->daemon)) { + if (orte_routed.proc_is_below(nm->name.vpid, host_daemon)) { /* add to the count for collectives */ jobdat->num_participating++; /* remove this node from the tree so we don't count it again */ @@ -536,15 +604,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, break; } } - - /* set the routing info through the other daemon - we need to do this - * prior to launch as the procs may want to communicate right away - */ - daemon.vpid = node->daemon; - if (ORTE_SUCCESS != (rc = orte_routed.update_route(&proc, &daemon))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } } } @@ -865,7 +924,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, opal_buffer_t alert; orte_std_cntr_t proc_rank; orte_odls_job_t *jobdat; - orte_pmap_t *pmap; + orte_local_rank_t local_rank; /* protect operations involving the global list of children */ OPAL_THREAD_LOCK(&orte_odls_globals.mutex); @@ -1075,8 +1134,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. 
* We know - just live with it */ - pmap = (orte_pmap_t*)opal_value_array_get_item(&jobdat->procmap, child->name->vpid); - asprintf(&value, "%lu", (unsigned long) pmap->local_rank); + local_rank = orte_ess.get_local_rank(child->name); + asprintf(&value, "%lu", (unsigned long) local_rank); opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env); free(value); @@ -1387,23 +1446,21 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) static void setup_singleton_jobdat(orte_jobid_t jobid) { orte_odls_job_t *jobdat; - orte_pmap_t pmap; int32_t one32; int8_t one8; orte_local_rank_t lrank; orte_node_rank_t nrank; opal_buffer_t buffer; + opal_byte_object_t *bo; int rc; + /* create a job tracking object for it */ jobdat = OBJ_NEW(orte_odls_job_t); jobdat->jobid = jobid; jobdat->num_procs = 1; jobdat->num_local_procs = 1; - pmap.node = 0; /* since it is a singleton, it must be on the first node in array */ - pmap.local_rank = 0; - pmap.node_rank = opal_list_get_size(&orte_odls_globals.children); - opal_value_array_set_item(&jobdat->procmap, 0, &pmap); - /* also need to setup a pidmap for it */ + opal_list_append(&orte_odls_globals.jobs, &jobdat->super); + /* need to setup a pidmap for it */ OBJ_CONSTRUCT(&buffer, opal_buffer_t); opal_dss.pack(&buffer, &(ORTE_PROC_MY_NAME->vpid), 1, ORTE_VPID); /* num_procs */ one32 = 0; @@ -1414,10 +1471,14 @@ static void setup_singleton_jobdat(orte_jobid_t jobid) opal_dss.pack(&buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */ one8 = 0; opal_dss.pack(&buffer, &one8, 1, OPAL_INT8); /* app_idx */ - jobdat->pmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); - opal_dss.unload(&buffer, (void**)&jobdat->pmap->bytes, &jobdat->pmap->size); + opal_dss.unload(&buffer, (void**)&bo->bytes, &bo->size); OBJ_DESTRUCT(&buffer); - opal_list_append(&orte_odls_globals.jobs, &jobdat->super); + /* save a copy to send back to the proc */ + opal_dss.copy((void**)&jobdat->pmap, bo, OPAL_BYTE_OBJECT); + /* update our 
ess data - this will release the byte object's data */ + if (ORTE_SUCCESS != (rc = orte_ess.add_pidmap(jobid, bo))) { + ORTE_ERROR_LOG(rc); + } /* if we don't yet have a daemon map, then we have to generate one * to pass back to it */ @@ -1427,6 +1488,13 @@ static void setup_singleton_jobdat(orte_jobid_t jobid) if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(orte_odls_globals.dmap))) { ORTE_ERROR_LOG(rc); } + /* we also need to update our local nidmap - copy the dmap + * as this will release the byte object's data + */ + opal_dss.copy((void**)&bo, orte_odls_globals.dmap, OPAL_BYTE_OBJECT); + if (ORTE_SUCCESS != (rc = orte_ess.update_nidmap(bo))) { + ORTE_ERROR_LOG(rc); + } } /* setup the daemon collectives */ jobdat->num_participating = 1; diff --git a/orte/mca/odls/base/odls_base_open.c b/orte/mca/odls/base/odls_base_open.c index b1578219f6..23835e6600 100644 --- a/orte/mca/odls/base/odls_base_open.c +++ b/orte/mca/odls/base/odls_base_open.c @@ -106,8 +106,6 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr) ptr->total_slots_alloc = 0; ptr->num_procs = 0; ptr->num_local_procs = 0; - OBJ_CONSTRUCT(&ptr->procmap, opal_value_array_t); - opal_value_array_init(&ptr->procmap, sizeof(orte_pmap_t)); ptr->pmap = NULL; OBJ_CONSTRUCT(&ptr->collection_bucket, opal_buffer_t); OBJ_CONSTRUCT(&ptr->local_collection, opal_buffer_t); @@ -129,8 +127,6 @@ static void orte_odls_job_destructor(orte_odls_job_t *ptr) } } - OBJ_DESTRUCT(&ptr->procmap); - if (NULL != ptr->pmap && NULL != ptr->pmap->bytes) { free(ptr->pmap->bytes); free(ptr->pmap); @@ -173,10 +169,6 @@ int orte_odls_base_open(void) orte_odls_globals.debugger = NULL; orte_odls_globals.debugger_launched = false; - /* initialize and setup the daemonmap */ - OBJ_CONSTRUCT(&orte_daemonmap, opal_pointer_array_t); - opal_pointer_array_init(&orte_daemonmap, 8, INT32_MAX, 8); - /* Open up all available components */ if (ORTE_SUCCESS != diff --git a/orte/mca/odls/base/odls_private.h b/orte/mca/odls/base/odls_private.h 
index aea4d3b78c..0f51fece4a 100644 --- a/orte/mca/odls/base/odls_private.h +++ b/orte/mca/odls/base/odls_private.h @@ -80,8 +80,7 @@ typedef struct orte_odls_job_t { orte_std_cntr_t total_slots_alloc; orte_vpid_t num_procs; int32_t num_local_procs; - opal_value_array_t procmap; /* map of procs/node, local ranks */ - opal_byte_object_t *pmap; /* byte object version of procmap */ + opal_byte_object_t *pmap; /* local copy of pidmap byte object */ opal_buffer_t collection_bucket; opal_buffer_t local_collection; orte_grpcomm_coll_t collective_type; diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index a767469889..0160d3034d 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -38,27 +38,28 @@ typedef uint8_t orte_daemon_cmd_flag_t; #define ORTE_DAEMON_ADD_LOCAL_PROCS (orte_daemon_cmd_flag_t) 4 #define ORTE_DAEMON_TREE_SPAWN (orte_daemon_cmd_flag_t) 5 #define ORTE_DAEMON_HEARTBEAT_CMD (orte_daemon_cmd_flag_t) 6 -#define ORTE_DAEMON_EXIT_CMD (orte_daemon_cmd_flag_t) 7 -#define ORTE_DAEMON_PROCESS_AND_RELAY_CMD (orte_daemon_cmd_flag_t) 8 -#define ORTE_DAEMON_MESSAGE_LOCAL_PROCS (orte_daemon_cmd_flag_t) 9 -#define ORTE_DAEMON_NULL_CMD (orte_daemon_cmd_flag_t) 10 -#define ORTE_DAEMON_SYNC_BY_PROC (orte_daemon_cmd_flag_t) 11 -#define ORTE_DAEMON_SYNC_WANT_NIDMAP (orte_daemon_cmd_flag_t) 12 +#define ORTE_DAEMON_EXIT_WITH_REPLY_CMD (orte_daemon_cmd_flag_t) 7 +#define ORTE_DAEMON_EXIT_NO_REPLY_CMD (orte_daemon_cmd_flag_t) 8 +#define ORTE_DAEMON_PROCESS_AND_RELAY_CMD (orte_daemon_cmd_flag_t) 9 +#define ORTE_DAEMON_MESSAGE_LOCAL_PROCS (orte_daemon_cmd_flag_t) 10 +#define ORTE_DAEMON_NULL_CMD (orte_daemon_cmd_flag_t) 11 +#define ORTE_DAEMON_SYNC_BY_PROC (orte_daemon_cmd_flag_t) 12 +#define ORTE_DAEMON_SYNC_WANT_NIDMAP (orte_daemon_cmd_flag_t) 13 /* commands for use by tools */ -#define ORTE_DAEMON_REPORT_JOB_INFO_CMD (orte_daemon_cmd_flag_t) 13 -#define ORTE_DAEMON_REPORT_NODE_INFO_CMD (orte_daemon_cmd_flag_t) 14 -#define 
ORTE_DAEMON_REPORT_PROC_INFO_CMD (orte_daemon_cmd_flag_t) 15 -#define ORTE_DAEMON_ATTACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 16 -#define ORTE_DAEMON_ATTACH_STDERR_CMD (orte_daemon_cmd_flag_t) 17 -#define ORTE_DAEMON_DETACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 18 -#define ORTE_DAEMON_DETACH_STDERR_CMD (orte_daemon_cmd_flag_t) 19 -#define ORTE_DAEMON_SPAWN_JOB_CMD (orte_daemon_cmd_flag_t) 20 -#define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 21 -#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 22 +#define ORTE_DAEMON_REPORT_JOB_INFO_CMD (orte_daemon_cmd_flag_t) 14 +#define ORTE_DAEMON_REPORT_NODE_INFO_CMD (orte_daemon_cmd_flag_t) 15 +#define ORTE_DAEMON_REPORT_PROC_INFO_CMD (orte_daemon_cmd_flag_t) 16 +#define ORTE_DAEMON_ATTACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 17 +#define ORTE_DAEMON_ATTACH_STDERR_CMD (orte_daemon_cmd_flag_t) 18 +#define ORTE_DAEMON_DETACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 19 +#define ORTE_DAEMON_DETACH_STDERR_CMD (orte_daemon_cmd_flag_t) 20 +#define ORTE_DAEMON_SPAWN_JOB_CMD (orte_daemon_cmd_flag_t) 21 +#define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 22 +#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 23 /* collective-based cmds */ -#define ORTE_DAEMON_COLL_CMD (orte_daemon_cmd_flag_t) 23 +#define ORTE_DAEMON_COLL_CMD (orte_daemon_cmd_flag_t) 24 END_C_DECLS diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index 784a72a318..e66867e267 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -563,10 +563,6 @@ static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer, int sd) peer->peer_state = MCA_OOB_TCP_CONNECTED; peer->peer_retries = 0; - /* Since we have a direct connection established to this peer, use - the connection as a direct route between peers */ - orte_routed.update_route(&peer->peer_name, &peer->peer_name); - if(opal_list_get_size(&peer->peer_send_queue) > 0) { if(NULL == peer->peer_send_msg) { peer->peer_send_msg = 
(mca_oob_tcp_msg_t*) diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index 9447801c16..b0e73229eb 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -442,7 +442,7 @@ static int plm_alps_terminate_orteds(void) orte_wait_cb_cancel(alps_pid); /* tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/base/plm_base_heartbeat.c b/orte/mca/plm/base/plm_base_heartbeat.c index 75584d4039..01667f5740 100644 --- a/orte/mca/plm/base/plm_base_heartbeat.c +++ b/orte/mca/plm/base/plm_base_heartbeat.c @@ -89,7 +89,7 @@ static void check_heartbeat(int fd, short dummy, void *arg) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we are aborting or shutting down, ignore this */ - if (orte_abnormal_term_ordered || orte_shutdown_in_progress) { + if (orte_abnormal_term_ordered || 0 == orte_heartbeat_rate) { return; } diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 3ba2b876b3..c59b5802b8 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -58,6 +58,7 @@ static int orte_plm_base_report_launched(orte_jobid_t job); int orte_plm_base_setup_job(orte_job_t *jdata) { + orte_job_t *jdatorted; int rc; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, @@ -87,27 +88,28 @@ int orte_plm_base_setup_job(orte_job_t *jdata) opal_byte_object_t bo; int i; orte_nid_t **nodes; + opal_pointer_array_t dummy; /* construct a nodemap */ if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) { ORTE_ERROR_LOG(rc); return rc; } - /* construct the daemon map, if required - the decode function - * knows what to do - */ - if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo, &orte_daemonmap))) { + OBJ_CONSTRUCT(&dummy, opal_pointer_array_t); + if (ORTE_SUCCESS != 
(rc = orte_util_decode_nodemap(&bo, &dummy))) { ORTE_ERROR_LOG(rc); return rc; } /* print-out the map */ - nodes = (orte_nid_t**)orte_daemonmap.addr; - for (i=0; i < orte_daemonmap.size; i++) { + nodes = (orte_nid_t**)dummy.addr; + for (i=0; i < dummy.size; i++) { if (NULL != nodes[i]) { fprintf(stderr, "NIDMAP: name %s daemon %s arch %0x\n", nodes[i]->name, ORTE_VPID_PRINT(nodes[i]->daemon), nodes[i]->arch); + OBJ_RELEASE(nodes[i]); } } + OBJ_DESTRUCT(&dummy); } #endif @@ -131,6 +133,26 @@ int orte_plm_base_setup_job(orte_job_t *jdata) exit(ORTE_ERROR_DEFAULT_EXIT_CODE); } + /* get the orted job data object */ + if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + if (orte_process_info.num_procs != jdatorted->num_procs) { + /* more daemons are being launched - update the routing tree to + * ensure that the HNP knows how to route messages via + * the daemon routing tree - this needs to be done + * here to avoid potential race conditions where the HNP + * hasn't unpacked its launch message prior to being + * asked to communicate. + */ + orte_process_info.num_procs = jdatorted->num_procs; + if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /*** RHC: USER REQUEST TO TIE-OFF STDXXX TO /DEV/NULL *** WILL BE SENT IN LAUNCH MESSAGE AS PART OF CONTROLS FIELD. 
*** SO IF USER WANTS NO IO BEING SENT AROUND, THE ORTEDS @@ -356,13 +378,6 @@ static void process_orted_launch_report(int fd, short event, void *data) pdatorted[mev->sender.vpid]->rml_uri = strdup(rml_uri); free(rml_uri); - /* set the route to be direct */ - if (ORTE_SUCCESS != (rc = orte_routed.update_route(&mev->sender, &mev->sender))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - /* get the remote arch */ idx = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &arch, &idx, OPAL_INT32))) { @@ -464,17 +479,6 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons) "%s plm:base:daemon_callback completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* all done launching - update the num_procs in my local structure if required - * so that any subsequent communications are correctly routed - */ - if (orte_process_info.num_procs != jdatorted->num_procs) { - orte_process_info.num_procs = jdatorted->num_procs; - /* update the routing tree */ - if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) { - ORTE_ERROR_LOG(rc); - } - } - /* if a tree-launch was underway, clear out the cmd */ if (NULL != orte_tree_launch_cmd) { OBJ_RELEASE(orte_tree_launch_cmd); @@ -505,7 +509,6 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data) pid_t pid; orte_job_t *jdata; orte_proc_t **procs; - orte_process_name_t proc; int rc; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, @@ -540,9 +543,6 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data) } procs = (orte_proc_t**)(jdata->procs->addr); - /* setup the process name */ - proc.jobid = jobid; - /* the daemon will report the vpid, state, and pid of each * process it launches - we need the pid in particular so * that any debuggers can attach to the process @@ -575,23 +575,6 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data) goto CLEANUP; } - /* it is possible for a race condition to exist when the HNP does not have - * local procs 
whereby the HNP will need to communicate to a remote - * proc before it decodes the launch message itself and sets all the routes. - * This has been seen in cases where no local procs are launched and - * a debugger needs to attach to the job. - * To support that situation, go ahead and update the route here - */ - proc.vpid = vpid; - /* if the sender is me, the route is direct to avoid infinite loops. We - * know the jobid is the same since the sender was another daemon - */ - if (mev->sender.vpid == ORTE_PROC_MY_NAME->vpid) { - orte_routed.update_route(&proc, &proc); - } else { - orte_routed.update_route(&proc, &mev->sender); - } - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:app_report_launched for proc %s from daemon %s: pid %lu state %0x exit %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), diff --git a/orte/mca/plm/base/plm_base_orted_cmds.c b/orte/mca/plm/base/plm_base_orted_cmds.c index d97fdd74df..8edc01adba 100644 --- a/orte/mca/plm/base/plm_base_orted_cmds.c +++ b/orte/mca/plm/base/plm_base_orted_cmds.c @@ -87,11 +87,10 @@ static void send_callback(int status, } } -int orte_plm_base_orted_exit(void) +int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command) { int rc; opal_buffer_t cmd; - orte_daemon_cmd_flag_t command = ORTE_DAEMON_EXIT_CMD; orte_job_t *daemons; orte_proc_t **procs; @@ -99,8 +98,8 @@ int orte_plm_base_orted_exit(void) "%s plm:base:orted_cmd sending orted_exit commands", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* flag that a shutdown is in progress so all heartbeats stop */ - orte_shutdown_in_progress = true; + /* stop all heartbeats */ + orte_heartbeat_rate = 0; OBJ_CONSTRUCT(&cmd, opal_buffer_t); @@ -139,9 +138,11 @@ int orte_plm_base_orted_exit(void) "%s plm:base:orted_cmd:orted_exit abnormal term ordered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* be sure I get the command */ - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmd, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - + /* turn off message routing - no way to guarantee 
that + * the route still exists + */ + orte_routing_is_enabled = false; + /* now send the command one daemon at a time using a non-blocking * send - let the callback function keep track of how many * complete - it will delete the event if they all do. @@ -195,14 +196,19 @@ int orte_plm_base_orted_exit(void) ev = NULL; } + /* be sure I get the command */ + ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmd, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); + /* if all the sends didn't go, or we couldn't send to * all daemons, then report that */ if (num_reported < num_being_sent || num_being_sent < (daemons->num_procs-1)) { + OBJ_DESTRUCT(&cmd); return ORTE_ERR_SILENT; } /* if all sends went out, return success */ + OBJ_DESTRUCT(&cmd); return ORTE_SUCCESS; } diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index 5df5ae7e9f..582f82f084 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -33,6 +33,7 @@ #include "opal/dss/dss_types.h" #include "orte/mca/plm/plm_types.h" #include "orte/mca/rml/rml_types.h" +#include "orte/mca/odls/odls_types.h" #include "orte/runtime/orte_globals.h" @@ -86,7 +87,7 @@ ORTE_DECLSPEC void orte_plm_base_start_heart(void); /** * Utilities for plm components that use proxy daemons */ -ORTE_DECLSPEC int orte_plm_base_orted_exit(void); +ORTE_DECLSPEC int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command); ORTE_DECLSPEC int orte_plm_base_orted_kill_local_procs(orte_jobid_t job); ORTE_DECLSPEC int orte_plm_base_orted_signal_local_procs(orte_jobid_t job, int32_t signal); diff --git a/orte/mca/plm/ccp/plm_ccp_module.c b/orte/mca/plm/ccp/plm_ccp_module.c index 8946ce8c1e..ef0c19d512 100644 --- a/orte/mca/plm/ccp/plm_ccp_module.c +++ b/orte/mca/plm/ccp/plm_ccp_module.c @@ -614,7 +614,7 @@ int plm_ccp_terminate_orteds() int rc; /* now tell them to die! 
*/ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c index 21975b8835..10cd3d7b09 100644 --- a/orte/mca/plm/lsf/plm_lsf_module.c +++ b/orte/mca/plm/lsf/plm_lsf_module.c @@ -397,7 +397,7 @@ static int plm_lsf_terminate_orteds(void) int rc; /* tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/process/plm_process_module.c b/orte/mca/plm/process/plm_process_module.c index 7c9c059aca..724e686a8e 100644 --- a/orte/mca/plm/process/plm_process_module.c +++ b/orte/mca/plm/process/plm_process_module.c @@ -889,7 +889,7 @@ int orte_plm_process_terminate_orteds(void) int rc; /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 854c1798cb..2b8f1a445a 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -79,6 +79,7 @@ #include "orte/util/nidmap.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/ess/ess.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps.h" #include "orte/mca/routed/routed.h" @@ -767,28 +768,6 @@ static void ssh_child(int argc, char **argv, static opal_buffer_t collected_uris; -static int construct_daemonmap(opal_buffer_t *data) -{ - opal_byte_object_t *bo; - orte_std_cntr_t cnt; - int rc; - - /* extract the byte object holding the daemonmap */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* unpack the nodemap - this will free 
the bytes in bo */ - if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo, &orte_daemonmap))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - - /* * launch a set of daemons from a remote daemon */ @@ -796,16 +775,16 @@ static int remote_spawn(opal_buffer_t *launch) { opal_list_item_t *item; orte_vpid_t vpid; - orte_nid_t **nodes; int node_name_index1; int proc_vpid_index; char **argv = NULL; - char *prefix; + char *prefix, *hostname; int argc; int rc; bool failed_launch = true; pid_t pid; orte_std_cntr_t n; + opal_byte_object_t *bo; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: remote spawn called", @@ -818,15 +797,17 @@ static int remote_spawn(opal_buffer_t *launch) goto cleanup; } - /* construct the daemonmap, if required - the decode function - * will know what to do - */ - if (ORTE_SUCCESS != (rc = construct_daemonmap(launch))) { + /* extract the byte object holding the nidmap */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &bo, &n, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* update our nidmap - this will free data in the byte object */ + if (ORTE_SUCCESS != (rc = orte_ess.update_nidmap(bo))) { ORTE_ERROR_LOG(rc); goto cleanup; } - nodes = (orte_nid_t**)orte_daemonmap.addr; - vpid=ORTE_PROC_MY_NAME->vpid; /* clear out any previous child info */ while (NULL != (item = opal_list_remove_first(&mca_plm_rsh_component.children))) { @@ -863,15 +844,16 @@ static int remote_spawn(opal_buffer_t *launch) orte_namelist_t *child = (orte_namelist_t*)item; vpid = child->name.vpid; - if (NULL == nodes[vpid]) { - opal_output(0, "%s NULL in daemonmap at position %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)vpid); + /* get the host where this daemon resides */ + if (NULL == (hostname = orte_ess.proc_get_hostname(&child->name))) { + opal_output(0, "%s unable to get hostname for daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(vpid)); rc = ORTE_ERR_NOT_FOUND; goto cleanup; } 
free(argv[node_name_index1]); - argv[node_name_index1] = strdup(nodes[vpid]->name); + argv[node_name_index1] = strdup(hostname); /* fork a child to exec the rsh/ssh session */ pid = fork(); @@ -886,7 +868,7 @@ static int remote_spawn(opal_buffer_t *launch) OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: launching on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nodes[vpid]->name)); + hostname)); /* do the ssh launch - this will exit if it fails */ ssh_child(argc, argv, vpid, proc_vpid_index); @@ -1334,8 +1316,10 @@ int orte_plm_rsh_terminate_orteds(void) { int rc; - /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + /* now tell them to die - we need them to "phone home", though, + * so we can know that they have exited + */ + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index 5eecd9c900..ef875bed37 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -445,7 +445,7 @@ static int plm_slurm_terminate_orteds(void) orte_wait_cb_cancel(srun_pid); /* tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/submit/pls_submit_module.c b/orte/mca/plm/submit/pls_submit_module.c index fcbdf790c6..fda2b5a0a3 100644 --- a/orte/mca/plm/submit/pls_submit_module.c +++ b/orte/mca/plm/submit/pls_submit_module.c @@ -967,7 +967,7 @@ int orte_plm_submit_terminate_orteds(void) int rc; /* now tell them to die! 
*/ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/tm/plm_tm_module.c b/orte/mca/plm/tm/plm_tm_module.c index 244cd509c8..04bfbd1770 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -451,7 +451,7 @@ int plm_tm_terminate_orteds(void) int rc; /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/plm/tmd/plm_tmd_module.c b/orte/mca/plm/tmd/plm_tmd_module.c index 588593f7f9..a559d23256 100644 --- a/orte/mca/plm/tmd/plm_tmd_module.c +++ b/orte/mca/plm/tmd/plm_tmd_module.c @@ -456,7 +456,7 @@ int plm_tmd_terminate_orteds(void) int rc; /* now tell them to die! */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) { + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) { ORTE_ERROR_LOG(rc); } diff --git a/orte/mca/rml/base/rml_base_contact.c b/orte/mca/rml/base/rml_base_contact.c index 8fbf6487db..3c6c48505a 100644 --- a/orte/mca/rml/base/rml_base_contact.c +++ b/orte/mca/rml/base/rml_base_contact.c @@ -72,11 +72,12 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data) orte_vpid_t num_procs; char *rml_uri; orte_process_name_t name; + bool got_name = false; /* becomes true once a name has been parsed from a URI */ int rc; - orte_jobid_t jobid=ORTE_JOBID_INVALID; /* unpack the data for each entry */ num_procs = 0; + name.jobid = ORTE_JOBID_INVALID; cnt = 1; while (ORTE_SUCCESS == (rc = opal_dss.unpack(data, &rml_uri, &cnt, OPAL_STRING))) { @@ -92,24 +93,21 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data) free(rml_uri); return(rc); } - /* extract the proc's name */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) { - ORTE_ERROR_LOG(rc); - free(rml_uri); - return rc; + if (!got_name) { + /* we only get an
update from a single jobid - the command + * that creates these doesn't cross jobid boundaries - so + * record it here + */ + if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) { + ORTE_ERROR_LOG(rc); + free(rml_uri); + return rc; + } + got_name = true; } free(rml_uri); - /* update the route - in this case, always set it to direct routing - * since we were given the contact info - */ - orte_routed.update_route(&name, &name); } - /* we only get an update from a single jobid - the command - * that creates these doesn't cross jobid boundaries - so - * record it here - */ - jobid = name.jobid; /* track how many procs were in the message */ ++num_procs; } @@ -124,7 +122,7 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data) * changed since we were initially launched. Thus, update the num_procs * in our process_info struct so we can correctly route any messages */ - if (ORTE_PROC_MY_NAME->jobid == jobid && + if (ORTE_PROC_MY_NAME->jobid == name.jobid && orte_process_info.daemon && orte_process_info.num_procs < num_procs) { orte_process_info.num_procs = num_procs; diff --git a/orte/mca/routed/base/base.h b/orte/mca/routed/base/base.h index 3faf64cea0..99058f773b 100644 --- a/orte/mca/routed/base/base.h +++ b/orte/mca/routed/base/base.h @@ -14,6 +14,8 @@ #include "orte_config.h" #include "opal/mca/mca.h" +#include "opal/class/opal_bitmap.h" + #include "orte/mca/routed/routed.h" BEGIN_C_DECLS @@ -22,6 +24,15 @@ ORTE_DECLSPEC int orte_routed_base_open(void); #if !ORTE_DISABLE_FULL_SUPPORT +/* struct for tracking routing trees */ +typedef struct { + opal_list_item_t super; + orte_vpid_t vpid; + opal_bitmap_t relatives; +} orte_routed_tree_t; +ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_routed_tree_t); + + /* * Global functions for the ROUTED */ diff --git a/orte/mca/routed/base/routed_base_components.c b/orte/mca/routed/base/routed_base_components.c index d7e435bd21..f15eedb871 100644 --- a/orte/mca/routed/base/routed_base_components.c +++ 
b/orte/mca/routed/base/routed_base_components.c @@ -40,6 +40,18 @@ int orte_routed_base_open(void) #else +static void construct(orte_routed_tree_t *rt) +{ + rt->vpid = ORTE_VPID_INVALID; + OBJ_CONSTRUCT(&rt->relatives, opal_bitmap_t); +} +static void destruct(orte_routed_tree_t *rt) +{ + OBJ_DESTRUCT(&rt->relatives); +} +OBJ_CLASS_INSTANCE(orte_routed_tree_t, opal_list_item_t, + construct, destruct); + int orte_routed_base_output = -1; orte_routed_module_t orte_routed; opal_list_t orte_routed_base_components; diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index 826c0716d8..ca2f986f11 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -20,6 +20,7 @@ #include "opal/util/bit_ops.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls_types.h" @@ -74,8 +75,7 @@ orte_routed_module_t orte_routed_binomial_module = { }; /* local globals */ -static opal_hash_table_t peer_list; -static opal_hash_table_t vpid_wildcard_list; +static opal_hash_table_t jobfam_list; static orte_process_name_t wildcard_route; static opal_condition_t cond; static opal_mutex_t lock; @@ -88,11 +88,8 @@ static bool ack_recvd; static int init(void) { - OBJ_CONSTRUCT(&peer_list, opal_hash_table_t); - opal_hash_table_init(&peer_list, 128); - - OBJ_CONSTRUCT(&vpid_wildcard_list, opal_hash_table_t); - opal_hash_table_init(&vpid_wildcard_list, 128); + OBJ_CONSTRUCT(&jobfam_list, opal_hash_table_t); + opal_hash_table_init(&jobfam_list, 128); wildcard_route.jobid = ORTE_NAME_INVALID->jobid; wildcard_route.vpid = ORTE_NAME_INVALID->vpid; @@ -114,8 +111,6 @@ static int init(void) static int finalize(void) { int rc; - uint64_t key; - void * value, *node, *next_node; opal_list_item_t *item; /* if I am an application process, indicate that I am @@ -135,22 +130,7 @@ static int finalize(void) 
orte_routed_base_comm_stop(); } - /* don't destruct the routes until *after* we send the - * sync as the oob will be asking us how to route - * the message! - */ - rc = opal_hash_table_get_first_key_uint64(&peer_list, - &key, &value, &node); - while(OPAL_SUCCESS == rc) { - if(NULL != value) { - free(value); - } - rc = opal_hash_table_get_next_key_uint64(&peer_list, - &key, &value, node, &next_node); - node = next_node; - } - OBJ_DESTRUCT(&peer_list); - OBJ_DESTRUCT(&vpid_wildcard_list); + OBJ_DESTRUCT(&jobfam_list); /* destruct the global condition and lock */ OBJ_DESTRUCT(&cond); OBJ_DESTRUCT(&lock); @@ -208,13 +188,13 @@ static int delete_route(orte_process_name_t *proc) /* see if this proc is present - it will have a wildcard vpid, * so we have to look for it with that condition */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(proc->jobid), (void**)&route_copy); if (ORTE_SUCCESS == rc && NULL != route_copy) { /* proc is present - remove the data */ free(route_copy); - rc = opal_hash_table_remove_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_remove_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(proc->jobid)); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); @@ -226,55 +206,12 @@ static int delete_route(orte_process_name_t *proc) return ORTE_SUCCESS; } - /* THIS CAME FROM OUR OWN JOB FAMILY... 
*/ - - /* treat vpid wildcards separately so they go onto the correct list */ - if (proc->jobid != ORTE_JOBID_WILDCARD && - proc->vpid == ORTE_VPID_WILDCARD) { - /* see if this target is already present - it will have a wildcard vpid, - * so we have to look for it on that list - */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, - proc->jobid, - (void**)&route_copy); - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* proc is present - remove the data */ - free(route_copy); - rc = opal_hash_table_remove_value_uint32(&vpid_wildcard_list, proc->jobid); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - /* not already present - nothing to do */ - return ORTE_SUCCESS; - } - - /* check for an exact match */ - if (proc->jobid != ORTE_JOBID_WILDCARD && - proc->vpid != ORTE_VPID_WILDCARD) { - /* see if this route already exists in our table */ - rc = opal_hash_table_get_value_uint64(&peer_list, - orte_util_hash_name(proc), - (void**)&route_copy); - - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* proc is present - remove the data */ - free(route_copy); - rc = opal_hash_table_remove_value_uint64(&peer_list, orte_util_hash_name(proc)); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - /* not already present - nothing to do */ - return ORTE_SUCCESS; - } - - /* this must be a process that doesn't match any of the - * prior conditions - sorry! + /* THIS CAME FROM OUR OWN JOB FAMILY...there is nothing + * to do here. 
The routes will be redefined when we update + * the routing tree */ - return ORTE_ERR_NOT_SUPPORTED; + + return ORTE_SUCCESS; } static int update_route(orte_process_name_t *target, @@ -325,7 +262,7 @@ static int update_route(orte_process_name_t *target, /* see if this target is already present - it will have a wildcard vpid, * so we have to look for it with that condition */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), (void**)&route_copy); if (ORTE_SUCCESS == rc && NULL != route_copy) { @@ -333,7 +270,7 @@ static int update_route(orte_process_name_t *target, * in case it has changed */ *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_set_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), route_copy); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); @@ -344,7 +281,7 @@ static int update_route(orte_process_name_t *target, /* not there, so add the route FOR THE JOB FAMILY*/ route_copy = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_set_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), route_copy); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); @@ -354,81 +291,15 @@ static int update_route(orte_process_name_t *target, /* THIS CAME FROM OUR OWN JOB FAMILY... 
*/ - /* treat vpid wildcards separately so they go onto the correct list */ - if (target->jobid != ORTE_JOBID_WILDCARD && - target->vpid == ORTE_VPID_WILDCARD) { - /* see if this target is already present - it will have a wildcard vpid, - * so we have to look for it on that list - */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, - target->jobid, - (void**)&route_copy); - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* target already present - update the route info - * in case it has changed - */ - *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&vpid_wildcard_list, - target->jobid, route_copy); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* not already present, so let's add it */ - route_copy = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&vpid_wildcard_list, - target->jobid, route_copy); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } + opal_output(0, "%s CALL TO UPDATE ROUTE FOR OWN JOB FAMILY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* check for an exact match */ - if (target->jobid != ORTE_JOBID_WILDCARD && - target->vpid != ORTE_VPID_WILDCARD) { - /* see if this route already exists in our table */ - rc = opal_hash_table_get_value_uint64(&peer_list, - orte_util_hash_name(target), - (void**)&route_copy); - - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* target already present - update the route info - * in case it has changed - */ - *route_copy = *route; - rc = opal_hash_table_set_value_uint64(&peer_list, - orte_util_hash_name(target), route_copy); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* not present - add it to the table */ - route_copy = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - *route_copy = *route; - rc = opal_hash_table_set_value_uint64(&peer_list, - orte_util_hash_name(target), route_copy); - if (ORTE_SUCCESS != rc) { - 
ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* this must be a process that doesn't match any of the - * prior conditions - sorry! - */ return ORTE_ERR_NOT_SUPPORTED; } static orte_process_name_t get_route(orte_process_name_t *target) { - orte_process_name_t *ret; + orte_process_name_t *ret, daemon; int rc; if (target->jobid == ORTE_JOBID_INVALID || @@ -450,6 +321,8 @@ static orte_process_name_t get_route(orte_process_name_t *target) goto found; } + /****** HNP AND DAEMONS ONLY ******/ + /* IF THIS IS FOR A DIFFERENT JOB FAMILY... */ if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { /* if I am a daemon, route this via the HNP */ @@ -461,7 +334,7 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* if I am the HNP or a tool, then I stored a route to * this job family, so look it up */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), (void**)&ret); if (ORTE_SUCCESS == rc) { /* got a good result - return it */ @@ -474,24 +347,39 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* THIS CAME FROM OUR OWN JOB FAMILY... 
*/ - /* check exact matches */ - rc = opal_hash_table_get_value_uint64(&peer_list, - orte_util_hash_name(target), (void**)&ret); - if (ORTE_SUCCESS == rc) { - /* got a good result - return it */ + /* if this is going to the HNP, send direct */ + if (ORTE_PROC_MY_HNP->jobid == target->jobid && + ORTE_PROC_MY_HNP->vpid == target->vpid) { + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routing not enabled - going direct", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ret = target; goto found; } - /* didn't find an exact match - check to see if a route for this job was defined */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, - target->jobid, (void**)&ret); - if (ORTE_SUCCESS == rc) { - /* got a good result - return it */ + daemon.jobid = ORTE_PROC_MY_NAME->jobid; + /* find out what daemon hosts this proc */ + if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) { + /* we don't recognize this one - if we are the HNP, all + * we can do is abort + */ + if (orte_process_info.hnp) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + ret = ORTE_NAME_INVALID; + goto found; + } + /* if we are not the HNP, send it to the wildcard location */ + ret = &wildcard_route; goto found; } - /* default to wildcard route */ - ret = &wildcard_route; + /* if the daemon is me, then send direct to the target! */ + if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { + ret = target; + } else { + /* otherwise, we send it directly to that daemon */ + ret = &daemon; + } found: OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, @@ -645,20 +533,13 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) return rc; } - /* if ndat is NULL, then this is being called during init, - * so just seed the routing table with a path back to the HNP... 
- */ - if (ORTE_SUCCESS != (rc = update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP))) { - ORTE_ERROR_LOG(rc); - return rc; - } /* set the wildcard route for anybody whose name we don't recognize * to be the HNP */ wildcard_route.jobid = ORTE_PROC_MY_HNP->jobid; wildcard_route.vpid = ORTE_PROC_MY_HNP->vpid; - /* set our lifeline to the the HNP - we will abort if that connection is lost */ + /* set our lifeline to the HNP - we will abort if that connection is lost */ lifeline = ORTE_PROC_MY_HNP; /* daemons will send their contact info back to the HNP as @@ -923,33 +804,16 @@ static int route_lost(const orte_process_name_t *route) } - -/******* stub functions - to be implemented ******/ static bool route_is_defined(const orte_process_name_t *target) { + /* find out what daemon hosts this proc */ + if (ORTE_VPID_INVALID == orte_ess.proc_get_daemon((orte_process_name_t*)target)) { + return false; + } + return true; } -/*************************************/ -typedef struct { - opal_list_item_t super; - orte_vpid_t vpid; - opal_bitmap_t relatives; -} orte_routed_tree_t; - -static void construct(orte_routed_tree_t *rt) -{ - rt->vpid = ORTE_VPID_INVALID; - OBJ_CONSTRUCT(&rt->relatives, opal_bitmap_t); -} -static void destruct(orte_routed_tree_t *rt) -{ - OBJ_DESTRUCT(&rt->relatives); -} -OBJ_CLASS_INSTANCE(orte_routed_tree_t, opal_list_item_t, - construct, destruct); - - static int binomial_tree(int rank, int parent, int me, int num_procs, int *nchildren, opal_list_t *childrn, opal_bitmap_t *relatives) { @@ -1083,10 +947,47 @@ static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_t *children) opal_list_append(children, &nm->item); } } + /* return my parent's vpid */ return my_parent.vpid; } +static bool proc_is_below(orte_vpid_t root, orte_vpid_t target) +{ + opal_list_item_t *item; + orte_routed_tree_t *child; + + /* if I am anything other than a daemon or the HNP, this + * is a meaningless command as I am not allowed to route + */ + if (!orte_process_info.daemon 
&& !orte_process_info.hnp) { + return false; + } + + /* quick check: if root == target, then the answer is always true! */ + if (root == target) { + return true; + } + + /* check the list of children to see if either their vpid + * matches target, or the target bit is set in their bitmap + */ + + /* first find the specified child */ + for (item = opal_list_get_first(&my_children); + item != opal_list_get_end(&my_children); + item = opal_list_get_next(item)) { + child = (orte_routed_tree_t*)item; + if (child->vpid == root) { + /* now see if the target lies below this child */ + return opal_bitmap_is_set_bit(&child->relatives, target); + } + } + + /* only get here if we have no children or we didn't find anything */ + return false; +} + static int get_wireup_info(opal_buffer_t *buf) { int rc; @@ -1115,42 +1016,6 @@ static int get_wireup_info(opal_buffer_t *buf) return ORTE_SUCCESS; } -static bool proc_is_below(orte_vpid_t root, orte_vpid_t target) -{ - opal_list_item_t *item; - orte_routed_tree_t *child; - - /* if I am anything other than a daemon or the HNP, this - * is a meaningless command as I am not allowed to route - */ - if (!orte_process_info.daemon && !orte_process_info.hnp) { - return false; - } - - /* quick check: if root == target, then the answer is always true! 
*/ - if (root == target) { - return true; - } - - /* check the list of children to see if either their vpid - * matches target, or the target bit is set in their bitmap - */ - - /* first find the specified child */ - for (item = opal_list_get_first(&my_children); - item != opal_list_get_end(&my_children); - item = opal_list_get_next(item)) { - child = (orte_routed_tree_t*)item; - if (child->vpid == root) { - /* now see if the target lies below this child */ - return opal_bitmap_is_set_bit(&child->relatives, target); - } - } - - /* only get here if we have no children or we didn't find anything */ - return false; -} - #if OPAL_ENABLE_FT == 1 static int binomial_ft_event(int state) diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c deleted file mode 100644 index cbea363f20..0000000000 --- a/orte/mca/routed/direct/routed_direct.c +++ /dev/null @@ -1,988 +0,0 @@ -/* - * Copyright (c) 2007 Los Alamos National Security, LLC. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/class/opal_list.h" -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/threads/condition.h" -#include "opal/dss/dss.h" -#include "opal_stdint.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/mca/odls/odls_types.h" -#include "orte/mca/rml/rml.h" -#include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/runtime.h" -#include "orte/runtime/orte_wait.h" -#include "orte/util/show_help.h" - -#include "orte/mca/rml/base/rml_contact.h" - -#include "orte/mca/routed/base/base.h" -#include "routed_direct.h" - -/* Local static variables */ -static opal_condition_t cond; -static opal_mutex_t lock; -static opal_hash_table_t peer_list; -static opal_buffer_t *recv_buf=NULL; -static bool ack_recvd, msg_recvd; -static 
orte_process_name_t *lifeline=NULL; - - -/* API functions */ -static int init(void); -static int finalize(void); -static int delete_route(orte_process_name_t *proc); -static int update_route(orte_process_name_t *target, - orte_process_name_t *route); -static orte_process_name_t get_route(orte_process_name_t *target); -static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); -static int route_lost(const orte_process_name_t *route); -static bool route_is_defined(const orte_process_name_t *target); -static int update_routing_tree(void); -static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_t *children); -static bool proc_is_below(orte_vpid_t root, orte_vpid_t target); -static int get_wireup_info(opal_buffer_t *buf); -static int warmup_routes(void); - -#if OPAL_ENABLE_FT == 1 -static int direct_ft_event(int state); -#endif - -orte_routed_module_t orte_routed_direct_module = { - init, - finalize, - delete_route, - update_route, - get_route, - init_routes, - warmup_routes, - route_lost, - route_is_defined, - update_routing_tree, - get_routing_tree, - proc_is_below, - get_wireup_info, -#if OPAL_ENABLE_FT == 1 - direct_ft_event -#else - NULL -#endif -}; - -static int init(void) -{ - /* setup the global condition and lock */ - OBJ_CONSTRUCT(&cond, opal_condition_t); - OBJ_CONSTRUCT(&lock, opal_mutex_t); - - OBJ_CONSTRUCT(&peer_list, opal_hash_table_t); - opal_hash_table_init(&peer_list, 128); - - lifeline = NULL; - - return ORTE_SUCCESS; -} - -static int finalize(void) -{ - int rc; - uint32_t key; - void *value, *node, *next_node; - - /* if I am the HNP, I need to stop the comm recv */ - if (orte_process_info.hnp) { - orte_routed_base_comm_stop(); - } - - /* if I am an application process (but NOT a tool), indicate that I am - * truly finalizing prior to departure - */ - if (!orte_process_info.hnp && - !orte_process_info.daemon && - !orte_process_info.tool) { - if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(false))) { - ORTE_ERROR_LOG(rc); - 
return rc; - } - } - /* don't destruct the routes until *after* we send the - * sync as the oob will be asking us how to route - * the message! - */ - rc = opal_hash_table_get_first_key_uint32(&peer_list, - &key, &value, &node); - while(OPAL_SUCCESS == rc) { - if(NULL != value) { - free(value); - } - rc = opal_hash_table_get_next_key_uint32(&peer_list, - &key, &value, node, &next_node); - node = next_node; - } - OBJ_DESTRUCT(&peer_list); - - /* cleanup the global condition */ - OBJ_DESTRUCT(&cond); - OBJ_DESTRUCT(&lock); - - lifeline = NULL; - - return ORTE_SUCCESS; -} - - -static int delete_route(orte_process_name_t *proc) -{ - orte_process_name_t *route_copy; - int rc; - - if (proc->jobid == ORTE_JOBID_INVALID || - proc->vpid == ORTE_VPID_INVALID) { - return ORTE_ERR_BAD_PARAM; - } - - /* if this isn't from a different job family, then there is - * nothing for us to do as all routes are direct - nothing - * is in the routing table - */ - if (ORTE_JOB_FAMILY(proc->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { - return ORTE_SUCCESS; - } - - /* if I am -not- the HNP or a tool, then I will automatically route - * anything to this job family via my HNP - so nothing to do - * here since nothing is in my routing table - */ - if (!orte_process_info.hnp && !orte_process_info.tool) { - return ORTE_SUCCESS; - } - - /* must need to look it up */ - rc = opal_hash_table_get_value_uint32(&peer_list, - ORTE_JOB_FAMILY(proc->jobid), - (void**)&route_copy); - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* proc is present - remove the data */ - free(route_copy); - rc = opal_hash_table_remove_value_uint32(&peer_list, - ORTE_JOB_FAMILY(proc->jobid)); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* wasn't here - nothing to do */ - return ORTE_SUCCESS; -} - -static int update_route(orte_process_name_t *target, - orte_process_name_t *route) -{ - orte_process_name_t *route_copy; - int rc; - - if (target->jobid == ORTE_JOBID_INVALID || - 
target->vpid == ORTE_VPID_INVALID) { - return ORTE_ERR_BAD_PARAM; - } - - /* if this is from a different job family, then I need to - * track how to send messages to it - */ - if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { - - /* if I am -not- the HNP or a tool, then I will automatically route - * anything to this job family via my HNP - so nothing to do - * here, just return - */ - if (!orte_process_info.hnp && !orte_process_info.tool) { - return ORTE_SUCCESS; - } - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct_update: diff job family routing job %s --> %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(target->jobid), - ORTE_NAME_PRINT(route))); - - /* see if this target is already present */ - rc = opal_hash_table_get_value_uint32(&peer_list, - ORTE_JOB_FAMILY(target->jobid), - (void**)&route_copy); - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* target already present - update the route info - * in case it has changed - */ - *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&peer_list, - ORTE_JOB_FAMILY(target->jobid), route_copy); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* not there, so add the route FOR THE JOB FAMILY*/ - route_copy = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&peer_list, - ORTE_JOB_FAMILY(target->jobid), route_copy); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* if it came from our own job family, there is nothing to do */ - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct_update: %s --> %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(target), - ORTE_NAME_PRINT(route))); - - return ORTE_SUCCESS; -} - - -static orte_process_name_t get_route(orte_process_name_t *target) -{ - orte_process_name_t *ret; - int rc; - - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == 
ORTE_VPID_INVALID) { - ret = ORTE_NAME_INVALID; - goto found; - } - - /* if it is me, then the route is just direct */ - if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) { - ret = target; - goto found; - } - - if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { - /* if I am -not- the HNP or a tool, route this via the HNP */ - if (!orte_process_info.hnp && !orte_process_info.tool) { - ret = ORTE_PROC_MY_HNP; - goto found; - } - - /* if I am the HNP or a tool, then I stored a route to this proc, so look it up */ - rc = opal_hash_table_get_value_uint32(&peer_list, - ORTE_JOB_FAMILY(target->jobid), (void**)&ret); - if (ORTE_SUCCESS == rc) { - /* got a good result - return it */ - goto found; - } - /* not found - so we have no route */ - ret = ORTE_NAME_INVALID; - goto found; - } else { - /* if it is our own job family, just go direct */ - ret = target; - } - -found: - OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, - "%s routed_direct_get(%s) --> %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(target), - ORTE_NAME_PRINT(ret))); - return *ret; -} - -static int process_callback(orte_jobid_t job, opal_buffer_t *buffer) -{ - orte_proc_t **procs; - orte_job_t *jdata; - orte_process_name_t name; - opal_buffer_t buf; - orte_std_cntr_t cnt; - char *rml_uri; - int rc; - - /* lookup the job object */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - procs = (orte_proc_t**)jdata->procs->addr; - - /* unpack the data for each entry */ - cnt = 1; - while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) { - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, - "%s routed_direct:callback got uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == rml_uri) ? 
"NULL" : rml_uri)); - - if (rml_uri == NULL) continue; - - /* set the contact info into the hash table */ - if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) { - ORTE_ERROR_LOG(rc); - free(rml_uri); - continue; - } - /* extract the proc's name */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) { - ORTE_ERROR_LOG(rc); - free(rml_uri); - continue; - } - /* the procs are stored in vpid order, so update the record */ - procs[name.vpid]->rml_uri = strdup(rml_uri); - free(rml_uri); - - /* update the proc state */ - if (procs[name.vpid]->state < ORTE_PROC_STATE_RUNNING) { - procs[name.vpid]->state = ORTE_PROC_STATE_RUNNING; - } - - ++jdata->num_reported; - cnt = 1; - } - if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* if all procs have reported, then send out the info to complete the exchange */ - if (jdata->num_reported == jdata->num_procs) { - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct:callback trigger fired on job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); - - /* update the job state */ - if (jdata->state < ORTE_JOB_STATE_RUNNING) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - - /* now send to the procs so they release from their barrier */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* pack the RML contact info for each proc */ - if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(jdata->jobid, &buf))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - /* send it to all procs via xcast */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jdata->jobid, &buf, ORTE_RML_TAG_INIT_ROUTES))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - } - - return ORTE_SUCCESS; -} - -/* HANDLE ACK MESSAGES FROM AN HNP */ -static void release_ack(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - ack_recvd = true; - OBJ_RELEASE(mev); -} - 
-static void recv_ack(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack); -} - -/* HANDLE PEER CONTACT INFO MESSAGE */ -static void process_msg(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - - /* copy the data to the recv buffer */ - opal_dss.copy_payload(recv_buf, mev->buffer); - - /* acknowledge receipt */ - msg_recvd = true; - - /* cleanup event */ - OBJ_RELEASE(mev); -} - -static void recv_msg(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); -} - - -static int init_routes(orte_jobid_t job, opal_buffer_t *ndata) -{ - /* the direct module just sends direct to everyone, so it requires - * that the RML get loaded with contact info from all of our peers. 
- * We also look for and provide contact info for our local daemon - * so we can use it if needed - */ - - /* if I am a tool, then I stand alone - there is nothing to do */ - if (orte_process_info.tool) { - return ORTE_SUCCESS; - } - - /* if I am a daemon... */ - if (orte_process_info.daemon ) { - int rc; - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct: init routes for daemon job %s\n\thnp_uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri)); - if (NULL == ndata) { - /* indicates this is being called during orte_init. - * since the daemons in the direct component don't route messages, - * there is nothing for them to do - daemons will send their - * contact info as part of the message confirming they are ready - * to go. Just get the HNP's name for possible later use - */ - if (NULL == orte_process_info.my_hnp_uri) { - /* fatal error */ - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - /* set the contact info into the hash table */ - if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_hnp_uri))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - - /* extract the hnp name and store it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* we don't have to update the route as the direct component is - * always "direct" - */ - - /* set our lifeline as the HNP - we will abort if that connection fails */ - lifeline = ORTE_PROC_MY_HNP; - - return ORTE_SUCCESS; - } - - /* if ndata isn't NULL, then we are getting this as part of an - * update due to a dynamic spawn of more daemons. We need to - * pass the buffer on to the rml for processing so the contact - * info can be added to our hash tables - thus allowing us to - * execute routing xcasts, for example. 
- */ - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndata))) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* if I am the HNP... */ - if (orte_process_info.hnp) { - /* if this is for my own job, we handle - * updates of daemon contact info separately, so this - * shouldn't get called during daemon startup. This situation - * would occur, though, when we are doing orte_init within the HNP - * itself, but we store our data during orte_init anyway - * However, for the direct component, I do have to make myself - * available for processing incoming rml contact info messages - * from the procs - so setup that receive here - */ - int rc; - - if (ORTE_PROC_MY_NAME->jobid == job) { - if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) { - ORTE_ERROR_LOG(rc); - return rc; - } - } else { - /* if its from some other job, then this is info I need - * to process - */ - if (ORTE_SUCCESS != (rc = process_callback(job, ndata))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - - /* I do not have a lifeline */ - lifeline = NULL; - return ORTE_SUCCESS; - } - - - { /* MUST BE A PROC */ - /* if ndata != NULL, then this is being invoked by the proc to - * init a route to a specified process. For example, in OMPI's - * publish/subscribe procedures, the DPM framework looks for an - * mca param containing the global ompi-server's uri. 
This info - * will come here so the proc can setup a route to - * the server - */ - if (NULL != ndata) { - int rc; - orte_std_cntr_t cnt; - orte_rml_cmd_flag_t command; - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct: init routes to jobid %s w/non-NULL data", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - /* if this is for my job family, then we update my contact info - * so I can talk directly to my fellow family members - */ - if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(job)) { - /* extract the RML command from the buffer and discard it - this - * command is in there for compatibility with other routed - * components but is not needed here - */ - cnt=1; - opal_dss.unpack(ndata, &command, &cnt, ORTE_RML_CMD); - - /* Set the contact info in the RML - this won't actually establish - * the connection, but just tells the RML how to reach the - * target proc(s) - */ - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndata))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return ORTE_SUCCESS; - } - - /* if this is for a different job family, then we route via our HNP - * to minimize connection counts to entities such as ompi-server, so - * start by sending the contact info to the HNP for update - */ - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct_init_routes: diff job family - sending update to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(ORTE_PROC_MY_HNP))); - - if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, ndata, - ORTE_RML_TAG_RML_INFO_UPDATE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* wait right here until the HNP acks the update to ensure that - * any subsequent messaging can succeed - */ - ack_recvd = false; - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK, - ORTE_RML_NON_PERSISTENT, recv_ack, NULL); - - ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1); - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s 
routed_direct_init_routes: ack recvd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* our get_route function automatically routes all messages for - * other job families via the HNP, so nothing more to do here - */ - return ORTE_SUCCESS; - } - - { - /* if ndata=NULL, then we are being called during orte_init. In this - * case, we need to setup a few critical pieces of info - */ - int rc; - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), - (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri, - (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri)); - - /* get the local daemon's uri - this may not always be provided, so - * don't error if it isn't there - */ - if (NULL != orte_process_info.my_daemon_uri) { - /* Set the contact info in the RML and establish - * the connection so the daemon knows how to reach us. 
- * We have to do this as any non-direct xcast will come - * via our local daemon - and if it doesn't know how to - * reach us, then it will error out the message - */ - /* set the contact info into the hash table */ - if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_daemon_uri))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - - /* extract the daemon's name and store it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, - ORTE_PROC_MY_DAEMON, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* we don't have to update the route as the direct component is - * always "direct" - */ - } - - /* setup the hnp - this must always be provided, so - * error if it isn't there as we won't know how to complete - * the wireup for the direct component - */ - if (NULL == orte_process_info.my_hnp_uri) { - /* fatal error */ - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, - "%s routed_direct_init: set hnp contact info and name", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* set the contact info into the hash table */ - if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_hnp_uri))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - - /* extract the hnp name and store it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, - ORTE_PROC_MY_HNP, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* declare the HNP as our "lifeline" - this means that we will automatically - * abort if we lose that connection - */ - lifeline = ORTE_PROC_MY_HNP; - - /* we don't have to update the route as the direct component is - * always "direct" - */ - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, - "%s routed_direct_init: register sync", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* register myself to require that I finalize before exiting - * This also will cause the local orted to send our contact - * into to the HNP 
once all my local peers have registered - */ - if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(true))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, - "%s routed_direct_init: wait to recv contact info for peers", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* now setup a blocking receive and wait right here until we get - * the contact info for all of our peers - */ - if (NULL != recv_buf) { - OBJ_RELEASE(recv_buf); - } - recv_buf = OBJ_NEW(opal_buffer_t); - msg_recvd = false; - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_INIT_ROUTES, - ORTE_RML_NON_PERSISTENT, recv_msg, NULL); - - ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1); - - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, - "%s routed_direct_init: peer contact info recvd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* process it */ - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(recv_buf))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_RELEASE(recv_buf); - - return ORTE_SUCCESS; - } - } -} - -static int warmup_routes(void) -{ - struct iovec inmsg[1], outmsg[1]; - int i, world_size, world_rank, ret; - orte_process_name_t proc; - - /* if I am a daemon, tool, or HNP, do nothing */ - if (orte_process_info.daemon || - orte_process_info.hnp || - orte_process_info.tool) { - return ORTE_SUCCESS; - } - - return ORTE_SUCCESS; - - /* I am an application process. 
In this case, we - * do a semi-intelligent messaging scheme to - * force the sockets to be opened - */ - world_size = orte_process_info.num_procs; - world_rank = ORTE_PROC_MY_NAME->vpid; - proc.jobid = ORTE_PROC_MY_NAME->jobid; - for (i = 1 ; i <= world_size / 2 ; i ++) { - proc.vpid = (world_rank + i) % world_size; - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct_warmup: sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc))); - - /* sends do not wait for a match */ - ret = orte_rml.send(&proc, - outmsg, - 1, - ORTE_RML_TAG_WIREUP, - 0); - if (ret < 0) return ret; - - proc.vpid = (world_rank - i + world_size) % world_size; - - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, - "%s routed_direct_warmup: recv from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc))); - - ret = orte_rml.recv(&proc, - inmsg, - 1, - ORTE_RML_TAG_WIREUP, - 0); - if (ret < 0) return ret; - } - - return ORTE_SUCCESS; -} - -static int route_lost(const orte_process_name_t *route) -{ - /* if we lose the connection to the lifeline and we are NOT already, - * in finalize, tell the OOB to abort. - * NOTE: we cannot call abort from here as the OOB needs to first - * release a thread-lock - otherwise, we will hang!! 
- */ - if (!orte_finalizing && - NULL != lifeline && - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) { - opal_output(0, "%s routed:direct: Connection to lifeline %s lost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(lifeline)); - return ORTE_ERR_FATAL; - } - - /* we don't care about this one, so return success */ - return ORTE_SUCCESS; -} - - -static bool route_is_defined(const orte_process_name_t *target) -{ - orte_process_name_t *ret; - int rc; - - if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { - /* we always have a route to our own job */ - return true; - } - - /* if the job family is different, check the peer list to see if a route - * has been defined - */ - rc = opal_hash_table_get_value_uint32(&peer_list, - ORTE_JOB_FAMILY(target->jobid), - (void**)&ret); - if (ORTE_SUCCESS == rc && NULL != ret) { - /* target present - we have a route */ - return true; - } - - /* if we get here, then we don't have a route */ - return false; -} - -/*************************************/ - - -static int update_routing_tree(void) -{ - /* if I am anything other than a daemon or the HNP, this - * is a meaningless command as I am not allowed to route - */ - if (!orte_process_info.daemon && !orte_process_info.hnp) { - return ORTE_ERR_NOT_SUPPORTED; - } - - /* nothing to do here as the routing tree is fixed */ - return ORTE_SUCCESS; -} - -static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_t *children) -{ - orte_namelist_t *nm; - orte_job_t *jdata; - orte_vpid_t i, start; - - /* if I am anything other than a daemon or the HNP, this - * is a meaningless command as I am not allowed to route - */ - if (!orte_process_info.daemon && !orte_process_info.hnp) { - return ORTE_VPID_INVALID; - } - - /* if I am a daemon, I have no children and my - * parent is the HNP - */ - if (orte_process_info.daemon) { - return ORTE_PROC_MY_HNP->vpid; - } - - /* if we are the HNP, then the direct routing tree - * 
consists of every process in the job - indicate that by - * adding a proc name of the jobid and a wildcard vpid. The - * HNP is capable of looking up the vpid range for this job - */ - if (NULL != children) { - if (NULL == (jdata = orte_get_job_data_object(job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_VPID_INVALID; - } - /* if this is to the daemons, don't include myself */ - if (ORTE_PROC_MY_NAME->jobid == job) { - start = 1; - } else { - start = 0; - } - for (i=start; i < jdata->num_procs; i++) { - nm = OBJ_NEW(orte_namelist_t); - nm->name.jobid = job; - nm->name.vpid = i; - opal_list_append(children, &nm->item); - } - } - - /* the parent of the HNP is invalid */ - return ORTE_VPID_INVALID; -} - -static bool proc_is_below(orte_vpid_t root, orte_vpid_t target) -{ - /* this is a flat routing tree - if I am not the HNP, then - * nobody is below - */ - if (!orte_process_info.hnp) { - return false; - } - /* if I am the HNP, then the route is through the root - * if the root is the target - */ - if (root == target) { - return true; - } - /* otherwise, not */ - return false; -} - -static int get_wireup_info(opal_buffer_t *buf) -{ - int rc; - - /* if I am anything other than the HNP, this - * is a meaningless command as I cannot get - * the requested info - */ - if (!orte_process_info.hnp) { - return ORTE_ERR_NOT_SUPPORTED; - } - - if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buf); - return rc; - } - - return ORTE_SUCCESS; -} - -#if OPAL_ENABLE_FT == 1 -static int direct_ft_event(int state) -{ - int ret, exit_status = ORTE_SUCCESS; - - /******** Checkpoint Prep ********/ - if(OPAL_CRS_CHECKPOINT == state) { - } - /******** Continue Recovery ********/ - else if (OPAL_CRS_CONTINUE == state ) { - } - /******** Restart Recovery ********/ - else if (OPAL_CRS_RESTART == state ) { - /* - * Re-exchange the routes - */ - if (ORTE_SUCCESS != (ret = 
orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { - exit_status = ret; - goto cleanup; - } - } - else if (OPAL_CRS_TERM == state ) { - /* Nothing */ - } - else { - /* Error state = Nothing */ - } - - cleanup: - return exit_status; -} -#endif diff --git a/orte/mca/routed/direct/routed_direct_component.c b/orte/mca/routed/direct/routed_direct_component.c deleted file mode 100644 index 56869d1ca3..0000000000 --- a/orte/mca/routed/direct/routed_direct_component.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2007 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2004-2008 The Trustees of Indiana University. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - - -#include "routed_direct.h" - -static int orte_routed_direct_component_query(mca_base_module_t **module, int *priority); - - -/** - * component definition - */ -orte_routed_component_t mca_routed_direct_component = { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - - { - ORTE_ROUTED_BASE_VERSION_2_0_0, - - "direct", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - NULL, - NULL, - orte_routed_direct_component_query - }, - { - /* This component can be checkpointed */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } -}; - -static int orte_routed_direct_component_query(mca_base_module_t **module, int *priority) -{ - *priority = 10; - *module = (mca_base_module_t *) &orte_routed_direct_module; - return ORTE_SUCCESS; -} diff --git a/orte/mca/routed/linear/routed_linear.c b/orte/mca/routed/linear/routed_linear.c index dca132e4b3..ad5e4a1021 100644 --- a/orte/mca/routed/linear/routed_linear.c +++ b/orte/mca/routed/linear/routed_linear.c @@ -18,6 +18,7 @@ #include 
"opal/class/opal_hash_table.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls_types.h" @@ -72,8 +73,7 @@ orte_routed_module_t orte_routed_linear_module = { }; /* local globals */ -static opal_hash_table_t peer_list; -static opal_hash_table_t vpid_wildcard_list; +static opal_hash_table_t jobfam_list; static orte_process_name_t wildcard_route; static opal_condition_t cond; static opal_mutex_t lock; @@ -84,11 +84,8 @@ static bool ack_recvd; static int init(void) { - OBJ_CONSTRUCT(&peer_list, opal_hash_table_t); - opal_hash_table_init(&peer_list, 128); - - OBJ_CONSTRUCT(&vpid_wildcard_list, opal_hash_table_t); - opal_hash_table_init(&vpid_wildcard_list, 128); + OBJ_CONSTRUCT(&jobfam_list, opal_hash_table_t); + opal_hash_table_init(&jobfam_list, 128); wildcard_route.jobid = ORTE_NAME_INVALID->jobid; wildcard_route.vpid = ORTE_NAME_INVALID->vpid; @@ -105,8 +102,6 @@ static int init(void) static int finalize(void) { int rc; - uint64_t key; - void * value, *node, *next_node; /* if I am an application process, indicate that I am * truly finalizing prior to departure @@ -125,22 +120,7 @@ static int finalize(void) orte_routed_base_comm_stop(); } - /* don't destruct the routes until *after* we send the - * sync as the oob will be asking us how to route - * the message! 
- */ - rc = opal_hash_table_get_first_key_uint64(&peer_list, - &key, &value, &node); - while(OPAL_SUCCESS == rc) { - if(NULL != value) { - free(value); - } - rc = opal_hash_table_get_next_key_uint64(&peer_list, - &key, &value, node, &next_node); - node = next_node; - } - OBJ_DESTRUCT(&peer_list); - OBJ_DESTRUCT(&vpid_wildcard_list); + OBJ_DESTRUCT(&jobfam_list); /* destruct the global condition and lock */ OBJ_DESTRUCT(&cond); OBJ_DESTRUCT(&lock); @@ -191,13 +171,13 @@ static int delete_route(orte_process_name_t *proc) /* see if this proc is present - it will have a wildcard vpid, * so we have to look for it with that condition */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(proc->jobid), (void**)&route_copy); if (ORTE_SUCCESS == rc && NULL != route_copy) { /* proc is present - remove the data */ free(route_copy); - rc = opal_hash_table_remove_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_remove_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(proc->jobid)); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); @@ -209,55 +189,12 @@ static int delete_route(orte_process_name_t *proc) return ORTE_SUCCESS; } - /* THIS CAME FROM OUR OWN JOB FAMILY... 
*/ - - /* treat vpid wildcards separately so they go onto the correct list */ - if (proc->jobid != ORTE_JOBID_WILDCARD && - proc->vpid == ORTE_VPID_WILDCARD) { - /* see if this target is already present - it will have a wildcard vpid, - * so we have to look for it on that list - */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, - proc->jobid, - (void**)&route_copy); - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* proc is present - remove the data */ - free(route_copy); - rc = opal_hash_table_remove_value_uint32(&vpid_wildcard_list, proc->jobid); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - /* not already present - nothing to do */ - return ORTE_SUCCESS; - } - - /* check for an exact match */ - if (proc->jobid != ORTE_JOBID_WILDCARD && - proc->vpid != ORTE_VPID_WILDCARD) { - /* see if this route already exists in our table */ - rc = opal_hash_table_get_value_uint64(&peer_list, - orte_util_hash_name(proc), - (void**)&route_copy); - - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* proc is present - remove the data */ - free(route_copy); - rc = opal_hash_table_remove_value_uint64(&peer_list, orte_util_hash_name(proc)); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - /* not already present - nothing to do */ - return ORTE_SUCCESS; - } - - /* this must be a process that doesn't match any of the - * prior conditions - sorry! + /* THIS CAME FROM OUR OWN JOB FAMILY...there is nothing + * to do here. 
The routes will be redefined when we update + * the routing tree */ - return ORTE_ERR_NOT_SUPPORTED; + + return ORTE_SUCCESS; } static int update_route(orte_process_name_t *target, @@ -308,7 +245,7 @@ static int update_route(orte_process_name_t *target, /* see if this target is already present - it will have a wildcard vpid, * so we have to look for it with that condition */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), (void**)&route_copy); if (ORTE_SUCCESS == rc && NULL != route_copy) { @@ -316,7 +253,7 @@ static int update_route(orte_process_name_t *target, * in case it has changed */ *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_set_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), route_copy); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); @@ -327,7 +264,7 @@ static int update_route(orte_process_name_t *target, /* not there, so add the route FOR THE JOB FAMILY*/ route_copy = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_set_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), route_copy); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); @@ -337,81 +274,15 @@ static int update_route(orte_process_name_t *target, /* THIS CAME FROM OUR OWN JOB FAMILY... 
*/ - /* treat vpid wildcards separately so they go onto the correct list */ - if (target->jobid != ORTE_JOBID_WILDCARD && - target->vpid == ORTE_VPID_WILDCARD) { - /* see if this target is already present - it will have a wildcard vpid, - * so we have to look for it on that list - */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, - target->jobid, - (void**)&route_copy); - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* target already present - update the route info - * in case it has changed - */ - *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&vpid_wildcard_list, - target->jobid, route_copy); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* not already present, so let's add it */ - route_copy = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - *route_copy = *route; - rc = opal_hash_table_set_value_uint32(&vpid_wildcard_list, - target->jobid, route_copy); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } + opal_output(0, "%s CALL TO UPDATE ROUTE FOR OWN JOB FAMILY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* check for an exact match */ - if (target->jobid != ORTE_JOBID_WILDCARD && - target->vpid != ORTE_VPID_WILDCARD) { - /* see if this route already exists in our table */ - rc = opal_hash_table_get_value_uint64(&peer_list, - orte_util_hash_name(target), - (void**)&route_copy); - - if (ORTE_SUCCESS == rc && NULL != route_copy) { - /* target already present - update the route info - * in case it has changed - */ - *route_copy = *route; - rc = opal_hash_table_set_value_uint64(&peer_list, - orte_util_hash_name(target), route_copy); - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* not present - add it to the table */ - route_copy = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - *route_copy = *route; - rc = opal_hash_table_set_value_uint64(&peer_list, - orte_util_hash_name(target), route_copy); - if (ORTE_SUCCESS != rc) { - 
ORTE_ERROR_LOG(rc); - } - return rc; - } - - /* this must be a process that doesn't match any of the - * prior conditions - sorry! - */ return ORTE_ERR_NOT_SUPPORTED; } static orte_process_name_t get_route(orte_process_name_t *target) { - orte_process_name_t *ret; + orte_process_name_t *ret, daemon; int rc; /* if it is me, then the route is just direct */ @@ -427,6 +298,8 @@ static orte_process_name_t get_route(orte_process_name_t *target) goto found; } + /****** HNP AND DAEMONS ONLY ******/ + /* IF THIS IS FOR A DIFFERENT JOB FAMILY... */ if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { /* if I am a daemon, route this via the HNP */ @@ -438,7 +311,7 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* if I am the HNP or a tool, then I stored a route to * this job family, so look it up */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, + rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), (void**)&ret); if (ORTE_SUCCESS == rc) { /* got a good result - return it */ @@ -450,28 +323,59 @@ static orte_process_name_t get_route(orte_process_name_t *target) } /* THIS CAME FROM OUR OWN JOB FAMILY... 
*/ - /* check exact matches */ - rc = opal_hash_table_get_value_uint64(&peer_list, - orte_util_hash_name(target), (void**)&ret); - if (ORTE_SUCCESS == rc) { - /* got a good result - return it */ + + /* if this is going to the HNP, send direct */ + if (ORTE_PROC_MY_HNP->jobid == target->jobid && + ORTE_PROC_MY_HNP->vpid == target->vpid) { + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routing not enabled - going direct", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ret = target; goto found; } - /* didn't find an exact match - check to see if a route for this job was defined */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, - target->jobid, (void**)&ret); - if (ORTE_SUCCESS == rc) { - /* got a good result - return it */ + daemon.jobid = ORTE_PROC_MY_NAME->jobid; + /* find out what daemon hosts this proc */ + if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) { + /* we don't recognize this one or our nidmap has not yet + * been initialized - if we are the HNP, all we can do is abort + */ + if (orte_process_info.hnp) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + ret = ORTE_NAME_INVALID; + goto found; + } + /* if we are not the HNP, send it to the wildcard location */ + ret = &wildcard_route; goto found; } - /* default to wildcard route */ - ret = &wildcard_route; + /* if the daemon is me, then send direct to the target! */ + if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { + ret = target; + } else { + /* the linear routing tree is trivial - if the vpid is + * lower than mine, route through my parent, which is + * at my_vpid-1. 
If the vpid is higher than mine, then + * route to my_vpid+1, wrapping around to 0 + */ + if (daemon.vpid < ORTE_PROC_MY_NAME->vpid) { + daemon.vpid = ORTE_PROC_MY_NAME->vpid - 1; + ret = &daemon; + } else { + if (ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) { + daemon.vpid = ORTE_PROC_MY_NAME->vpid + 1; + } else { + /* we are at end of chain - wrap around */ + daemon.vpid = 0; + } + ret = &daemon; + } + } found: - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_linear_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), @@ -622,13 +526,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) return rc; } - /* if ndat is NULL, then this is being called during init, - * so just seed the routing table with a path back to the HNP... - */ - if (ORTE_SUCCESS != (rc = update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP))) { - ORTE_ERROR_LOG(rc); - return rc; - } /* set the wildcard route for anybody whose name we don't recognize * to be the HNP */ @@ -889,31 +786,14 @@ static int route_lost(const orte_process_name_t *route) static bool route_is_defined(const orte_process_name_t *target) { - orte_process_name_t *ret; - int rc; - - /* if it is me, then the route is just direct */ - if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) - return true; - - /* check exact matches */ - rc = opal_hash_table_get_value_uint64(&peer_list, - orte_util_hash_name(target), (void**)&ret); - if (ORTE_SUCCESS == rc) { - return true; + /* find out what daemon hosts this proc */ + if (ORTE_VPID_INVALID == orte_ess.proc_get_daemon((orte_process_name_t*)target)) { + return false; } - /* didn't find an exact match - check to see if a route for this job was defined */ - rc = opal_hash_table_get_value_uint32(&vpid_wildcard_list, - target->jobid, (void**)&ret); - if (ORTE_SUCCESS == rc) { - return true; - } - - return false; + + return true; } 
-/*************************************/ - static int update_routing_tree(void) { diff --git a/orte/mca/routed/direct/Makefile.am b/orte/mca/routed/radix/Makefile.am similarity index 57% rename from orte/mca/routed/direct/Makefile.am rename to orte/mca/routed/radix/Makefile.am index 5b515e019f..ab9acae24f 100644 --- a/orte/mca/routed/direct/Makefile.am +++ b/orte/mca/routed/radix/Makefile.am @@ -9,28 +9,28 @@ # sources = \ - routed_direct.h \ - routed_direct_component.c \ - routed_direct.c + routed_radix.h \ + routed_radix.c \ + routed_radix_component.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la # (for static builds). -if OMPI_BUILD_routed_direct_DSO +if OMPI_BUILD_routed_radix_DSO component_noinst = -component_install = mca_routed_direct.la +component_install = mca_routed_radix.la else -component_noinst = libmca_routed_direct.la +component_noinst = libmca_routed_radix.la component_install = endif mcacomponentdir = $(pkglibdir) mcacomponent_LTLIBRARIES = $(component_install) -mca_routed_direct_la_SOURCES = $(sources) -mca_routed_direct_la_LDFLAGS = -module -avoid-version +mca_routed_radix_la_SOURCES = $(sources) +mca_routed_radix_la_LDFLAGS = -module -avoid-version noinst_LTLIBRARIES = $(component_noinst) -libmca_routed_direct_la_SOURCES = $(sources) -libmca_routed_direct_la_LDFLAGS = -module -avoid-version +libmca_routed_radix_la_SOURCES = $(sources) +libmca_routed_radix_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/routed/direct/configure.params b/orte/mca/routed/radix/configure.params similarity index 100% rename from orte/mca/routed/direct/configure.params rename to orte/mca/routed/radix/configure.params diff --git a/orte/mca/routed/radix/routed_radix.c b/orte/mca/routed/radix/routed_radix.c new file mode 100644 index 0000000000..02d6cb505d --- /dev/null +++ b/orte/mca/routed/radix/routed_radix.c @@ -0,0 +1,1082 @@ +/* + * Copyright (c) 2007 Los Alamos National Security, LLC. 
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#include "orte/util/show_help.h"
+#include "opal/threads/condition.h"
+#include "opal/runtime/opal_progress.h"
+#include "opal/dss/dss.h"
+#include "opal/class/opal_hash_table.h"
+#include "opal/class/opal_bitmap.h"
+#include "opal/util/bit_ops.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/ess/ess.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/mca/rml/rml.h"
+#include "orte/mca/odls/odls_types.h"
+#include "orte/util/name_fns.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/runtime/orte_wait.h"
+#include "orte/runtime/runtime.h"
+
+#include "orte/mca/rml/base/rml_contact.h"
+
+#include "orte/mca/routed/base/base.h"
+#include "routed_radix.h"
+
+
+/* forward declarations of the module's entry points - all are file-local
+ * and are only reachable through the function table below
+ */
+static int init(void);
+static int finalize(void);
+static int delete_route(orte_process_name_t *proc);
+static int update_route(orte_process_name_t *target,
+                        orte_process_name_t *route);
+static orte_process_name_t get_route(orte_process_name_t *target);
+static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
+static int route_lost(const orte_process_name_t *route);
+static bool route_is_defined(const orte_process_name_t *target);
+static int update_routing_tree(void);
+static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_t *children);
+static bool proc_is_below(orte_vpid_t root, orte_vpid_t target);
+static int get_wireup_info(opal_buffer_t *buf);
+static int warmup_routes(void);
+
+#if OPAL_ENABLE_FT == 1
+static int radix_ft_event(int state);
+#endif
+
+/* function table for the radix routed module. The initializer is
+ * positional, so the order of the entries below must match the field
+ * order of orte_routed_module_t (declared outside this file). The last
+ * slot holds the fault-tolerance event handler, which is only compiled
+ * in when OPAL_ENABLE_FT == 1; otherwise it is NULL.
+ */
+orte_routed_module_t orte_routed_radix_module = {
+    init,
+    finalize,
+    delete_route,
+    update_route,
+    get_route,
+    init_routes,
+    warmup_routes,
+    route_lost,
+    route_is_defined,
+    update_routing_tree,
+    get_routing_tree,
+    proc_is_below,
+    get_wireup_info,
+#if OPAL_ENABLE_FT == 1
+    radix_ft_event
+#else
+    NULL
+#endif
+};
+
+/* local globals */
+/* routes to other job families: key is ORTE_JOB_FAMILY(jobid), value is
+ * a malloc'd orte_process_name_t (see update_route/delete_route)
+ */
+static opal_hash_table_t        jobfam_list;
+/* fallback route used by get_route when the hosting daemon is unknown */
+static orte_process_name_t      wildcard_route;
+static opal_condition_t         cond;
+static opal_mutex_t             lock;
+/* process whose loss is treated as fatal by route_lost (NULL = none) */
+static orte_process_name_t      *lifeline=NULL;
+/* my position in the routing tree, filled in by update_routing_tree */
+static orte_process_name_t      my_parent;
+static int                      num_children;
+static opal_list_t              my_children;
+/* set by release_ack once a route-update ack has been received */
+static bool                     ack_recvd;
+
+
+/* Construct the module's containers and reset its state.
+ * Always returns ORTE_SUCCESS.
+ */
+static int init(void)
+{
+    OBJ_CONSTRUCT(&jobfam_list, opal_hash_table_t);
+    opal_hash_table_init(&jobfam_list, 128);
+
+    /* no wildcard route until init_routes assigns one */
+    wildcard_route.jobid = ORTE_NAME_INVALID->jobid;
+    wildcard_route.vpid = ORTE_NAME_INVALID->vpid;
+
+    /* setup the global condition and lock */
+    OBJ_CONSTRUCT(&cond, opal_condition_t);
+    OBJ_CONSTRUCT(&lock, opal_mutex_t);
+
+    lifeline = NULL;
+
+    /* setup the list of children */
+    OBJ_CONSTRUCT(&my_children, opal_list_t);
+    num_children = 0;
+    /* only the jobid of my_parent is set here - the vpid is computed
+     * later by update_routing_tree
+     */
+    my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
+
+    return ORTE_SUCCESS;
+}
+
+/* Tear down everything init() constructed.
+ * Returns ORTE_SUCCESS, or the error from orte_routed_base_register_sync
+ * when an application process fails to announce that it is finalizing.
+ */
+static int finalize(void)
+{
+    int rc;
+    opal_list_item_t *item;
+
+    /* if I am an application process, indicate that I am
+     * truly finalizing prior to departure
+     */
+    if (!orte_process_info.hnp &&
+        !orte_process_info.daemon &&
+        !orte_process_info.tool) {
+        if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(false))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+    }
+
+    /* if I am the HNP, I need to stop the comm recv */
+    if (orte_process_info.hnp) {
+        orte_routed_base_comm_stop();
+    }
+
+    /* NOTE(review): the values stored in jobfam_list are malloc'd in
+     * update_route and only freed in delete_route; nothing visible here
+     * frees the entries that are still present - confirm whether they
+     * leak at finalize
+     */
+    OBJ_DESTRUCT(&jobfam_list);
+    /* destruct the global condition and lock */
+    OBJ_DESTRUCT(&cond);
+    OBJ_DESTRUCT(&lock);
+
+    lifeline = NULL;
+
+    /* deconstruct the list of children */
+    while (NULL != (item = opal_list_remove_first(&my_children))) {
+        OBJ_RELEASE(item);
+    }
+    OBJ_DESTRUCT(&my_children);
+    num_children = 0;
+
+    return ORTE_SUCCESS;
+}
+
+/* Remove the stored route for proc's job family, if there is one.
+ * Returns ORTE_ERR_BAD_PARAM for an invalid name, the hash table's error
+ * code if the removal fails, and ORTE_SUCCESS in every other case
+ * (including "nothing was stored").
+ */
+static int delete_route(orte_process_name_t *proc)
+{
+    int rc;
+    orte_process_name_t *route_copy;
+
+    if (proc->jobid == ORTE_JOBID_INVALID ||
+        proc->vpid == ORTE_VPID_INVALID) {
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /* if I am an application process, I don't have any routes
+     * so there is nothing for me to do
+     */
+    if (!orte_process_info.hnp && !orte_process_info.daemon &&
+        !orte_process_info.tool) {
+        return ORTE_SUCCESS;
+    }
+
+    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                         "%s routed_radix_delete_route for %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc)));
+
+
+    /* if this is from a different job family, then I need to
+     * look it up appropriately
+     */
+    if (ORTE_JOB_FAMILY(proc->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
+
+        /* if I am a daemon, then I will automatically route
+         * anything to this job family via my HNP - so I have nothing
+         * in my routing table and thus have nothing to do
+         * here, just return
+         */
+        if (orte_process_info.daemon) {
+            return ORTE_SUCCESS;
+        }
+
+        /* see if this proc is present - it will have a wildcard vpid,
+         * so we have to look for it with that condition
+         */
+        rc = opal_hash_table_get_value_uint32(&jobfam_list,
+                                              ORTE_JOB_FAMILY(proc->jobid),
+                                              (void**)&route_copy);
+        if (ORTE_SUCCESS == rc && NULL != route_copy) {
+            /* proc is present - remove the data */
+            /* the value is freed before the key is removed, so if the
+             * removal fails the table is left holding a freed pointer
+             */
+            free(route_copy);
+            rc = opal_hash_table_remove_value_uint32(&jobfam_list,
+                                                     ORTE_JOB_FAMILY(proc->jobid));
+            if (ORTE_SUCCESS != rc) {
+                ORTE_ERROR_LOG(rc);
+            }
+            return rc;
+        }
+
+        /* not present - nothing to do */
+        return ORTE_SUCCESS;
+    }
+
+    /* THIS CAME FROM OUR OWN JOB FAMILY...there is nothing
+     * to do here.
The routes will be redefined when we update
+     * the routing tree
+     */
+
+    return ORTE_SUCCESS;
+}
+
+/* Record (or refresh) the route used to reach target's job family.
+ *
+ * Only the HNP and tools keep such routes: application processes and
+ * daemons return ORTE_SUCCESS without storing anything. A target in our
+ * own job family is rejected with ORTE_ERR_NOT_SUPPORTED because those
+ * routes come from the routing tree, not from this table.
+ *
+ * Returns ORTE_ERR_BAD_PARAM for an invalid target,
+ * ORTE_ERR_OUT_OF_RESOURCE if the route copy cannot be allocated, or the
+ * hash table's status otherwise.
+ */
+static int update_route(orte_process_name_t *target,
+                        orte_process_name_t *route)
+{
+    int rc;
+    orte_process_name_t *route_copy;
+
+    if (target->jobid == ORTE_JOBID_INVALID ||
+        target->vpid == ORTE_VPID_INVALID) {
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /* if I am an application process, we don't update the route since
+     * we automatically route everything through the local daemon
+     */
+    if (!orte_process_info.hnp && !orte_process_info.daemon &&
+        !orte_process_info.tool) {
+        return ORTE_SUCCESS;
+    }
+
+    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                         "%s routed_radix_update: %s --> %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(target),
+                         ORTE_NAME_PRINT(route)));
+
+
+    /* if this is from a different job family, then I need to
+     * track how to send messages to it
+     */
+    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
+
+        /* if I am a daemon, then I will automatically route
+         * anything to this job family via my HNP - so nothing to do
+         * here, just return
+         */
+        if (orte_process_info.daemon) {
+            return ORTE_SUCCESS;
+        }
+
+        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                             "%s routed_radix_update: diff job family routing job %s --> %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_JOBID_PRINT(target->jobid),
+                             ORTE_NAME_PRINT(route)));
+
+        /* see if this target is already present - it will have a wildcard vpid,
+         * so we have to look for it with that condition
+         */
+        rc = opal_hash_table_get_value_uint32(&jobfam_list,
+                                              ORTE_JOB_FAMILY(target->jobid),
+                                              (void**)&route_copy);
+        if (ORTE_SUCCESS == rc && NULL != route_copy) {
+            /* target already present - update the route info
+             * in case it has changed
+             */
+            *route_copy = *route;
+            rc = opal_hash_table_set_value_uint32(&jobfam_list,
+                                                  ORTE_JOB_FAMILY(target->jobid), route_copy);
+            if (ORTE_SUCCESS != rc) {
+                ORTE_ERROR_LOG(rc);
+            }
+            return rc;
+        }
+
+        /* not there, so add the route FOR THE JOB FAMILY */
+        route_copy = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
+        if (NULL == route_copy) {
+            /* do not write through a failed allocation */
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+        *route_copy = *route;
+        rc = opal_hash_table_set_value_uint32(&jobfam_list,
+                                              ORTE_JOB_FAMILY(target->jobid), route_copy);
+        if (ORTE_SUCCESS != rc) {
+            ORTE_ERROR_LOG(rc);
+            /* the insert failed, so the table does not own the copy */
+            free(route_copy);
+        }
+        return rc;
+    }
+
+    /* THIS CAME FROM OUR OWN JOB FAMILY... */
+
+    opal_output(0, "%s CALL TO UPDATE ROUTE FOR OWN JOB FAMILY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+
+    return ORTE_ERR_NOT_SUPPORTED;
+}
+
+
+/* Return (by value) the name of the next hop for a message to target.
+ *
+ * Resolution order: an invalid target yields ORTE_NAME_INVALID; a message
+ * to myself goes direct; application processes always use their local
+ * daemon; other job families go via the HNP (daemons) or the stored
+ * job-family route (HNP/tools); within our own job family the hosting
+ * daemon is looked up and reached through the matching child branch of
+ * the routing tree, or through my parent if no child covers it.
+ */
+static orte_process_name_t get_route(orte_process_name_t *target)
+{
+    orte_process_name_t *ret, daemon;
+    opal_list_item_t *item;
+    orte_routed_tree_t *child;
+    int rc;
+
+    if (target->jobid == ORTE_JOBID_INVALID ||
+        target->vpid == ORTE_VPID_INVALID) {
+        ret = ORTE_NAME_INVALID;
+        goto found;
+    }
+
+    /* if it is me, then the route is just direct */
+    if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
+        ret = target;
+        goto found;
+    }
+
+    /* if I am an application process, always route via my local daemon */
+    if (!orte_process_info.hnp && !orte_process_info.daemon &&
+        !orte_process_info.tool) {
+        ret = ORTE_PROC_MY_DAEMON;
+        goto found;
+    }
+
+    /****** HNP AND DAEMONS ONLY ******/
+
+    /* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
+    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
+        /* if I am a daemon, route this via the HNP */
+        if (orte_process_info.daemon) {
+            ret = ORTE_PROC_MY_HNP;
+            goto found;
+        }
+
+        /* if I am the HNP or a tool, then I stored a route to
+         * this job family, so look it up
+         */
+        rc = opal_hash_table_get_value_uint32(&jobfam_list,
+                                              ORTE_JOB_FAMILY(target->jobid), (void**)&ret);
+        /* require a non-NULL value as delete_route and update_route do:
+         * ret is dereferenced at "found"
+         */
+        if (ORTE_SUCCESS == rc && NULL != ret) {
+            /* got a good result - return it */
+            goto found;
+        }
+        /* not found - so we have no route */
+        ret = ORTE_NAME_INVALID;
+        goto found;
+    }
+
+    /* THIS CAME FROM OUR OWN JOB FAMILY... */
+
+    /* if this is going to the HNP, send direct */
+    if (ORTE_PROC_MY_HNP->jobid == target->jobid &&
+        ORTE_PROC_MY_HNP->vpid == target->vpid) {
+        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
+                             "%s routing not enabled - going direct",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+        ret = target;
+        goto found;
+    }
+
+    daemon.jobid = ORTE_PROC_MY_NAME->jobid;
+    /* find out what daemon hosts this proc */
+    if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) {
+        /* we don't recognize this one or our nidmap has not yet
+         * been initialized - if we are the HNP, all we can do is abort
+         */
+        if (orte_process_info.hnp) {
+            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+            ret = ORTE_NAME_INVALID;
+            goto found;
+        }
+        /* if we are not the HNP, send it to the wildcard location */
+        ret = &wildcard_route;
+        goto found;
+    }
+
+    /* if the daemon is me, then send direct to the target! */
+    if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
+        ret = target;
+        goto found;
+    } else {
+        /* search routing tree for next step to that daemon */
+        for (item = opal_list_get_first(&my_children);
+             item != opal_list_get_end(&my_children);
+             item = opal_list_get_next(item)) {
+            child = (orte_routed_tree_t*)item;
+            if (child->vpid == daemon.vpid) {
+                /* the child is hosting the proc - just send it there */
+                ret = &daemon;
+                goto found;
+            }
+            /* otherwise, see if the daemon we need is below the child */
+            if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
+                /* yep - we need to step through this child */
+                daemon.vpid = child->vpid;
+                ret = &daemon;
+                goto found;
+            }
+        }
+    }
+
+    /* if we get here, then the target daemon is not beneath
+     * any of our children, so we have to step up through our parent
+     */
+    daemon.vpid = my_parent.vpid;
+    ret = &daemon;
+
+found:
+    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                         "%s routed_radix_get(%s) --> %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(target),
+                         ORTE_NAME_PRINT(ret)));
+
+    /* returned by value, so pointing ret at the local "daemon" is safe */
+    return *ret;
+}
+
+static int
process_callback(orte_jobid_t job, opal_buffer_t *buffer)
+{
+    /* Unpack the RML URIs reported for the procs of "job" and record
+     * each one in the matching orte_proc_t, marking the proc (and, once
+     * every proc has reported, the job) as running.
+     *
+     * Returns ORTE_ERR_NOT_FOUND if the job is unknown, the unpack error
+     * if the buffer ends with anything other than "read past end", and
+     * ORTE_SUCCESS otherwise. Entries that cannot be parsed or that name
+     * a proc outside the job are logged and skipped.
+     */
+    orte_proc_t **procs;
+    orte_job_t *jdata;
+    orte_std_cntr_t cnt;
+    char *rml_uri;
+    orte_process_name_t name;
+    int rc;
+
+    /* lookup the job object for this process */
+    if (NULL == (jdata = orte_get_job_data_object(job))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return ORTE_ERR_NOT_FOUND;
+    }
+    procs = (orte_proc_t**)jdata->procs->addr;
+
+    /* unpack the data for each entry */
+    cnt = 1;
+    while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
+
+        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
+                             "%s routed_radix:callback got uri %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             (NULL == rml_uri) ? "NULL" : rml_uri));
+
+        if (rml_uri == NULL) continue;
+
+        /* we don't need to set the contact info into our rml
+         * hash table as we won't talk to the proc directly
+         */
+
+        /* extract the proc's name */
+        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) {
+            ORTE_ERROR_LOG(rc);
+            free(rml_uri);
+            continue;
+        }
+
+        /* the vpid comes out of the message, so make sure it really
+         * names one of this job's procs before indexing with it
+         */
+        if (name.vpid >= jdata->num_procs || NULL == procs[name.vpid]) {
+            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+            free(rml_uri);
+            continue;
+        }
+
+        /* the procs are stored in vpid order, so update the record */
+        /* NOTE(review): a previously stored rml_uri is overwritten
+         * without being freed - confirm whether a proc can report twice
+         */
+        procs[name.vpid]->rml_uri = strdup(rml_uri);
+        free(rml_uri);
+
+        /* update the proc state */
+        if (procs[name.vpid]->state < ORTE_PROC_STATE_RUNNING) {
+            procs[name.vpid]->state = ORTE_PROC_STATE_RUNNING;
+        }
+
+        ++jdata->num_reported;
+        cnt = 1;
+    }
+    /* running off the end of the buffer is the normal loop exit */
+    if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* if all procs have reported, update our job state */
+    if (jdata->num_reported == jdata->num_procs) {
+        /* update the job state */
+        if (jdata->state < ORTE_JOB_STATE_RUNNING) {
+            jdata->state = ORTE_JOB_STATE_RUNNING;
+        }
+    }
+
+    return ORTE_SUCCESS;
+}
+
+/* HANDLE ACK MESSAGES FROM AN HNP */
+/* Event handler queued by recv_ack: flags the ack as received (which
+ * ends the wait in init_routes) and releases the message event.
+ */
+static void release_ack(int fd, short event, void *data)
+{
+    orte_message_event_t *mev = (orte_message_event_t*)data;
+    ack_recvd = true;
+    OBJ_RELEASE(mev);
+}
+
+/* RML receive callback for the route-update ack; defers the real work
+ * to release_ack via ORTE_MESSAGE_EVENT.
+ */
+static void recv_ack(int status, orte_process_name_t* sender,
+                     opal_buffer_t* buffer, orte_rml_tag_t tag,
+                     void* cbdata)
+{
+    /* don't process this right away - we need to get out of the recv before
+     * we process the message as it may ask us to do something that involves
+     * more messaging! Instead, setup an event so that the message gets processed
+     * as soon as we leave the recv.
+     *
+     * The macro makes a copy of the buffer, which we release above - the incoming
+     * buffer, however, is NOT released here, although its payload IS transferred
+     * to the message buffer for later processing
+     */
+    ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack);
+}
+
+
+/* Set up routing for "job".
+ *
+ * ndat == NULL means the call comes from orte_init and the process seeds
+ * its own routing state; ndat != NULL carries contact/routing data to be
+ * absorbed. The behaviour differs for tools, daemons, the HNP and
+ * application processes, each handled by its own branch below.
+ */
+static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
+{
+    /* the radix module routes all proc communications through
+     * the local daemon. Daemons must identify which of their
+     * daemon-peers is "hosting" the specified recipient and
+     * route the message to that daemon. Daemon contact info
+     * is handled elsewhere, so all we need to do here is
+     * ensure that the procs are told to route through their
+     * local daemon, and that daemons are told how to route
+     * for each proc
+     */
+    int rc;
+
+    /* if I am a tool, then I stand alone - there is nothing to do */
+    if (orte_process_info.tool) {
+        return ORTE_SUCCESS;
+    }
+
+    /* if I am a daemon or HNP, then I have to extract the routing info for this job
+     * from the data sent to me for launch and update the routing tables to
+     * point at the daemon for each proc
+     */
+    if (orte_process_info.daemon) {
+
+        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                             "%s routed_radix: init routes for daemon job %s\n\thnp_uri %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_JOBID_PRINT(job),
+                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri));
+
+        if (NULL == ndat) {
+            /* indicates this is being called during orte_init.
+             * Get the HNP's name for possible later use
+             */
+            if (NULL == orte_process_info.my_hnp_uri) {
+                /* fatal error */
+                ORTE_ERROR_LOG(ORTE_ERR_FATAL);
+                return ORTE_ERR_FATAL;
+            }
+            /* set the contact info into the hash table */
+            if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_hnp_uri))) {
+                ORTE_ERROR_LOG(rc);
+                return(rc);
+            }
+
+            /* extract the hnp name and store it */
+            if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
+                                                               ORTE_PROC_MY_HNP, NULL))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+
+            /* set the wildcard route for anybody whose name we don't recognize
+             * to be the HNP
+             */
+            wildcard_route.jobid = ORTE_PROC_MY_HNP->jobid;
+            wildcard_route.vpid = ORTE_PROC_MY_HNP->vpid;
+
+            /* set our lifeline to the HNP - we will abort if that connection is lost */
+            lifeline = ORTE_PROC_MY_HNP;
+
+            /* daemons will send their contact info back to the HNP as
+             * part of the message confirming they are ready to go. HNP's
+             * load their contact info during orte_init
+             */
+        } else {
+            /* ndat != NULL means we are getting an update of RML info
+             * for the daemons - so update our contact info and routes
+             */
+            if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
+                ORTE_ERROR_LOG(rc);
+            }
+            return rc;
+        }
+
+        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
+                             "%s routed_radix: completed init routes",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+        return ORTE_SUCCESS;
+    }
+
+
+    if (orte_process_info.hnp) {
+
+        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                             "%s routed_radix: init routes for HNP job %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_JOBID_PRINT(job)));
+
+        if (NULL == ndat) {
+            /* if ndat is NULL, then this is being called during init, so just
+             * make myself available to catch any reported contact info
+             */
+            if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+            /* the HNP has no lifeline */
+            lifeline = NULL;
+        } else {
+            /* if this is for my own jobid, then I am getting an update of RML info
+             * for the daemons - so update our contact info and routes
+             */
+            if (ORTE_PROC_MY_NAME->jobid == job) {
+                if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            } else {
+                /* if not, then I need to process the callback */
+                if (ORTE_SUCCESS != (rc = process_callback(job, ndat))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            }
+        }
+
+        return ORTE_SUCCESS;
+    }
+
+    {  /* MUST BE A PROC */
+        /* if ndat != NULL, then this is being invoked by the proc to
+         * init a route to a specified process that is outside of our
+         * job family. We want that route to go through our HNP, routed via
+         * our local daemon - however, we cannot know for
+         * certain that the HNP already knows how to talk to the specified
+         * procs. For example, in OMPI's publish/subscribe procedures, the
+         * DPM framework looks for an mca param containing the global ompi-server's
+         * uri. This info will come here so the proc can setup a route to
+         * the server - we need to pass the routing info to our HNP
+         */
+        if (NULL != ndat) {
+            /* uses the function-level rc - a second declaration here
+             * would only shadow it
+             */
+
+#if 0
+            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                                 "%s routed_radix: init routes w/non-NULL data",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+            /* if this is for my job family, then we send the buffer
+             * to the proper tag on the daemon
+             */
+            if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(job)) {
+                /* send the buffer to the proper tag on the daemon */
+                if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, ndat,
+                                                   ORTE_RML_TAG_RML_INFO_UPDATE, 0))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+                /* wait right here until the daemon acks the update to ensure that
+                 * any subsequent messaging can succeed
+                 */
+                ack_recvd = false;
+                rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
+                                             ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
+
+                ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);
+
+                OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                                     "%s routed_radix_init_routes: ack recvd",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+                /* we already have defined our routes to everyone to
+                 * be through the local daemon, so nothing further to do
+                 */
+                return ORTE_SUCCESS;
+            }
+#endif
+            /* if this is for a different job family, then we route via our HNP
+             * to minimize connection counts to entities such as ompi-server, so
+             * start by sending the contact info to the HNP for update
+             */
+            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                                 "%s routed_radix_init_routes: diff job family - sending update to %s",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
+
+            if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, ndat,
+                                               ORTE_RML_TAG_RML_INFO_UPDATE, 0))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+
+            /* wait right here until the HNP acks the update to ensure that
+             * any subsequent messaging can succeed
+             */
+            ack_recvd = false;
+            rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
+                                         ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
+            if (0 > rc) {
+                /* the receive was never posted, so ack_recvd could never
+                 * be set - fail here instead of waiting forever
+                 */
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+
+            ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);
+
+            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                                 "%s routed_radix_init_routes: ack recvd",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+            /* our get_route function automatically routes all messages for
+             * other job families via the HNP, so nothing more to do here
+             */
+            return ORTE_SUCCESS;
+        }
+
+        /* if ndat=NULL, then we are being called during orte_init. In this
+         * case, we need to setup a few critical pieces of info
+         */
+
+        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                             "%s routed_radix: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job),
+                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri,
+                             (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri));
+
+        if (NULL == orte_process_info.my_daemon_uri) {
+            /* in this module, we absolutely MUST have this information - if
+             * we didn't get it, then error out
+             */
+            opal_output(0, "%s ERROR: Failed to identify the local daemon's URI",
+                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+            opal_output(0, "%s ERROR: This is a fatal condition when the radix router",
+                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+            opal_output(0, "%s ERROR: has been selected - either select the unity router",
+                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+            opal_output(0, "%s ERROR: or ensure that the local daemon info is provided",
+                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+            return ORTE_ERR_FATAL;
+        }
+
+        /* we have to set the HNP's name, even though we won't route messages directly
+         * to it. This is required to ensure that we -do- send messages to the correct
+         * HNP name
+         */
+        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
+                                                           ORTE_PROC_MY_HNP, NULL))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+
+        /* Set the contact info in the RML - this won't actually establish
+         * the connection, but just tells the RML how to reach the daemon
+         * if/when we attempt to send to it
+         */
+        if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_daemon_uri))) {
+            ORTE_ERROR_LOG(rc);
+            return(rc);
+        }
+        /* extract the daemon's name so we can update the routing table */
+        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
+                                                           ORTE_PROC_MY_DAEMON, NULL))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+
+        /* setup the route to all other procs to flow through the daemon */
+        wildcard_route.jobid = ORTE_PROC_MY_DAEMON->jobid;
+        wildcard_route.vpid = ORTE_PROC_MY_DAEMON->vpid;
+
+        /* set our lifeline to the local daemon - we will abort if this connection is lost */
+        lifeline = ORTE_PROC_MY_DAEMON;
+
+        /* register ourselves - this sends a message to the daemon (warming up that connection)
+         * and sends our contact info to the HNP when all local procs have reported
+         *
+         * NOTE: it may seem odd that we send our contact info to the HNP - after all,
+         * the HNP doesn't really need to know how to talk to us directly if we are
+         * using this routing method. However, this is good for two reasons:
+         *
+         * (1) some debuggers and/or tools may need RML contact
+         *     info to set themselves up
+         *
+         * (2) doing so allows the HNP to "block" in a dynamic launch
+         *     until all procs are reported running, thus ensuring that no communication
+         *     is attempted until the overall ORTE system knows how to talk to everyone -
+         *     otherwise, the system can just hang.
+         */
+        if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(true))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        /* no answer is expected or coming */
+
+        return ORTE_SUCCESS;
+    }
+}
+
+/* Send an ORTE_DAEMON_NULL_CMD to my parent in the routing tree.
+ * Returns ORTE_SUCCESS, or the error from packing or sending the command.
+ * NOTE(review): my_parent.vpid is only meaningful after
+ * update_routing_tree has run, and is -1 for vpid 0 - confirm callers
+ * never invoke this on the root.
+ */
+static int warmup_routes(void)
+{
+    opal_buffer_t buf;
+    orte_daemon_cmd_flag_t cmd=ORTE_DAEMON_NULL_CMD;
+    int rc;
+
+    /* send a NULL command to my parent */
+    OBJ_CONSTRUCT(&buf, opal_buffer_t);
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &cmd, 1, ORTE_DAEMON_CMD))) {
+        /* don't send a buffer that was never filled in */
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&buf);
+        return rc;
+    }
+    if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON, 0))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&buf);
+        return rc;
+    }
+    OBJ_DESTRUCT(&buf);
+    return ORTE_SUCCESS;
+}
+
+static int route_lost(const orte_process_name_t *route)
+{
+    /* if we lose the connection to the lifeline and we are NOT already,
+     * in finalize, tell the OOB to abort.
+     * NOTE: we cannot call abort from here as the OOB needs to first
+     * release a thread-lock - otherwise, we will hang!!
+     */
+    if (!orte_finalizing &&
+        NULL != lifeline &&
+        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
+        opal_output(0, "%s routed:radix: Connection to lifeline %s lost",
+                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                    ORTE_NAME_PRINT(lifeline));
+        return ORTE_ERR_FATAL;
+    }
+
+    /* we don't care about this one, so return success */
+    return ORTE_SUCCESS;
+}
+
+static bool route_is_defined(const orte_process_name_t *target)
+{
+    /* find out what daemon hosts this proc */
+    if (ORTE_VPID_INVALID == orte_ess.proc_get_daemon((orte_process_name_t*)target)) {
+        return false;
+    }
+
+    return true;
+}
+/* radix_tree: append <rank>'s direct children to <children> (counting them); flag every deeper descendant in a bitmap */
+static void radix_tree(int rank, int *num_children,
+                       opal_list_t *children, opal_bitmap_t *relatives)
+{
+    int i, peer, Sum, NInLevel, rc;
+    orte_routed_tree_t *child;
+    opal_bitmap_t *relations;
+
+    /* compute how many procs are at my level */
+    Sum=1;
+    NInLevel=1;
+
+    while ( Sum < (rank+1) ) {
+        NInLevel *= mca_routed_radix_component.radix;
+        Sum += NInLevel;
+    }
+
+    /* our children start at our rank + num_in_level */
+    peer = rank + NInLevel;
+    for (i = 0; i < mca_routed_radix_component.radix; i++) {
+        if (peer < (int)orte_process_info.num_procs) {
+            if (NULL != children) {
+                /* this is a direct child - create its tree entry and add it to my list */
+                child = OBJ_NEW(orte_routed_tree_t);
+                child->vpid = peer;
+                opal_list_append(children, &child->super);
+                (*num_children)++;
+                /* setup the relatives bitmap */
+                opal_bitmap_init(&child->relatives, orte_process_info.num_procs);
+                /* point to the relatives */
+                relations = &child->relatives;
+            } else {
+                /* we are recording someone's relatives - only the bit is needed, so no tree entry is allocated */
+                if (OPAL_SUCCESS != (rc = opal_bitmap_set_bit(relatives, peer))) {
+                    opal_output(0, "%s Error: could not set relations bit!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+                }
+                /* point to this relations */
+                relations = relatives;
+            }
+            /* search for this child's relatives */
+            radix_tree(peer, NULL, NULL, relations);
+        }
+        peer += NInLevel;
+    }
+}
+
+static int
update_routing_tree(void)   /* recompute my_parent.vpid, my_children and num_children from my own vpid */
+{
+    orte_routed_tree_t *child;
+    int j;
+    opal_list_item_t *item;
+    int Level,Sum,NInLevel,Ii;   /* NOTE(review): Level is only written below, never read */
+    int NInPrevLevel;
+
+    /* if I am anything other than a daemon or the HNP, this
+     * is a meaningless command as I am not allowed to route
+     */
+    if (!orte_process_info.daemon && !orte_process_info.hnp) {
+        return ORTE_ERR_NOT_SUPPORTED;
+    }
+
+    /* clear the list of children if any are already present */
+    while (NULL != (item = opal_list_remove_first(&my_children))) {
+        OBJ_RELEASE(item);
+    }
+    num_children = 0;
+
+    /* compute my parent: add up level sizes until the running total covers my vpid */
+    Ii = ORTE_PROC_MY_NAME->vpid;
+    Level=0;
+    Sum=1;
+    NInLevel=1;
+
+    while ( Sum < (Ii+1) ) {
+        Level++;
+        NInLevel *= mca_routed_radix_component.radix;
+        Sum += NInLevel;
+    }
+    Sum -= NInLevel;   /* Sum is now the number of vpids in the levels above mine */
+
+    NInPrevLevel = NInLevel/mca_routed_radix_component.radix;   /* for vpid > 0: size of the level above mine */
+
+    if( 0 == Ii ) {
+        my_parent.vpid = -1;   /* vpid 0 has no parent; this branch also keeps it away from the modulo below */
+    } else {
+        my_parent.vpid = (Ii-Sum) % NInPrevLevel;   /* my offset within my level, wrapped onto the level above */
+        my_parent.vpid += (Sum - NInPrevLevel);   /* plus the first vpid of the level above */
+    }
+
+    /* compute my direct children and the bitmap that shows which vpids
+     * lie underneath their branch
+     */
+    radix_tree(Ii, &num_children, &my_children, NULL);
+
+    if (0 < opal_output_get_verbosity(orte_routed_base_output)) {   /* dump the computed tree only when this verbosity is above zero */
+        opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), my_parent.vpid, num_children);
+        for (item = opal_list_get_first(&my_children);
+             item != opal_list_get_end(&my_children);
+             item = opal_list_get_next(item)) {
+            child = (orte_routed_tree_t*)item;
+            opal_output(0, "%s: \tchild %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->vpid);
+            for (j=0; j < (int)orte_process_info.num_procs; j++) {
+                if (opal_bitmap_is_set_bit(&child->relatives, j)) {
+                    opal_output(0, "%s: \t\trelation %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
+                }
+            }
+        }
+    }
+
+    return ORTE_SUCCESS;
+}
+
+static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_t *children)   /* NOTE: job is never read in this function */
+{
+    opal_list_item_t *item;
+    orte_namelist_t *nm;
+    orte_routed_tree_t *child;
+
+    /* if I am anything other than a
daemon or the HNP, this
+     * is a meaningless command as I am not allowed to route
+     */
+    if (!orte_process_info.daemon && !orte_process_info.hnp) {
+        return ORTE_VPID_INVALID;
+    }
+
+    /* the radix routing tree always goes to our children,
+     * for any job
+     */
+    if (NULL != children) {
+        for (item = opal_list_get_first(&my_children);
+             item != opal_list_get_end(&my_children);
+             item = opal_list_get_next(item)) {
+            child = (orte_routed_tree_t*)item;
+            nm = OBJ_NEW(orte_namelist_t);   /* one new entry per child, appended to the caller's list below */
+            nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
+            nm->name.vpid = child->vpid;
+            opal_list_append(children, &nm->item);
+        }
+    }
+    /* return my parent's vpid */
+    return my_parent.vpid;
+}
+
+static int get_wireup_info(opal_buffer_t *buf)   /* HNP only: hands buf to orte_rml_base_get_contact_info() for my jobid */
+{
+    int rc;
+
+    /* if I am anything other than the HNP, this
+     * is a meaningless command as I cannot get
+     * the requested info
+     */
+    if (!orte_process_info.hnp) {
+        return ORTE_ERR_NOT_SUPPORTED;
+    }
+
+    /* if we are not using static ports, then we need to share the
+     * comm info - otherwise, just return
+     */
+    if (orte_static_ports) {
+        return ORTE_SUCCESS;
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(buf);   /* NOTE(review): buf is the caller's object - confirm callers do not release it again on error */
+        return rc;
+    }
+
+    return ORTE_SUCCESS;
+}
+
+static bool proc_is_below(orte_vpid_t root, orte_vpid_t target)   /* true if target == root, or root is a direct child whose bitmap has target set */
+{
+    opal_list_item_t *item;
+    orte_routed_tree_t *child;
+
+    /* if I am anything other than a daemon or the HNP, this
+     * is a meaningless command as I am not allowed to route
+     */
+    if (!orte_process_info.daemon && !orte_process_info.hnp) {
+        return false;
+    }
+
+    /* quick check: if root == target, then the answer is always true!
*/
+    if (root == target) {
+        return true;
+    }
+
+    /* look for root among my direct children; if it is one of them, the
+     * answer is whether the target bit is set in that child's relatives bitmap
+     */
+
+    /* first find the specified child */
+    for (item = opal_list_get_first(&my_children);
+         item != opal_list_get_end(&my_children);
+         item = opal_list_get_next(item)) {
+        child = (orte_routed_tree_t*)item;
+        if (child->vpid == root) {
+            /* now see if the target lies below this child */
+            return opal_bitmap_is_set_bit(&child->relatives, target);
+        }
+    }
+
+    /* only get here if we have no children or root is not one of my direct children */
+    return false;
+}
+
+
+#if OPAL_ENABLE_FT == 1
+static int radix_ft_event(int state)   /* only OPAL_CRS_RESTART does any work; every other state returns ORTE_SUCCESS */
+{
+    int ret, exit_status = ORTE_SUCCESS;
+
+    /******** Checkpoint Prep ********/
+    if(OPAL_CRS_CHECKPOINT == state) {
+    }
+    /******** Continue Recovery ********/
+    else if (OPAL_CRS_CONTINUE == state ) {
+    }
+    /******** Restart Recovery ********/
+    else if (OPAL_CRS_RESTART == state ) {
+        /*
+         * Re-exchange the routes by re-running init_routes for my own jobid
+         */
+        if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
+            exit_status = ret;
+            goto cleanup;
+        }
+    }
+    else if (OPAL_CRS_TERM == state ) {
+        /* Nothing */
+    }
+    else {
+        /* Error state = Nothing */
+    }
+
+ cleanup:
+    return exit_status;
+}
+#endif
+
diff --git a/orte/mca/routed/direct/routed_direct.h b/orte/mca/routed/radix/routed_radix.h similarity index 51% rename from orte/mca/routed/direct/routed_direct.h rename to orte/mca/routed/radix/routed_radix.h index 3c0ba92cd3..8393116e67 100644 --- a/orte/mca/routed/direct/routed_direct.h +++ b/orte/mca/routed/radix/routed_radix.h @@ -8,8 +8,8 @@ * $HEADER$ */ -#ifndef MCA_ROUTED_DIRECT_H -#define MCA_ROUTED_DIRECT_H +#ifndef MCA_ROUTED_RADIX_H +#define MCA_ROUTED_RADIX_H #include "orte_config.h" #include "orte/types.h" @@ -18,11 +18,13 @@ BEGIN_C_DECLS +typedef struct { + orte_routed_component_t super; + int radix; +} orte_routed_radix_component_t; +ORTE_MODULE_DECLSPEC extern
orte_routed_radix_component_t mca_routed_radix_component; -ORTE_MODULE_DECLSPEC extern orte_routed_component_t mca_routed_direct_component; - -extern orte_routed_module_t orte_routed_direct_module; - +extern orte_routed_module_t orte_routed_radix_module; END_C_DECLS diff --git a/orte/mca/routed/radix/routed_radix_component.c b/orte/mca/routed/radix/routed_radix_component.c new file mode 100644 index 0000000000..04c0ff6776 --- /dev/null +++ b/orte/mca/routed/radix/routed_radix_component.c @@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2007      Los Alamos National Security, LLC.
+ *                         All rights reserved.
+ * Copyright (c) 2004-2008 The Trustees of Indiana University.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+#include "orte/types.h"
+
+#include "orte/util/show_help.h"
+#include "opal/class/opal_hash_table.h"
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_param.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "orte/mca/routed/base/base.h"
+#include "routed_radix.h"
+
+static int orte_routed_radix_component_query(mca_base_module_t **module, int *priority);
+
+/**
+ * component definition; the radix member is not initialized here - the query function below assigns it
+ */
+orte_routed_radix_component_t mca_routed_radix_component = {
+    {
+        /* First, the mca_base_component_t struct containing meta
+           information about the component itself */
+
+        {
+            ORTE_ROUTED_BASE_VERSION_2_0_0,
+
+            "radix", /* MCA component name */
+            ORTE_MAJOR_VERSION,  /* MCA component major version */
+            ORTE_MINOR_VERSION,  /* MCA component minor version */
+            ORTE_RELEASE_VERSION,  /* MCA component release version */
+            NULL,
+            NULL,
+            orte_routed_radix_component_query
+        },
+        {
+            /* This component can be checkpointed */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        }
+    }
+};
+
+static int orte_routed_radix_component_query(mca_base_module_t **module, int *priority)   /* offers the module only when the registered value is > 0 */
+{
+    int tmp;   /* receives the value registered below; -1 is presumably the default argument - confirm against mca_base_param_reg_int */
+    mca_base_component_t *c = &mca_routed_radix_component.super.base_version;
+
+    mca_base_param_reg_int(c, NULL,
+                           "Radix to be used for routed radix tree",
+                           false, false, -1, &tmp);
+    if (0 < tmp) {
+        mca_routed_radix_component.radix = tmp;
+        *priority = 150;
+        *module = (mca_base_module_t *) &orte_routed_radix_module;
+        return ORTE_SUCCESS;
+    }
+
+    /* if radix not provided (tmp is not positive), then we can't run: no module is returned */
+    *priority = 0;
+    *module = NULL;
+    return ORTE_ERROR;
+}
diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index 28fc685ac4..37adcb09b4 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -80,6 +80,9 @@ /* * Globals */ +static bool relay_is_required; +static bool exit_after_relay; + static int process_commands(orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag); @@ -254,6 +257,10 @@ CLEANUP: OBJ_RELEASE(buffer); } OBJ_RELEASE(mev); + /* see if we need to exit */ + if (exit_after_relay) { + orte_trigger_event(&orte_exit); + } } void orte_daemon_recv(int status, orte_process_name_t* sender, @@ -380,9 +387,13 @@ void orte_daemon_cmd_processor(int fd, short event, void *data) buffer->unpack_ptr = unpack_ptr; /* setup an event to actually perform the relay */ ORTE_MESSAGE_EVENT(&target, buffer, target_tag, send_relay); + /* flag that a relay is required */ + relay_is_required = true; /* rewind the buffer to the right place for processing */ buffer->unpack_ptr = save; } else { + /* flag that a relay is -not- required */ + relay_is_required = false; /* rewind the buffer so we can process it correctly */ buffer->unpack_ptr = unpack_ptr; } @@ -605,7 +616,16 @@ static int process_commands(orte_process_name_t* sender, break; /**** EXIT COMMAND ****/ - case ORTE_DAEMON_EXIT_CMD: + case ORTE_DAEMON_EXIT_WITH_REPLY_CMD: + /* disable routing - we need to do this + * because daemons exit in an uncoordinated fashion.
+ * Thus, our routes are being dismantled, so we can't + * trust that any given route still exists + */ + orte_routing_is_enabled = false; + /* if we are the HNP, kill our local procs and + * flag we are exited - but don't yet exit + */ if (orte_process_info.hnp) { orte_job_t *daemons; orte_proc_t **procs; @@ -627,14 +647,67 @@ static int process_commands(orte_process_name_t* sender, /* all done! */ return ORTE_SUCCESS; } - /* eventually, we need to revise this so we only - * exit if all our children are dead. For now, treat - * the same as a "hard kill" command + /* if we are not the HNP, send a message to the HNP telling + * it we are leaving - and then trigger our exit */ if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received exit", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } + /* send a state update so the HNP knows we are "gone" */ + { + opal_buffer_t ack; + orte_proc_state_t state=ORTE_PROC_STATE_TERMINATED; + orte_exit_code_t exit_code=0; + orte_plm_cmd_flag_t cmd = ORTE_PLM_UPDATE_PROC_STATE; + + OBJ_CONSTRUCT(&ack, opal_buffer_t); + opal_dss.pack(&ack, &cmd, 1, ORTE_PLM_CMD); + opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->jobid), 1, ORTE_JOBID); + opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->vpid), 1, ORTE_VPID); + opal_dss.pack(&ack, &state, 1, ORTE_PROC_STATE); + opal_dss.pack(&ack, &exit_code, 1, ORTE_EXIT_CODE); + orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0); + OBJ_DESTRUCT(&ack); + } + /* trigger our appropriate exit procedure + * NOTE: this event will fire -after- any zero-time events + * so any pending relays -do- get sent first + */ + if (relay_is_required) { + exit_after_relay = true; + } else { + orte_trigger_event(&orte_exit); + } + return ORTE_SUCCESS; + break; + + /**** EXIT_NO_REPLY COMMAND ****/ + case ORTE_DAEMON_EXIT_NO_REPLY_CMD: + /* disable routing - we need to do this + * because daemons exit in an uncoordinated fashion. 
+ * Thus, our routes are being dismantled, so we can't + * trust that any given route still exists + */ + orte_routing_is_enabled = false; + /* if we are the HNP, kill our local procs and + * flag we are exited - but don't yet exit + */ + if (orte_process_info.hnp) { + /* if we are the HNP, ensure our local procs are terminated */ + orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false); + /* There is nothing more to do here - actual exit will be + * accomplished by the plm + */ + return ORTE_SUCCESS; + } + /* if we are not the HNP, don't send any messages - just + * trigger our exit + */ + if (orte_debug_daemons_flag) { + opal_output(0, "%s orted_cmd: received exit_no_reply", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + } /* trigger our appropriate exit procedure * NOTE: this event will fire -after- any zero-time events * so any pending relays -do- get sent first @@ -642,7 +715,7 @@ static int process_commands(orte_process_name_t* sender, orte_trigger_event(&orte_exit); return ORTE_SUCCESS; break; - + /**** HALT VM COMMAND ****/ case ORTE_DAEMON_HALT_VM_CMD: if (orte_debug_daemons_flag) { diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 7af163942e..24551c2dfd 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -63,7 +63,6 @@ bool orte_devel_level_output = false; int32_t orte_contiguous_nodes; char **orte_launch_environ; -opal_pointer_array_t orte_daemonmap; bool orte_hnp_is_allocated = false; bool orte_allocation_required; @@ -78,7 +77,7 @@ int orte_debugger_check_rate; orte_trigger_event_t orte_exit, orteds_exit; int orte_exit_status = 0; bool orte_abnormal_term_ordered = false; -bool orte_shutdown_in_progress = false; +bool orte_routing_is_enabled = false; int orte_heartbeat_rate; int orte_startup_timeout; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 0b91b749f4..27c4c3b7a0 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -371,6 +371,8 @@ typedef struct 
{ opal_object_t super; /* jobid */ orte_jobid_t job; + /* number of procs in this job */ + orte_vpid_t num_procs; /* array of data for procs */ opal_value_array_t pmap; } orte_jmap_t; @@ -387,7 +389,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_jmap_t); ORTE_DECLSPEC orte_job_t* orte_get_job_data_object(orte_jobid_t job); /* global variables used by RTE - instanced in orte_globals.c */ -ORTE_DECLSPEC extern bool orte_reuse_daemons, orte_timing; +ORTE_DECLSPEC extern bool orte_timing; ORTE_DECLSPEC extern bool orte_debug_daemons_flag, orte_debug_daemons_file_flag; ORTE_DECLSPEC extern bool orte_leave_session_attached; ORTE_DECLSPEC extern bool orte_do_not_launch; @@ -406,7 +408,6 @@ ORTE_DECLSPEC extern bool orte_never_launched; ORTE_DECLSPEC extern bool orte_devel_level_output; ORTE_DECLSPEC extern char **orte_launch_environ; -ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap; ORTE_DECLSPEC extern bool orte_hnp_is_allocated; ORTE_DECLSPEC extern bool orte_allocation_required; @@ -422,7 +423,7 @@ ORTE_DECLSPEC extern int orte_debugger_check_rate; ORTE_DECLSPEC extern orte_trigger_event_t orte_exit, orteds_exit; ORTE_DECLSPEC extern int orte_exit_status; ORTE_DECLSPEC extern bool orte_abnormal_term_ordered; -ORTE_DECLSPEC extern bool orte_shutdown_in_progress; +ORTE_DECLSPEC extern bool orte_routing_is_enabled; ORTE_DECLSPEC extern int orte_heartbeat_rate; ORTE_DECLSPEC extern int orte_startup_timeout; diff --git a/orte/test/system/Makefile b/orte/test/system/Makefile index 402cdb5ef1..d1fcb3aef4 100644 --- a/orte/test/system/Makefile +++ b/orte/test/system/Makefile @@ -1,4 +1,4 @@ -PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay +PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay 
radix all: $(PROGS) diff --git a/orte/test/system/radix.c b/orte/test/system/radix.c new file mode 100644 index 0000000000..f0012a89b8 --- /dev/null +++ b/orte/test/system/radix.c @@ -0,0 +1,146 @@
+/* -*- C -*-
+ *
+ * $HEADER$
+ *
+ * Print the radix routing tree (parent, children, relatives) of each of x procs
+ */
+
+#include "orte_config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+
+#include "opal/util/bit_ops.h"
+#include "opal/class/opal_list.h"
+#include "opal/class/opal_bitmap.h"
+
+#include "orte/mca/routed/base/base.h"
+#include "orte/util/name_fns.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/runtime/runtime.h"
+
+static int Radix;
+
+/* Collect the subtree below <me> in a radix tree of num_procs ranks. With a
+ * children list, each direct child is appended (and counted) and its own
+ * descendants are flagged in that child's relatives bitmap; with children ==
+ * NULL every descendant is flagged in <relatives>. Always returns 0.
+ */
+int down_search(int me, int num_procs,
+                int *num_children, opal_list_t *children, opal_bitmap_t *relatives)
+{
+    int i, peer, Sum, NInLevel, rc;
+    orte_routed_tree_t *child;
+    opal_bitmap_t *relations;
+
+    /* compute how many procs are at my level */
+    Sum=1;
+    NInLevel=1;
+
+    while ( Sum < (me+1) ) {
+        NInLevel *= Radix;
+        Sum += NInLevel;
+    }
+    /* printf("\trank %d inlevel %d\n", me, NInLevel); */
+
+    /* our children start at our rank + num_in_level */
+    peer = me + NInLevel;
+    for (i = 0; i < Radix; i++) {
+        if (peer < num_procs) {
+            if (NULL != children) {
+                /* printf("\t\tadding child rank %d\n", peer); */
+                /* this is a direct child - create its tree entry and add it to my list */
+                child = OBJ_NEW(orte_routed_tree_t);
+                child->vpid = peer;
+                opal_list_append(children, &child->super);
+                (*num_children)++;
+                /* setup the relatives bitmap */
+                opal_bitmap_init(&child->relatives, num_procs);
+                /* point to the relatives */
+                relations = &child->relatives;
+            } else {
+                /* printf("\t\tsetting bit for rank %d\n", peer); */
+                /* we are recording someone's relatives - only the bit is needed, so no tree entry is allocated */
+                if (OPAL_SUCCESS != (rc = opal_bitmap_set_bit(relatives, peer))) {
+                    printf("\t\t\tbit not set!\n");
+                }
+                /* point to this relations */
+                relations = relatives;
+            }
+            /* printf("\tdownsearching peer %d\n", peer); */
+            /* search for this child's relatives */
+            down_search(peer, num_procs, NULL, NULL, relations);
+        }
+        peer += NInLevel;
+    }
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    opal_list_t children;
+    opal_list_item_t *item;
+    int num_children;
+    orte_routed_tree_t *child;
+    int j;
+    int NProcs;
+    int Level,Sum,NInLevel,Ii;
+    int Parent,NInPrevLevel;
+
+
+    if (3 != argc || atoi(argv[1]) < 1) {   /* a radix below 1 would divide by zero or never terminate below */
+        printf("usage: radix r x, where r=radix and x=number of procs\n");
+        exit(1);
+    }
+
+    orte_init(ORTE_TOOL);
+
+    Radix = atoi(argv[1]);
+    NProcs = atoi(argv[2]);
+
+    for(Ii = 0 ; Ii < NProcs ; Ii++) {
+        OBJ_CONSTRUCT(&children, opal_list_t);
+        num_children = 0;
+        Level=0;
+        Sum=1;
+        NInLevel=1;
+
+        while ( Sum < (Ii+1) ) {
+            Level++;
+            NInLevel*=Radix;
+            Sum+=NInLevel;
+        }
+        Sum-=NInLevel;
+
+        NInPrevLevel=NInLevel/Radix;
+
+        if( 0 == Ii ) {
+            Parent=-1;
+        } else {
+            Parent=(Ii-Sum) % NInPrevLevel;
+            Parent+=(Sum - NInPrevLevel);
+        }
+
+        fprintf(stderr," I am %d: Parent %d\n",
+                Ii,Parent);
+
+        /* compute children and relatives */
+        down_search(Ii, NProcs, &num_children, &children, NULL);
+        while (NULL != (item = opal_list_remove_first(&children))) {
+            child = (orte_routed_tree_t*)item;
+            fprintf(stderr, "\tchild %d\n", child->vpid);
+            for (j=0; j < NProcs; j++) {
+                if (opal_bitmap_is_set_bit(&child->relatives, j)) {
+                    fprintf(stderr, "\t\trelation %d\n", j);
+                }
+            }
+            OBJ_RELEASE(item);
+        }
+        OBJ_DESTRUCT(&children);
+    }
+
+    orte_finalize();
+    return 0;
+}
diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index e587c8ab47..12d472ad56 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -589,7 +589,6 @@ int orte_util_encode_pidmap(orte_job_t *jdata, opal_byte_object_t *boptr) int32_t *nodes; orte_proc_t **procs; orte_vpid_t i; - int8_t *tmp, flag; opal_buffer_t buf; orte_local_rank_t *lrank; orte_node_rank_t *nrank; @@ -644,38 +643,6 @@ int orte_util_encode_pidmap(orte_job_t *jdata, opal_byte_object_t *boptr) } free(nrank); - /* transfer and pack the app_idx in one pack */ - tmp = (int8_t*)malloc(jdata->num_procs); - for (i=0; i < jdata->num_procs; i++) { - tmp[i] = procs[i]->app_idx; - } - if
(ORTE_SUCCESS != (rc = opal_dss.pack(&buf, tmp, jdata->num_procs, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - free(tmp); - - /* are there cpu_list strings? */ - if (jdata->map->cpu_lists) { - flag = (int)true; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - for (i=0; i < jdata->num_procs; i++) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &procs[i]->slot_list, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } else { - flag = (int)false; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &flag, 1, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - /* transfer the payload to the byte object */ opal_dss.unload(&buf, (void**)&boptr->bytes, &boptr->size); OBJ_DESTRUCT(&buf); @@ -685,17 +652,13 @@ int orte_util_encode_pidmap(orte_job_t *jdata, opal_byte_object_t *boptr) int orte_util_decode_pidmap(opal_byte_object_t *bo, orte_vpid_t *nprocs, - opal_value_array_t *procs, int8_t **app_idx, - char ***slot_str) + opal_value_array_t *procs) { orte_vpid_t i, num_procs; orte_pmap_t pmap; int32_t *nodes; orte_local_rank_t *local_rank; orte_node_rank_t *node_rank; - int8_t *idx; - int8_t flag; - char **slots; orte_std_cntr_t n; opal_buffer_t buf; int rc; @@ -759,46 +722,6 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo, orte_vpid_t *nprocs, free(local_rank); free(node_rank); - /* only daemons/HNPs need the rest of the data, so if - * we aren't one of those, we are done! 
- */ - if (!orte_process_info.hnp && - !orte_process_info.daemon) { - OBJ_DESTRUCT(&buf); - return ORTE_SUCCESS; - } - - /* allocate memory for app_idx */ - idx = (int8_t*)malloc(num_procs); - /* unpack app_idx in one shot */ - n=num_procs; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, idx, &n, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* hand the array back to the caller */ - *app_idx = idx; - - /* unpack flag to indicate if slot_strings are present */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &flag, &n, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - if (flag) { - /* allocate space */ - slots = (char**)malloc(num_procs * sizeof(char*)); - for (i=0; i < num_procs; i++) { - n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &slots[i], &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - *slot_str = slots; - } - OBJ_DESTRUCT(&buf); return ORTE_SUCCESS; } diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index ef18e6ad7a..5277cb36bf 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -49,8 +49,7 @@ ORTE_DECLSPEC int orte_util_decode_nodemap(opal_byte_object_t *boptr, opal_point ORTE_DECLSPEC int orte_util_encode_pidmap(orte_job_t *jdata, opal_byte_object_t *boptr); ORTE_DECLSPEC int orte_util_decode_pidmap(opal_byte_object_t *boptr, orte_vpid_t *num_procs, - opal_value_array_t *procs, int8_t **app_idx, - char ***slot_str); + opal_value_array_t *procs); END_C_DECLS