diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c index 4202cbfefe..50ff1c6219 100644 --- a/orte/mca/ess/alps/ess_alps_module.c +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -24,6 +24,7 @@ #include "orte/util/show_help.h" #include "opal/mca/paffinity/paffinity.h" +#include "opal/util/argv.h" #include "orte/util/proc_info.h" #include "orte/mca/errmgr/base/base.h" @@ -65,11 +66,18 @@ orte_ess_base_module_t orte_ess_alps_module = { NULL /* ft_event */ }; +/* + * Local variables + */ +static orte_node_rank_t my_node_rank=ORTE_NODE_RANK_INVALID; + static int rte_init(void) { int ret; char *error = NULL; + char **hosts = NULL; + char *nodelist; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -84,12 +92,23 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { + /* get the list of nodes used for this job */ + nodelist = getenv("OMPI_MCA_orte_nodelist"); + + if (NULL != nodelist) { + /* split the node list into an argv array */ + hosts = opal_argv_split(nodelist, ','); + } + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - } else if (ORTE_PROC_IS_TOOL) { + opal_argv_free(hosts); + return ORTE_SUCCESS; + } + + if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); @@ -98,15 +117,15 @@ static int rte_init(void) } /* as a tool, I don't need a nidmap - so just return now */ return ORTE_SUCCESS; - } else { - /* otherwise, I must be an application process - use - * the default procedure to finish my setup - */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_app_setup"; - goto error; - } + } + + /* otherwise, I must be an application process - use + * the default procedure to 
finish my setup + */ + if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_app_setup"; + goto error; } /* setup the nidmap arrays */ @@ -279,6 +298,17 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) { orte_pmap_t *pmap; + /* is this me? */ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid && + proc->vpid == ORTE_PROC_MY_NAME->vpid) { + /* yes it is - reply with my rank. This is necessary + * because the pidmap will not have arrived when I + * am starting up, and if we use static ports, then + * I need to know my node rank during init + */ + return my_node_rank; + } + if (NULL == (pmap = orte_util_lookup_pmap(proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_NODE_RANK_INVALID; @@ -318,36 +348,36 @@ static int update_nidmap(opal_byte_object_t *bo) static int alps_set_name(void) { int rc; - int id; orte_jobid_t jobid; orte_vpid_t starting_vpid; - char* jobid_string; - char* vpid_string; + char* tmp; OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:alps setting name")); - id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL); - mca_base_param_lookup_string(id, &jobid_string); - if (NULL == jobid_string) { + mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid", + true, false, NULL, &tmp); + if (NULL == tmp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_string))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) { ORTE_ERROR_LOG(rc); return(rc); } + free(tmp); - id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL); - mca_base_param_lookup_string(id, &vpid_string); - if (NULL == vpid_string) { + mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid", + true, false, NULL, &tmp); + if (NULL == tmp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != (rc = 
orte_util_convert_string_to_vpid(&starting_vpid, vpid_string))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid, tmp))) { ORTE_ERROR_LOG(rc); return(rc); } + free(tmp); ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid; @@ -355,6 +385,15 @@ static int alps_set_name(void) OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:alps set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* get my node rank in case we are using static ports - this won't + * be present for daemons, so don't error out if we don't have it + */ + mca_base_param_reg_string_name("orte", "ess_node_rank", "Process node rank", + true, false, NULL, &tmp); + if (NULL != tmp) { + my_node_rank = strtol(tmp, NULL, 10); + } + orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size(); return ORTE_SUCCESS; diff --git a/orte/mca/ess/base/base.h b/orte/mca/ess/base/base.h index 52f100ce8e..b5552953f7 100644 --- a/orte/mca/ess/base/base.h +++ b/orte/mca/ess/base/base.h @@ -74,7 +74,7 @@ ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report) __opal_attri ORTE_DECLSPEC int orte_ess_base_tool_setup(void); ORTE_DECLSPEC int orte_ess_base_tool_finalize(void); -ORTE_DECLSPEC int orte_ess_base_orted_setup(void); +ORTE_DECLSPEC int orte_ess_base_orted_setup(char **hosts); ORTE_DECLSPEC int orte_ess_base_orted_finalize(void); /* diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 1f37cd2c23..bd96f68a77 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -50,6 +50,7 @@ #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/name_fns.h" +#include "orte/util/nidmap.h" #include "orte/util/show_help.h" #include "orte/mca/notifier/base/base.h" @@ -61,7 +62,7 @@ static bool plm_in_use; -int orte_ess_base_orted_setup(void) +int orte_ess_base_orted_setup(char **hosts) { int ret; char *error = 
NULL; @@ -114,7 +115,7 @@ int orte_ess_base_orted_setup(void) /* Setup the communication infrastructure */ - /* Runtime Messaging Layer */ + /* Runtime Messaging Layer - this opens/selects the OOB as well */ if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; @@ -125,6 +126,7 @@ int orte_ess_base_orted_setup(void) error = "orte_rml_base_select"; goto error; } + /* Routed system */ if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { ORTE_ERROR_LOG(ret); @@ -169,6 +171,49 @@ int orte_ess_base_orted_setup(void) goto error; } + /* if we are using static ports, then we need to setup + * the daemon info so the RML can function properly + * without requiring a wireup stage. This must be done + * after we enable_comm as that function determines our + * own port, which we need in order to construct the nidmap + */ + if (orte_static_ports) { + /* construct the nidmap arrays */ + if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { + ORTE_ERROR_LOG(ret); + error = "orte_util_nidmap_init"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) { + ORTE_ERROR_LOG(ret); + error = "orte_util_nidmap_init"; + goto error; + } + /* extract the node info from the environment and + * build a nidmap from it + */ + if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) { + ORTE_ERROR_LOG(ret); + error = "construct daemon map from static ports"; + goto error; + } + /* be sure to update the routing tree so the initial "phone home" + * to mpirun goes through the tree! + */ + if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree())) { + ORTE_ERROR_LOG(ret); + error = "failed to update routing tree"; + goto error; + } + } else { + /* initialize the nidmaps */ + if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { + ORTE_ERROR_LOG(ret); + error = "orte_util_nidmap_init"; + goto error; + } + } + /* Now provide a chance for the PLM * to perform any module-specific init functions. 
This * needs to occur AFTER the communications are setup diff --git a/orte/mca/ess/base/ess_base_std_tool.c b/orte/mca/ess/base/ess_base_std_tool.c index 1f62f39c46..f235ce3e5b 100644 --- a/orte/mca/ess/base/ess_base_std_tool.c +++ b/orte/mca/ess/base/ess_base_std_tool.c @@ -167,8 +167,6 @@ int orte_ess_base_tool_finalize(void) orte_iof_base_close(); orte_routed_base_close(); orte_rml_base_close(); - - orte_session_dir_finalize(ORTE_PROC_MY_NAME); - + return ORTE_SUCCESS; } diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index c2a0fef277..bc7822fe78 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -28,6 +28,7 @@ #ifdef HAVE_UNISTD_H #include #endif +#include #include "opal/event/event.h" #include "opal/runtime/opal.h" @@ -39,6 +40,7 @@ #include "opal/mca/base/mca_base_param.h" #include "opal/util/output.h" #include "opal/util/malloc.h" +#include "opal/util/argv.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/rml/rml_types.h" @@ -110,10 +112,17 @@ orte_ess_base_module_t orte_ess_env_module = { #endif }; +/* + * Local variables + */ +static orte_node_rank_t my_node_rank=ORTE_NODE_RANK_INVALID; + static int rte_init(void) { int ret; char *error = NULL; + char **hosts = NULL; + char *nodelist; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -128,13 +137,23 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { + /* get the list of nodes used for this job */ + nodelist = getenv("OMPI_MCA_orte_nodelist"); + + if (NULL != nodelist) { + /* split the node list into an argv array */ + hosts = opal_argv_split(nodelist, ','); + } + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - - } else if (ORTE_PROC_IS_TOOL) { + opal_argv_free(hosts); + return ORTE_SUCCESS; + } + + if 
(ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); @@ -144,17 +163,17 @@ static int rte_init(void) /* as a tool, I don't need a nidmap - so just return now */ return ORTE_SUCCESS; - } else { - /* otherwise, I must be an application process - use - * the default procedure to finish my setup - */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_app_setup"; - goto error; - } } - + + /* otherwise, I must be an application process - use + * the default procedure to finish my setup + */ + if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_app_setup"; + goto error; + } + /* if one was provided, build my nidmap */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { ORTE_ERROR_LOG(ret); @@ -324,6 +343,17 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) { orte_pmap_t *pmap; + /* is this me? */ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid && + proc->vpid == ORTE_PROC_MY_NAME->vpid) { + /* yes it is - reply with my rank. 
This is necessary + * because the pidmap will not have arrived when I + * am starting up, and if we use static ports, then + * I need to know my node rank during init + */ + return my_node_rank; + } + if (NULL == (pmap = orte_util_lookup_pmap(proc))) { return ORTE_NODE_RANK_INVALID; } @@ -365,34 +395,34 @@ static int update_nidmap(opal_byte_object_t *bo) static int env_set_name(void) { - char *jobid_str, *procid_str; - int id, rc; + char *tmp; + int rc; orte_jobid_t jobid; orte_vpid_t vpid; - id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL); - mca_base_param_lookup_string(id, &jobid_str); - if (NULL == jobid_str) { + mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid", + true, false, NULL, &tmp); + if (NULL == tmp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_str))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) { ORTE_ERROR_LOG(rc); return(rc); } - free(jobid_str); + free(tmp); - id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL); - mca_base_param_lookup_string(id, &procid_str); - if (NULL == procid_str) { + mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid", + true, false, NULL, &tmp); + if (NULL == tmp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, procid_str))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, tmp))) { ORTE_ERROR_LOG(rc); return(rc); } - free(procid_str); + free(tmp); ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = vpid; @@ -400,6 +430,15 @@ static int env_set_name(void) OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:env set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* get my node rank in case we are using static ports - this won't + * be present for daemons, so don't error out if we don't have it + */ + 
mca_base_param_reg_string_name("orte", "ess_node_rank", "Process node rank", + true, false, NULL, &tmp); + if (NULL != tmp) { + my_node_rank = strtol(tmp, NULL, 10); + } + /* get the non-name common environmental variables */ if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/ess/lsf/ess_lsf_module.c b/orte/mca/ess/lsf/ess_lsf_module.c index bbc4ae5c2f..8418622dc0 100644 --- a/orte/mca/ess/lsf/ess_lsf_module.c +++ b/orte/mca/ess/lsf/ess_lsf_module.c @@ -76,12 +76,18 @@ orte_ess_base_module_t orte_ess_lsf_module = { NULL /* ft_event */ }; +/* + * Local variables + */ +static orte_node_rank_t my_node_rank=ORTE_NODE_RANK_INVALID; + static int rte_init(void) { int ret; char *error = NULL; - orte_jmap_t *jmap; + char **hosts = NULL; + char *nodelist; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -96,12 +102,23 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { + /* get the list of nodes used for this job */ + nodelist = getenv("OMPI_MCA_orte_nodelist"); + + if (NULL != nodelist) { + /* split the node list into an argv array */ + hosts = opal_argv_split(nodelist, ','); + } + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - } else if (ORTE_PROC_IS_TOOL) { + opal_argv_free(hosts); + return ORTE_SUCCESS; + } + + if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); @@ -111,15 +128,15 @@ static int rte_init(void) /* as a tool, I don't need a nidmap - so just return now */ return ORTE_SUCCESS; - } else { - /* otherwise, I must be an application process - use - * the default procedure to finish my setup - */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { - ORTE_ERROR_LOG(ret); - error = 
"orte_ess_base_app_setup"; - goto error; - } + } + + /* otherwise, I must be an application process - use + * the default procedure to finish my setup + */ + if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_app_setup"; + goto error; } /* setup the nidmap arrays */ @@ -292,6 +309,17 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) { orte_pmap_t *pmap; + /* is this me? */ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid && + proc->vpid == ORTE_PROC_MY_NAME->vpid) { + /* yes it is - reply with my rank. This is necessary + * because the pidmap will not have arrived when I + * am starting up, and if we use static ports, then + * I need to know my node rank during init + */ + return my_node_rank; + } + if (NULL == (pmap = orte_util_lookup_pmap(proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_NODE_RANK_INVALID; @@ -332,36 +360,34 @@ static int update_nidmap(opal_byte_object_t *bo) static int lsf_set_name(void) { int rc; - int id; int lsf_nodeid; orte_jobid_t jobid; orte_vpid_t vpid; - char* jobid_string; - char* vpid_string; + char* tmp; - id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL); - mca_base_param_lookup_string(id, &jobid_string); - if (NULL == jobid_string) { + mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid", + true, false, NULL, &tmp); + if (NULL == tmp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != - (rc = orte_util_convert_string_to_jobid(&jobid, jobid_string))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) { ORTE_ERROR_LOG(rc); return(rc); } + free(tmp); - id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL); - mca_base_param_lookup_string(id, &vpid_string); - if (NULL == vpid_string) { + mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid", + true, false, NULL, &tmp); + if (NULL == tmp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); 
return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != - (rc = orte_util_convert_string_to_vpid(&vpid, vpid_string))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, tmp))) { ORTE_ERROR_LOG(rc); return(rc); } + free(tmp); ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = vpid; @@ -370,6 +396,15 @@ static int lsf_set_name(void) lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); ORTE_PROC_MY_NAME->vpid = lsf_nodeid; + /* get my node rank in case we are using static ports - this won't + * be present for daemons, so don't error out if we don't have it + */ + mca_base_param_reg_string_name("orte", "ess_node_rank", "Process node rank", + true, false, NULL, &tmp); + if (NULL != tmp) { + my_node_rank = strtol(tmp, NULL, 10); + } + /* get the non-name common environmental variables */ if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 006e673a0f..061add1316 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -21,16 +21,6 @@ #include "orte_config.h" #include "orte/constants.h" -#ifdef HAVE_SYS_SOCKET_H -#include -#endif -#ifdef HAVE_NETINET_IN_H -#include -#endif -#ifdef HAVE_ARPA_INET_H -#include -#endif - #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ @@ -38,12 +28,6 @@ #include #endif /* HAVE_STRING_H */ #include -#ifdef HAVE_NETDB_H -#include -#endif -#ifdef HAVE_IFADDRS_H -#include -#endif #include "opal/util/opal_environ.h" @@ -60,7 +44,6 @@ #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "orte/util/nidmap.h" -#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/ess/ess.h" #include "orte/mca/ess/base/base.h" @@ -68,7 +51,6 @@ static char *get_slurm_nodename(int nodeid); static int slurm_set_name(void); -static int build_daemon_nidmap(void); static int rte_init(void); static int rte_finalize(void); @@ -98,11 +80,18 @@ orte_ess_base_module_t 
orte_ess_slurm_module = { NULL /* ft_event */ }; +/* + * Local variables + */ +static orte_node_rank_t my_node_rank=ORTE_NODE_RANK_INVALID; + static int rte_init(void) { int ret; char *error = NULL; + char **hosts = NULL; + char *slurm_nodelist; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -117,38 +106,24 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { + /* get the list of nodes used for this job */ + mca_base_param_reg_string_name("orte", "nodelist", "List of nodes in job", + true, false, NULL, &slurm_nodelist); + + if (NULL != slurm_nodelist) { + /* split the node list into an argv array */ + hosts = opal_argv_split(slurm_nodelist, ','); + } + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - /* if we are using static ports, then we need to setup - * the daemon info so the RML can function properly - * without requiring a wireup stage - */ - if (orte_static_ports) { - /* construct the nidmap arrays */ - if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_util_nidmap_init"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) { - ORTE_ERROR_LOG(ret); - error = "orte_util_nidmap_init"; - goto error; - } - /* extract the node info from the environment and - * build a nidmap from it - */ - if (ORTE_SUCCESS != (ret = build_daemon_nidmap())) { - ORTE_ERROR_LOG(ret); - error = "construct daemon map from static ports"; - goto error; - } - return ORTE_SUCCESS; - } - } else if (ORTE_PROC_IS_TOOL) { + opal_argv_free(hosts); + return ORTE_SUCCESS; + } + + if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); @@ -158,24 +133,23 @@ static int rte_init(void) /* as a tool, I don't 
need a nidmap - so just return now */ return ORTE_SUCCESS; - } else { - /* otherwise, I must be an application process - use - * the default procedure to finish my setup - */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_app_setup"; - goto error; - } } + /* otherwise, I must be an application process - use + * the default procedure to finish my setup + */ + if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_app_setup"; + goto error; + } /* setup the nidmap arrays */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; goto error; } - + return ORTE_SUCCESS; error: @@ -344,6 +318,17 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) { orte_pmap_t *pmap; + /* is this me? */ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid && + proc->vpid == ORTE_PROC_MY_NAME->vpid) { + /* yes it is - reply with my rank. 
This is necessary + * because the pidmap will not have arrived when I + * am starting up, and if we use static ports, then + * I need to know my node rank during init + */ + return my_node_rank; + } + if (NULL == (pmap = orte_util_lookup_pmap(proc))) { return ORTE_NODE_RANK_INVALID; } @@ -387,37 +372,37 @@ static int slurm_set_name(void) { int slurm_nodeid; int rc; - int id; orte_jobid_t jobid; orte_vpid_t vpid; - char* jobid_string; - char* vpid_string; + char* tmp; OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:slurm setting name")); - id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL); - mca_base_param_lookup_string(id, &jobid_string); - if (NULL == jobid_string) { + mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid", + true, false, NULL, &tmp); + if (NULL == tmp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_string))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) { ORTE_ERROR_LOG(rc); return(rc); } + free(tmp); - id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL); - mca_base_param_lookup_string(id, &vpid_string); - if (NULL == vpid_string) { + mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid", + true, false, NULL, &tmp); + if (NULL == tmp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, vpid_string))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, tmp))) { ORTE_ERROR_LOG(rc); return(rc); } + free(tmp); ORTE_PROC_MY_NAME->jobid = jobid; @@ -429,6 +414,15 @@ static int slurm_set_name(void) OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:slurm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* get my node rank in case we are using static ports - this won't + * be present for daemons, so don't error out if we don't have it + */ + 
mca_base_param_reg_string_name("orte", "ess_node_rank", "Process node rank", + true, false, NULL, &tmp); + if (NULL != tmp) { + my_node_rank = strtol(tmp, NULL, 10); + } + /* fix up the system info nodename to match exactly what slurm returned */ if (NULL != orte_process_info.nodename) { free(orte_process_info.nodename); @@ -456,8 +450,8 @@ get_slurm_nodename(int nodeid) char *slurm_nodelist; char *ret; - slurm_nodelist = getenv("OMPI_MCA_orte_slurm_nodelist"); - + mca_base_param_reg_string_name("orte", "nodelist", "List of nodes in job", + true, false, NULL, &slurm_nodelist); if (NULL == slurm_nodelist) { return NULL; } @@ -480,105 +474,3 @@ get_slurm_nodename(int nodeid) /* All done */ return ret; } - -static int build_daemon_nidmap(void) -{ - char **names = NULL; - char *slurm_nodelist; - orte_nid_t *node; - int i, num_nodes; - int rc; - struct hostent *h; - opal_buffer_t buf; - orte_process_name_t proc; - char *uri, *addr; - char *proc_name; - - OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output, - "ess:slurm build daemon nidmap")); - - slurm_nodelist = getenv("OMPI_MCA_orte_slurm_nodelist"); - - if (NULL == slurm_nodelist) { - return ORTE_ERR_NOT_FOUND; - } - - /* split the node list into an argv array */ - names = opal_argv_split(slurm_nodelist, ','); - if (NULL == names) { /* got an error */ - return ORTE_ERR_NOT_FOUND; - } - - num_nodes = opal_argv_count(names); - - OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output, - "ess:slurm:build:daemon:nidmap found %d nodes", num_nodes)); - - /* set the size of the nidmap storage so we minimize realloc's */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes+1))) { - return rc; - } - - /* install the entry for the HNP */ - node = OBJ_NEW(orte_nid_t); - node->name = strdup("HNP"); - node->daemon = 0; - /* the arch defaults to our arch so that non-hetero - * case will yield correct behavior - */ - opal_pointer_array_set_item(&orte_nidmap, 0, node); - - /* the daemon vpids will be assigned in order, - * 
starting with vpid=1 for the first node in - * the list - */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - proc.jobid = ORTE_PROC_MY_NAME->jobid; - for (i=0; i < num_nodes; i++) { - node = OBJ_NEW(orte_nid_t); - node->name = strdup(names[i]); - node->daemon = i+1; - /* the arch defaults to our arch so that non-hetero - * case will yield correct behavior - */ - opal_pointer_array_set_item(&orte_nidmap, node->daemon, node); - - opal_output(0, "%s lookup address for node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name); - /* lookup the address of this node */ - if (NULL == (h = gethostbyname(node->name))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]); - - OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output, - "ess:slurm:build:daemon:nidmap node %s daemon %d addr %s", - node->name, (int)node->daemon, addr)); - - /* since we are using static ports, all my fellow daemons will be on my - * port. Setup the contact info for each daemon in my hash tables. 
Note - * that this will -not- open a port to those daemons, but will only - * define the info necessary for opening such a port if/when I communicate - * to them - */ - /* construct the URI */ - proc.vpid = node->daemon; -orte_util_convert_process_name_to_string(&proc_name, &proc); - asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port); -opal_output(0, "contact info %s", uri); - opal_dss.pack(&buf, &uri, 1, OPAL_STRING); - free(proc_name); - free(uri); - } - - /* load the hash tables */ - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { - ORTE_ERROR_LOG(rc); - } - OBJ_DESTRUCT(&buf); - - opal_argv_free(names); - - /* All done */ - return ORTE_SUCCESS; -} diff --git a/orte/mca/ess/tm/Makefile.am b/orte/mca/ess/tm/Makefile.am new file mode 100644 index 0000000000..839455f202 --- /dev/null +++ b/orte/mca/ess/tm/Makefile.am @@ -0,0 +1,43 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + ess_tm.h \ + ess_tm_component.c \ + ess_tm_module.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). 
+ +if OMPI_BUILD_ess_tm_DSO +component_noinst = +component_install = mca_ess_tm.la +else +component_noinst = libmca_ess_tm.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_ess_tm_la_SOURCES = $(sources) +mca_ess_tm_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_ess_tm_la_SOURCES =$(sources) +libmca_ess_tm_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/ess/tm/configure.m4 b/orte/mca/ess/tm/configure.m4 new file mode 100644 index 0000000000..3ca4c32100 --- /dev/null +++ b/orte/mca/ess/tm/configure.m4 @@ -0,0 +1,37 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ess_tm_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_ess_tm_CONFIG],[ + OMPI_CHECK_TM([ess_tm], [ess_tm_good=1], [ess_tm_good=0]) + + # if check worked, set wrapper flags if so. 
+ # Evaluate succeed / fail + AS_IF([test "$ess_tm_good" = "1"], + [ess_tm_WRAPPER_EXTRA_LDFLAGS="$ess_tm_LDFLAGS" + ess_tm_WRAPPER_EXTRA_LIBS="$ess_tm_LIBS" + $1], + [$2]) + + # set build flags to use in makefile + AC_SUBST([ess_tm_CPPFLAGS]) + AC_SUBST([ess_tm_LDFLAGS]) + AC_SUBST([ess_tm_LIBS]) +])dnl diff --git a/orte/mca/ess/tm/configure.params b/orte/mca/ess/tm/configure.params new file mode 100644 index 0000000000..08f3f59a30 --- /dev/null +++ b/orte/mca/ess/tm/configure.params @@ -0,0 +1,27 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007 Los Alamos National Security, LLC. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +PARAM_CONFIG_FILES="Makefile" +# +# Set the config priority so that, if we can build, +# all the TM and supporting components will build + +PARAM_CONFIG_PRIORITY=10 diff --git a/orte/mca/ess/tm/ess_tm.h b/orte/mca/ess/tm/ess_tm.h new file mode 100644 index 0000000000..4047e3bea3 --- /dev/null +++ b/orte/mca/ess/tm/ess_tm.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
+ * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef ORTE_ESS_TM_H +#define ORTE_ESS_TM_H + +BEGIN_C_DECLS + +ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_tm_component; + +/* + * Module open / close + */ +int orte_ess_tm_component_open(void); +int orte_ess_tm_component_close(void); +int orte_ess_tm_component_query(mca_base_module_t **module, int *priority); + +END_C_DECLS + +#endif /* ORTE_ESS_TM_H */ diff --git a/orte/mca/ess/tm/ess_tm_component.c b/orte/mca/ess/tm/ess_tm_component.c new file mode 100644 index 0000000000..106a3365e9 --- /dev/null +++ b/orte/mca/ess/tm/ess_tm_component.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * These symbols are in a file by themselves to provide nice linker + * semantics. Since linkers generally pull in symbols by object + * files, keeping these symbols as the only symbols in this file + * prevents utility programs such as "ompi_info" from having to import + * entire components just to query their version and parameters. 
+ */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include "opal/mca/base/mca_base_param.h" + +#include "orte/util/proc_info.h" + +#include "orte/mca/ess/ess.h" +#include "orte/mca/ess/tm/ess_tm.h" + +extern orte_ess_base_module_t orte_ess_tm_module; + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ +orte_ess_base_component_t mca_ess_tm_component = { + { + ORTE_ESS_BASE_VERSION_2_0_0, + + /* Component name and version */ + "tm", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + orte_ess_tm_component_open, + orte_ess_tm_component_close, + orte_ess_tm_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } +}; + + +int +orte_ess_tm_component_open(void) +{ + return ORTE_SUCCESS; +} + + +int orte_ess_tm_component_query(mca_base_module_t **module, int *priority) +{ + /* Are we running under a TM job? Were + * we given a path back to the HNP? If the + * answer to both is "yes", then we were launched + * by mpirun in a tm world + */ + + if (NULL != getenv("PBS_JOBID") && + NULL != orte_process_info.my_hnp_uri) { + *priority = 30; + *module = (mca_base_module_t *)&orte_ess_tm_module; + return ORTE_SUCCESS; + } + + /* Sadly, no */ + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} + + +int +orte_ess_tm_component_close(void) +{ + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ess/tm/ess_tm_module.c b/orte/mca/ess/tm/ess_tm_module.c new file mode 100644 index 0000000000..139610791f --- /dev/null +++ b/orte/mca/ess/tm/ess_tm_module.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ +#include + + +#include "opal/util/opal_environ.h" +#include "opal/util/output.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/util/argv.h" +#include "opal/class/opal_pointer_array.h" +#include "opal/dss/dss.h" +#include "opal/mca/paffinity/paffinity.h" + +#include "orte/util/proc_info.h" +#include "orte/util/show_help.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "orte/util/nidmap.h" + +#include "orte/mca/ess/ess.h" +#include "orte/mca/ess/base/base.h" +#include "orte/mca/ess/tm/ess_tm.h" + +static int tm_set_name(void); + +static int rte_init(void); +static int rte_finalize(void); +static uint8_t proc_get_locality(orte_process_name_t *proc); +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); +static char* proc_get_hostname(orte_process_name_t *proc); +static uint32_t proc_get_arch(orte_process_name_t *proc); +static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); +static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); +static int update_arch(orte_process_name_t *proc, uint32_t arch); +static int update_pidmap(opal_byte_object_t *bo); +static int update_nidmap(opal_byte_object_t *bo); + +orte_ess_base_module_t orte_ess_tm_module = { + rte_init, + rte_finalize, + orte_ess_base_app_abort, + proc_get_locality, + proc_get_daemon, + proc_get_hostname, + proc_get_arch, + proc_get_local_rank, + 
proc_get_node_rank,
+    update_arch,
+    update_pidmap,
+    update_nidmap,
+    NULL /* ft_event */
+};
+
+/*
+ * Local variables
+ */
+static orte_node_rank_t my_node_rank=ORTE_NODE_RANK_INVALID;
+
+
+/*
+ * Initialize the runtime for a process selected into the TM ess module.
+ *
+ * Runs the standard prolog, sets this process's name via tm_set_name,
+ * and then finishes setup according to the process type:
+ *   - daemon: split the optional OMPI_MCA_orte_nodelist value on ','
+ *             and hand it to orte_ess_base_orted_setup, then return
+ *   - tool:   orte_ess_base_tool_setup, then return (no nidmap needed)
+ *   - app:    orte_ess_base_app_setup, then initialize the nidmap from
+ *             orte_process_info.sync_buf
+ *
+ * Returns ORTE_SUCCESS, or the code of the first step that failed
+ * after reporting it through orte_show_help.
+ */
+static int rte_init(void)
+{
+    int ret;
+    char *error = NULL;
+    char **hosts = NULL;
+    char *nodelist;
+
+    /* run the prolog */
+    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
+        error = "orte_ess_base_std_prolog";
+        goto error;
+    }
+
+    /* Start by getting a unique name. Do not continue if that fails:
+     * everything below relies on ORTE_PROC_MY_NAME being set, and
+     * tm_set_name has already logged the specific cause
+     */
+    if (ORTE_SUCCESS != (ret = tm_set_name())) {
+        error = "tm_set_name";
+        goto error;
+    }
+
+    /* if I am a daemon, complete my setup using the
+     * default procedure
+     */
+    if (ORTE_PROC_IS_DAEMON) {
+        /* get the list of nodes used for this job */
+        nodelist = getenv("OMPI_MCA_orte_nodelist");
+
+        if (NULL != nodelist) {
+            /* split the node list into an argv array */
+            hosts = opal_argv_split(nodelist, ',');
+        }
+        if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
+            ORTE_ERROR_LOG(ret);
+            error = "orte_ess_base_orted_setup";
+            /* release the host list on the failure path too */
+            opal_argv_free(hosts);
+            goto error;
+        }
+        opal_argv_free(hosts);
+        return ORTE_SUCCESS;
+    }
+
+    if (ORTE_PROC_IS_TOOL) {
+        /* otherwise, if I am a tool proc, use that procedure */
+        if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
+            ORTE_ERROR_LOG(ret);
+            error = "orte_ess_base_tool_setup";
+            goto error;
+        }
+        /* as a tool, I don't need a nidmap - so just return now */
+        return ORTE_SUCCESS;
+    }
+
+    /* otherwise, I must be an application process - use
+     * the default procedure to finish my setup
+     */
+    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
+        ORTE_ERROR_LOG(ret);
+        error = "orte_ess_base_app_setup";
+        goto error;
+    }
+
+    /* setup the nidmap arrays */
+    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
+        ORTE_ERROR_LOG(ret);
+        error = "orte_util_nidmap_init";
+        goto error;
+    }
+
+    return ORTE_SUCCESS;
+
+error:
+    /* "error" names the step that failed; "ret" carries its code */
+    orte_show_help("help-orte-runtime.txt",
+                   "orte_init:startup:internal-failure",
+                   true, error, ORTE_ERROR_NAME(ret), ret);
+
+    return ret;
+}
+
+static int rte_finalize(void)
+{
+    int ret;
+ + /* if I am a daemon, finalize using the default procedure */ + if (ORTE_PROC_IS_DAEMON) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) { + ORTE_ERROR_LOG(ret); + } + } else if (ORTE_PROC_IS_TOOL) { + /* otherwise, if I am a tool proc, use that procedure */ + if (ORTE_SUCCESS != (ret = orte_ess_base_tool_finalize())) { + ORTE_ERROR_LOG(ret); + } + /* as a tool, I didn't create a nidmap - so just return now */ + return ret; + } else { + /* otherwise, I must be an application process + * use the default procedure to finish + */ + if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { + ORTE_ERROR_LOG(ret); + } + } + + /* deconstruct my nidmap and jobmap arrays */ + orte_util_nidmap_finalize(); + + return ret; +} + +static uint8_t proc_get_locality(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_util_lookup_nid(proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return OPAL_PROC_NON_LOCAL; + } + + if (nid->daemon == ORTE_PROC_MY_DAEMON->vpid) { + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:tm: proc %s is LOCAL", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + return (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER); + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:tm: proc %s is REMOTE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + + return OPAL_PROC_NON_LOCAL; + +} + +static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_util_lookup_nid(proc))) { + /* don't generate an error message here - it could be a call to + * get a route to a proc in an unknown job. 
Let the caller decide + * if an error message is required + */ + return ORTE_VPID_INVALID; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:tm: proc %s is hosted by daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + ORTE_VPID_PRINT(nid->daemon))); + + return nid->daemon; +} + +static char* proc_get_hostname(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_util_lookup_nid(proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return NULL; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:tm: proc %s is on host %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + nid->name)); + + return nid->name; +} + +static uint32_t proc_get_arch(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_util_lookup_nid(proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return 0; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:tm: proc %s has arch %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + nid->arch)); + + return nid->arch; +} + +static int update_arch(orte_process_name_t *proc, uint32_t arch) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_util_lookup_nid(proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:tm: updating proc %s to arch %0x", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + arch)); + + nid->arch = arch; + + return ORTE_SUCCESS; +} + +static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc) +{ + orte_pmap_t *pmap; + + if (NULL == (pmap = orte_util_lookup_pmap(proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_LOCAL_RANK_INVALID; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:tm: proc %s has local rank %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + (int)pmap->local_rank)); + + return pmap->local_rank; +} + +static orte_node_rank_t 
proc_get_node_rank(orte_process_name_t *proc)
+{
+    orte_pmap_t *pmap;
+
+    /* is this me? */
+    if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
+        proc->vpid == ORTE_PROC_MY_NAME->vpid) {
+        /* yes it is - reply with my rank. This is necessary
+         * because the pidmap will not have arrived when I
+         * am starting up, and if we use static ports, then
+         * I need to know my node rank during init
+         */
+        return my_node_rank;
+    }
+
+    if (NULL == (pmap = orte_util_lookup_pmap(proc))) {
+        return ORTE_NODE_RANK_INVALID;
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:tm: proc %s has node rank %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc),
+                         (int)pmap->node_rank));
+
+    return pmap->node_rank;
+}
+
+static int update_pidmap(opal_byte_object_t *bo)
+{
+    int ret;
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:tm: updating pidmap",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    /* build the pmap */
+    if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo))) {
+        ORTE_ERROR_LOG(ret);
+    }
+
+    return ret;
+}
+
+static int update_nidmap(opal_byte_object_t *bo)
+{
+    int rc;
+    /* decode the nidmap - the util will know what to do */
+    if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo))) {
+        ORTE_ERROR_LOG(rc);
+    }
+    return rc;
+}
+
+/*
+ * Set this process's name from the orte_ess_jobid and orte_ess_vpid
+ * params, record the optional orte_ess_node_rank in my_node_rank, and
+ * then pick up the remaining common values via orte_ess_env_get.
+ *
+ * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND when the jobid or vpid param
+ * is missing, or the error code of a failed string conversion or of
+ * orte_ess_env_get. Every failure is logged before returning.
+ */
+static int tm_set_name(void)
+{
+    int rc;
+    orte_jobid_t jobid;
+    orte_vpid_t vpid;
+    char* tmp;
+
+    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
+                         "ess:tm setting name"));
+
+    mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
+                                   true, false, NULL, &tmp);
+    if (NULL == tmp) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return ORTE_ERR_NOT_FOUND;
+    }
+    rc = orte_util_convert_string_to_jobid(&jobid, tmp);
+    /* the param string is ours to release whether or not it parsed */
+    free(tmp);
+    if (ORTE_SUCCESS != rc) {
+        ORTE_ERROR_LOG(rc);
+        return(rc);
+    }
+
+    mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
+                                   true, false, NULL, &tmp);
+    if (NULL == tmp) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return ORTE_ERR_NOT_FOUND;
+    }
+    rc = orte_util_convert_string_to_vpid(&vpid, tmp);
+    free(tmp);
+    if (ORTE_SUCCESS != rc) {
+        ORTE_ERROR_LOG(rc);
+        return(rc);
+    }
+
+    ORTE_PROC_MY_NAME->jobid = jobid;
+    ORTE_PROC_MY_NAME->vpid = vpid;
+
+    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
+                         "ess:tm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    /* get my node rank in case we are using static ports - this won't
+     * be present for daemons, so don't error out if we don't have it
+     */
+    mca_base_param_reg_string_name("orte", "ess_node_rank", "Process node rank",
+                                   true, false, NULL, &tmp);
+    if (NULL != tmp) {
+        my_node_rank = strtol(tmp, NULL, 10);
+        free(tmp);
+    }
+
+    /* get the non-name common environmental variables */
+    if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    return ORTE_SUCCESS;
+}
+
diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 3ab9ab9379..9d5d1a5031 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -921,6 +921,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, orte_std_cntr_t proc_rank; orte_odls_job_t *jobdat; orte_local_rank_t local_rank; + orte_node_rank_t node_rank; char *pathenv = NULL, *mpiexec_pathenv = NULL; char basedir[MAXPATHLEN]; char dir[MAXPATHLEN]; @@ -1285,7 +1286,31 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env); free(value); - /* if we are timing things, record when we are going to launch this proc */ + /* users would appreciate being given a public environmental variable + * that also represents the node rank value - something MPI specific - so + * do that here. + * + * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
+ * We know - just live with it + */ + if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) { + ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); + rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; + goto CLEANUP; + } + asprintf(&value, "%lu", (unsigned long) node_rank); + opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env); + /* set an mca param for it too */ + if(NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + goto CLEANUP; + } + opal_setenv(param, value, true, &app->env); + free(param); + free(value); + + /* if we are timing things, record when we are going to launch this proc */ if (orte_timing) { gettimeofday(&child->starttime, NULL); } diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index 6b5240f798..3f5fbc3de9 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -796,6 +796,9 @@ socket_binded: * remembering to convert it back from network byte order first */ orte_process_info.my_port = ntohs(*target_port); + if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) { + opal_output(0, "%s assigned port %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_process_info.my_port); + } /* setup listen backlog to maximum allowed by kernel */ if(listen(*target_sd, SOMAXCONN) < 0) { @@ -1406,32 +1409,46 @@ int mca_oob_tcp_resolve(mca_oob_tcp_peer_t* peer) * just look to see which static port family was provided */ if (NULL != mca_oob_tcp_component.tcp4_static_ports) { - /* lookup the node rank of the proc */ - if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(&peer->peer_name)) || - (nrank+1) > opal_argv_count(mca_oob_tcp_component.tcp4_static_ports)) { - /* this isn't an error - it just means we don't know - * how to compute a contact info for this proc + if (ORTE_JOBID_IS_DAEMON(peer->peer_name.jobid)) { + /* we are trying to talk to a daemon, which will always + * be listening on 
the first port in the range
                  */
-                rc = ORTE_ERR_ADDRESSEE_UNKNOWN;
-                goto unlock;
+                port = strtol(mca_oob_tcp_component.tcp4_static_ports[0], NULL, 10);
+            } else {
+                /* lookup the node rank of the proc - it uses entry nrank+1, so we need at least nrank+2 ports */
+                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(&peer->peer_name)) ||
+                    (nrank+2) > opal_argv_count(mca_oob_tcp_component.tcp4_static_ports)) {
+                    /* this isn't an error - it just means we don't know
+                     * how to compute a contact info for this proc
+                     */
+                    rc = ORTE_ERR_ADDRESSEE_UNKNOWN;
+                    goto unlock;
+                }
+                /* any daemon takes the first entry, so we start with the second */
+                port = strtol(mca_oob_tcp_component.tcp4_static_ports[nrank+1], NULL, 10);
             }
-            /* any daemon takes the first entry, so we start with the second */
-            port = strtol(mca_oob_tcp_component.tcp4_static_ports[nrank+1], NULL, 10);
             /* create the uri */
             asprintf(&uri, "tcp://%s:%d", haddr, port);
 #if OPAL_WANT_IPV6
         } else if (NULL != mca_oob_tcp_component.tcp6_static_ports) {
-            /* lookup the node rank of the proc */
-            if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(&peer->peer_name)) ||
-                (nrank+1) > opal_argv_count(mca_oob_tcp_component.tcp6_static_ports)) {
-                /* this isn't an error - it just means we don't know
-                 * how to compute a contact info for this proc
+            if (ORTE_JOBID_IS_DAEMON(peer->peer_name.jobid)) {
+                /* we are trying to talk to a daemon, which will always
+                 * be listening on the first port in the range
                  */
-                rc = ORTE_ERR_ADDRESSEE_UNKNOWN;
-                goto unlock;
+                port = strtol(mca_oob_tcp_component.tcp6_static_ports[0], NULL, 10);
+            } else {
+                /* lookup the node rank of the proc - it uses entry nrank+1, so we need at least nrank+2 ports */
+                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(&peer->peer_name)) ||
+                    (nrank+2) > opal_argv_count(mca_oob_tcp_component.tcp6_static_ports)) {
+                    /* this isn't an error - it just means we don't know
+                     * how to compute a contact info for this proc
+                     */
+                    rc = ORTE_ERR_ADDRESSEE_UNKNOWN;
+                    goto unlock;
+                }
+                /* any daemon takes the first entry, so we start with the second */
+ port = strtol(mca_oob_tcp_component.tcp6_static_ports[nrank+1], NULL, 10); } - /* any daemon takes the first entry, so we start with the second */ - port = strtol(mca_oob_tcp_component.tcp6_static_ports[nrank+1], NULL, 10); /* create the uri */ asprintf(&uri, "tcp6://%s:%d", haddr, port); #endif /* OPAL_WANT_IPV6 */ diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index 553231a092..8b8698ec9a 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -275,7 +275,8 @@ static int plm_alps_launch_job(orte_job_t *jdata) orte_plm_base_orted_append_basic_args(&argc, &argv, "alps", &proc_vpid_index, - false); + false, nodelist_flat); + free(nodelist_flat); /* tell the new daemons the base of the name list so they can compute * their own name on the other end @@ -333,12 +334,6 @@ static int plm_alps_launch_job(orte_job_t *jdata) /* setup environment */ env = opal_argv_copy(orte_launch_environ); - /* add the nodelist */ - var = mca_base_param_environ_variable("orte", "alps", "nodelist"); - opal_setenv(var, nodelist_flat, true, &env); - free(nodelist_flat); - free(var); - if (mca_plm_alps_component.timing) { if (0 != gettimeofday(&launchstart, NULL)) { opal_output(0, "plm_alps: could not obtain start time"); diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 05c74b5e1a..460d5c649f 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -951,7 +951,7 @@ int orte_plm_base_setup_orted_cmd(int *argc, char ***argv) int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, char *ess, int *proc_vpid_index, - bool heartbeat) + bool heartbeat, char *nodes) { char *param = NULL; int loc_id; @@ -996,7 +996,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, free(param); } - /* tell the orted what SDS component to use */ + /* tell the orted what ESS component to use */ 
opal_argv_append(argc, argv, "-mca"); opal_argv_append(argc, argv, "ess"); opal_argv_append(argc, argv, ess); @@ -1043,6 +1043,13 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, param); free(param); + /* if given, pass the node list */ + if (NULL != nodes) { + opal_argv_append(argc, argv, "-mca"); + opal_argv_append(argc, argv, "orte_nodelist"); + opal_argv_append(argc, argv, nodes); + } + /* pass along any cmd line MCA params provided to mpirun, * being sure to "purge" any that would cause problems * on backend nodes @@ -1367,12 +1374,8 @@ CHECK_ALL_JOBS: ORTE_NAME_PRINT(&proc->name))); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); - /* set the entry in the job data object to NULL */ - opal_pointer_array_set_item(jdata->procs, (int)proc->name.vpid, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(proc); - /* now release it again from the job data object */ - OBJ_RELEASE(proc); } } OBJ_RELEASE(map); @@ -1391,7 +1394,13 @@ CHECK_ALL_JOBS: */ continue; } - if (NULL != jdata && job->jobid == jdata->jobid) { + /* if this is the job we are checking AND it normally terminated, + * then go ahead and release it. We cannot release it if it + * abnormally terminated as mpirun needs the info so it can + * report appropriately to the user + */ + if (NULL != jdata && job->jobid == jdata->jobid && + jdata->state == ORTE_JOB_STATE_TERMINATED) { /* release this object, ensuring that the * pointer array internal accounting * is maintained! 
diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index ced0972333..f731b135df 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -138,9 +138,9 @@ ORTE_DECLSPEC void orte_plm_base_recv(int status, orte_process_name_t* sender, * Construct basic ORTE Daemon command line arguments */ ORTE_DECLSPEC int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, - char *sds, + char *ess_module, int *proc_vpid_index, - bool heartbeat); + bool heartbeat, char *nodes); /* * Proxy functions for use by daemons and application procs diff --git a/orte/mca/plm/ccp/plm_ccp_module.c b/orte/mca/plm/ccp/plm_ccp_module.c index 118631eeb3..f43d8d4dca 100644 --- a/orte/mca/plm/ccp/plm_ccp_module.c +++ b/orte/mca/plm/ccp/plm_ccp_module.c @@ -222,7 +222,7 @@ GETMAP: /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "env", &proc_vpid_index, - false); + false, NULL); if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c index 047f7b170d..4fe0790fbb 100644 --- a/orte/mca/plm/lsf/plm_lsf_module.c +++ b/orte/mca/plm/lsf/plm_lsf_module.c @@ -130,6 +130,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata) int rc; char** env = NULL; char **nodelist_argv; + char *nodelist; int nodelist_argc; char *vpid_string; int i; @@ -207,7 +208,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata) */ opal_argv_append(&nodelist_argc, &nodelist_argv, nodes[nnode]->name); } - + nodelist = opal_argv_join(nodelist_argv, ','); /* * start building argv array @@ -226,7 +227,8 @@ static int plm_lsf_launch_job(orte_job_t *jdata) orte_plm_base_orted_append_basic_args(&argc, &argv, "lsf", &proc_vpid_index, - false); + false, nodelist); + free(nodelist); /* tell the new daemons the base of the name list so they can compute * their own name on the other end diff --git 
a/orte/mca/plm/process/plm_process_module.c b/orte/mca/plm/process/plm_process_module.c index f3a37c326a..159c626c91 100644 --- a/orte/mca/plm/process/plm_process_module.c +++ b/orte/mca/plm/process/plm_process_module.c @@ -561,7 +561,7 @@ int orte_plm_process_launch(orte_job_t *jdata) orte_plm_base_orted_append_basic_args(&argc, &argv, "env", &proc_vpid_index, - false); + false, NULL); if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 08bd76c358..8437ae2fe8 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -659,7 +659,7 @@ static int setup_launch(int *argcptr, char ***argvptr, orte_plm_base_orted_append_basic_args(&argc, &argv, "env", proc_vpid_index, - true); + true, NULL); /* in the rsh environment, we can append multi-word arguments * by enclosing them in quotes. Check for any multi-word diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index 254a243c3f..9a32971122 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -151,7 +151,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata) char* var; char *nodelist_flat; char **nodelist_argv; - int nodelist_argc; char *name_string; char **custom_strings; int num_args, i; @@ -272,7 +271,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata) /* create nodelist */ nodelist_argv = NULL; - nodelist_argc = 0; for (n=0; n < map->num_nodes; n++ ) { /* if the daemon already exists on this node, then @@ -285,7 +283,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata) /* otherwise, add it to the list of nodes upon which * we need to launch a daemon */ - opal_argv_append(&nodelist_argc, &nodelist_argv, nodes[n]->name); + opal_argv_append_nosize(&nodelist_argv, nodes[n]->name); } if (0 == opal_argv_count(nodelist_argv)) { orte_show_help("help-plm-slurm.txt", 
"no-hosts-in-list", true); @@ -311,9 +309,9 @@ static int plm_slurm_launch_job(orte_job_t *jdata) /* Add basic orted command line options, including debug flags */ orte_plm_base_orted_append_basic_args(&argc, &argv, - "slurm", - &proc_vpid_index, - false); + "slurm", &proc_vpid_index, + false, nodelist_flat); + free(nodelist_flat); /* tell the new daemons the base of the name list so they can compute * their own name on the other end @@ -372,12 +370,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata) /* setup environment */ env = opal_argv_copy(orte_launch_environ); - /* add the nodelist */ - var = mca_base_param_environ_variable("orte", "slurm", "nodelist"); - opal_setenv(var, nodelist_flat, true, &env); - free(nodelist_flat); - free(var); - /* enable local launch by the orteds */ var = mca_base_param_environ_variable("plm", NULL, NULL); opal_setenv(var, "rsh", true, &env); diff --git a/orte/mca/plm/submit/pls_submit_module.c b/orte/mca/plm/submit/pls_submit_module.c index 7d1bf08718..ed98d8b57f 100644 --- a/orte/mca/plm/submit/pls_submit_module.c +++ b/orte/mca/plm/submit/pls_submit_module.c @@ -542,7 +542,7 @@ int orte_plm_submit_launch(orte_job_t *jdata) */ orte_plm_base_orted_append_basic_args(&argc, &argv, "env", - &proc_vpid_index); + &proc_vpid_index, NULL); local_exec_index_end = argc; if (mca_plm_submit_component.debug) { diff --git a/orte/mca/plm/tm/plm_tm_module.c b/orte/mca/plm/tm/plm_tm_module.c index 37e31fbd23..beb9d372c8 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -144,6 +144,7 @@ static int plm_tm_launch_job(orte_job_t *jdata) char **env = NULL; char *var; char **argv = NULL; + char **nodeargv; int argc = 0; int rc; bool connected = false; @@ -156,6 +157,7 @@ static int plm_tm_launch_job(orte_job_t *jdata) bool failed_launch = true; mode_t current_umask; orte_jobid_t failed_job; + char *nodelist; if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) { /* if this is a request to launch a local slave, @@ 
-229,11 +231,28 @@ static int plm_tm_launch_job(orte_job_t *jdata) /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); + /* create a list of nodes in this launch */ + nodeargv = NULL; + for (i = 0; i < map->num_nodes; i++) { + orte_node_t* node = nodes[i]; + + /* if this daemon already exists, don't launch it! */ + if (node->daemon_launched) { + continue; + } + + /* add to list */ + opal_argv_append_nosize(&nodeargv, node->name); + } + nodelist = opal_argv_join(nodeargv, ','); + opal_argv_free(nodeargv); + /* Add basic orted command line options */ - orte_plm_base_orted_append_basic_args(&argc, &argv, "env", + orte_plm_base_orted_append_basic_args(&argc, &argv, "tm", &proc_vpid_index, - true); - + true, nodelist); + free(nodelist); + if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, diff --git a/orte/mca/plm/tmd/plm_tmd_module.c b/orte/mca/plm/tmd/plm_tmd_module.c index de755f97dd..afa1527268 100644 --- a/orte/mca/plm/tmd/plm_tmd_module.c +++ b/orte/mca/plm/tmd/plm_tmd_module.c @@ -314,7 +314,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata) /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "env", &proc_vpid_index, - true); + true, NULL); if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); diff --git a/orte/mca/plm/xgrid/src/plm_xgrid_client.m b/orte/mca/plm/xgrid/src/plm_xgrid_client.m index 5bd6650176..b54b33d068 100644 --- a/orte/mca/plm/xgrid/src/plm_xgrid_client.m +++ b/orte/mca/plm/xgrid/src/plm_xgrid_client.m @@ -438,7 +438,7 @@ cleanup: orte_plm_base_orted_append_basic_args(&argc, &argv, "env", NULL, - true); + true, NULL); /* Note that capacity is a starting capacity, not max */ NSMutableArray *ret = [NSMutableArray arrayWithCapacity: argc]; diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c 
b/orte/mca/rmaps/round_robin/rmaps_rr.c index 19d3ae007f..d69aaa0f46 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -289,7 +289,8 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) /* start at the beginning... */ vpid_start = 0; - + jdata->num_procs = 0; + /* if loadbalancing is requested, then we need to compute * the #procs/node - note that this cannot be done * if we are doing pernode or if #procs was not given diff --git a/orte/mca/rmaps/seq/rmaps_seq.c b/orte/mca/rmaps/seq/rmaps_seq.c index e0f2d94f4f..1a8c06f220 100644 --- a/orte/mca/rmaps/seq/rmaps_seq.c +++ b/orte/mca/rmaps/seq/rmaps_seq.c @@ -88,6 +88,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) /* start at the beginning... */ vpid = 0; + jdata->num_procs = 0; /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->num_apps; i++) { diff --git a/orte/mca/rmaps/topo/rmaps_topo.c b/orte/mca/rmaps/topo/rmaps_topo.c index 65a4b13cc8..e3a114906a 100644 --- a/orte/mca/rmaps/topo/rmaps_topo.c +++ b/orte/mca/rmaps/topo/rmaps_topo.c @@ -291,7 +291,8 @@ static int topo_map(orte_job_t *jdata) /* start at the beginning... 
*/ vpid_start = 0; - + jdata->num_procs = 0; + /* get the graph of nodes */ if (ORTE_SUCCESS != (rc = opal_carto_base_get_host_graph(&graph, "SLOT"))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index adcae297e2..d7ee64b0b5 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -412,7 +412,7 @@ static orte_process_name_t get_route(orte_process_name_t *target) ret = &daemon; found: - OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_binomial_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), @@ -563,8 +563,13 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) return rc; } - /* set our lifeline to the HNP - we will abort if that connection is lost */ - lifeline = ORTE_PROC_MY_HNP; + /* if we are using static ports, set my lifeline to point at my parent */ + if (orte_static_ports) { + lifeline = &my_parent; + } else { + /* set our lifeline to the HNP - we will abort if that connection is lost */ + lifeline = ORTE_PROC_MY_HNP; + } /* daemons will send their contact info back to the HNP as * part of the message confirming they are read to go. 
HNP's diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index a1681c89b2..28a66b2d61 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -146,7 +146,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, int32_t num_vals, opal_data_type_t type) { int rc; - int32_t i, j; + int32_t i, j, np; orte_job_t **jobs; orte_proc_t *proc; @@ -201,20 +201,35 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } - /* pack the number of procs */ + /* pack the number of procs for the job */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&(jobs[i]->num_procs)), 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return rc; } - if (0 < jobs[i]->num_procs) { + /* there might actually not be any procs in the array, so we + * need to count them first + */ + np = 0; + for (j=0; j < jobs[i]->procs->size; j++) { + if (NULL != opal_pointer_array_get_item(jobs[i]->procs, j)) { + np++; + } + } + /* now pack that number */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&np, 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (0 < np) { for (j=0; j < jobs[i]->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) { continue; } if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)&proc, 1, ORTE_PROC))) { + (void*)&proc, 1, ORTE_PROC))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index 948e8ff7cc..4cc0d8af59 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -150,10 +150,9 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, int32_t *num_vals, opal_data_type_t type) { int rc; - int32_t i, j, n; + int32_t i, j, n, np, nprocs; 
orte_job_t **jobs; orte_proc_t *proc; - orte_vpid_t np; /* unpack into array of orte_job_t objects */ jobs = (orte_job_t**) dest; @@ -228,14 +227,22 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } - for (np=0; np < jobs[i]->num_procs; np++) { - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - (void**)&proc, &n, ORTE_PROC))) { - ORTE_ERROR_LOG(rc); - return rc; + /* unpack the actual number of proc entries in the message */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, (void*)&nprocs, &n, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (0 < nprocs) { + for (np=0; np < nprocs; np++) { + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + (void**)&proc, &n, ORTE_PROC))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_pointer_array_set_item(jobs[i]->procs, proc->name.vpid, proc); } - opal_pointer_array_set_item(jobs[i]->procs, proc->name.vpid, proc); } /* if the map is NULL, then we din't pack it as there was diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 0a3ee206df..5d0047c896 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -587,10 +587,6 @@ static void orte_job_destruct(orte_job_t* job) } OBJ_RELEASE(job->procs); - if (NULL != job->aborted_proc) { - OBJ_RELEASE(job->aborted_proc); - } - #if OPAL_ENABLE_FT == 1 if (NULL != job->ckpt_snapshot_ref) { free(job->ckpt_snapshot_ref); diff --git a/orte/util/comm/comm.c b/orte/util/comm/comm.c index 6975cc1a62..d689a626ba 100644 --- a/orte/util/comm/comm.c +++ b/orte/util/comm/comm.c @@ -214,7 +214,7 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node, quicktime = NULL; } ORTE_ERROR_LOG(ret); - OBJ_DESTRUCT(ret); + OBJ_DESTRUCT(&answer); return ret; } @@ -309,7 +309,7 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t quicktime = NULL; } ORTE_ERROR_LOG(ret); - OBJ_DESTRUCT(ret); + OBJ_DESTRUCT(&answer); return 
ret; } diff --git a/orte/util/name_fns.h b/orte/util/name_fns.h index 9de0075c5d..701053b0a1 100644 --- a/orte/util/name_fns.h +++ b/orte/util/name_fns.h @@ -90,7 +90,7 @@ ORTE_DECLSPEC char* orte_util_print_local_jobid(const orte_jobid_t job); ( ((local) & 0xffff0000) | ((job) & 0x0000ffff) ) /* a macro for identifying that a proc is a daemon */ -#define ORTE_PROC_NAME_IS_DAEMON(n) \ +#define ORTE_JOBID_IS_DAEMON(n) \ !((n) & 0x0000ffff) /* List of names for general use */ diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 3738c04ac3..23d866fb43 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -27,17 +27,34 @@ #ifdef HAVE_UNISTD_H #include #endif +#ifdef HAVE_SYS_SOCKET_H +#include +#endif +#ifdef HAVE_NETINET_IN_H +#include +#endif +#ifdef HAVE_ARPA_INET_H +#include +#endif +#ifdef HAVE_NETDB_H +#include +#endif +#ifdef HAVE_IFADDRS_H +#include +#endif #include "opal/dss/dss.h" #include "opal/runtime/opal.h" #include "opal/class/opal_pointer_array.h" #include "opal/util/output.h" +#include "opal/util/argv.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" +#include "orte/mca/rml/base/rml_contact.h" #include "orte/util/nidmap.h" @@ -155,6 +172,93 @@ int orte_util_setup_local_nidmap_entries(void) return ORTE_SUCCESS; } +int orte_util_build_daemon_nidmap(char **nodes) +{ + orte_nid_t *node; + int i, num_nodes; + int rc; + struct hostent *h; + opal_buffer_t buf; + orte_process_name_t proc; + char *uri, *addr; + char *proc_name; + + num_nodes = opal_argv_count(nodes); + + OPAL_OUTPUT_VERBOSE((2, orte_debug_output, + "orte:util:build:daemon:nidmap found %d nodes", num_nodes)); + + if (0 == num_nodes) { + /* nothing to do */ + return ORTE_SUCCESS; + } + + /* set the size of the nidmap storage so we minimize realloc's */ + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes+1))) { + ORTE_ERROR_LOG(rc); + 
return rc; + } + + /* install the entry for the HNP */ + node = OBJ_NEW(orte_nid_t); + node->name = strdup("HNP"); + node->daemon = 0; + /* the arch defaults to our arch so that non-hetero + * case will yield correct behavior + */ + opal_pointer_array_set_item(&orte_nidmap, 0, node); + + /* the daemon vpids will be assigned in order, + * starting with vpid=1 for the first node in + * the list + */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + proc.jobid = ORTE_PROC_MY_NAME->jobid; + for (i=0; i < num_nodes; i++) { + node = OBJ_NEW(orte_nid_t); + node->name = strdup(nodes[i]); + node->daemon = i+1; + /* the arch defaults to our arch so that non-hetero + * case will yield correct behavior + */ + opal_pointer_array_set_item(&orte_nidmap, node->daemon, node); + + /* lookup the address of this node */ + if (NULL == (h = gethostbyname(node->name))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]); + + OPAL_OUTPUT_VERBOSE((3, orte_debug_output, + "%s orte:util:build:daemon:nidmap node %s daemon %d addr %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, (int)node->daemon, addr)); + + /* since we are using static ports, all my fellow daemons will be on my + * port. Setup the contact info for each daemon in my hash tables. 
Note + * that this will -not- open a port to those daemons, but will only + * define the info necessary for opening such a port if/when I communicate + * to them + */ + /* construct the URI */ + proc.vpid = node->daemon; + orte_util_convert_process_name_to_string(&proc_name, &proc); + asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port); + opal_dss.pack(&buf, &uri, 1, OPAL_STRING); + free(proc_name); + free(uri); + } + + /* load the hash tables */ + if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { + ORTE_ERROR_LOG(rc); + } + OBJ_DESTRUCT(&buf); + + return rc; +} + int orte_util_encode_nodemap(opal_byte_object_t *boptr) { orte_vpid_t *vpids; @@ -1117,7 +1221,7 @@ orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - if (ORTE_PROC_NAME_IS_DAEMON(proc->jobid)) { + if (ORTE_JOBID_IS_DAEMON(proc->jobid)) { /* looking for a daemon */ return find_daemon_node(proc); } diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index 6c9ff1f1cd..727c84ff25 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -54,6 +54,7 @@ ORTE_DECLSPEC int orte_util_decode_nodemap(opal_byte_object_t *boptr); ORTE_DECLSPEC int orte_util_encode_pidmap(opal_byte_object_t *boptr); ORTE_DECLSPEC int orte_util_decode_pidmap(opal_byte_object_t *boptr); +ORTE_DECLSPEC int orte_util_build_daemon_nidmap(char **nodes); END_C_DECLS #endif