From 180809f2eface23f34e4432e28e8e0a07202734c Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 25 Apr 2017 21:24:21 -0700 Subject: [PATCH] Do not pass topologies during tree spawn of daemons as there is no way the HNP can know the backend topologies at that point. Any needed topologies will be sent along with the launch_apps command Do not pass param file MCA params if the user has requested that no param files be read - required when trying to avoid launch time penalties from large numbers of processes reading default param files. The daemon picks them up and passes them along anyway, so it isn't clear what value we gain from having them all read the defaults Signed-off-by: Ralph Castain --- orte/mca/plm/base/plm_base_launch_support.c | 134 +++++++++++-------- orte/mca/plm/rsh/plm_rsh_module.c | 15 --- orte/mca/rmaps/base/rmaps_base_support_fns.c | 2 +- orte/util/nidmap.c | 11 +- 4 files changed, 84 insertions(+), 78 deletions(-) diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 49890762f2..fb233fafbf 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1544,51 +1544,34 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, orte_xterm); } - /* - * Pass along the Aggregate MCA Parameter Sets - */ - /* Add the 'prefix' param */ - tmp_value = NULL; - - loc_id = mca_base_var_find("opal", "mca", "base", "envar_file_prefix"); + loc_id = mca_base_var_find("opal", "mca", "base", "param_files"); if (loc_id < 0) { rc = OPAL_ERR_NOT_FOUND; ORTE_ERROR_LOG(rc); return rc; } + tmp_value = NULL; rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } - if( NULL != tmp_value && NULL != tmp_value[0] ) { - /* Could also use the short version '-tune' - * but being verbose has some value - */ - opal_argv_append(argc, argv, "-mca"); - opal_argv_append(argc, argv, "mca_base_envar_file_prefix"); - opal_argv_append(argc, argv, tmp_value[0]); + if (NULL != tmp_value && NULL != tmp_value[0]) { + rc = strcmp(tmp_value[0], "none"); + } else { + rc = 1; } - tmp_value2 = NULL; - loc_id = mca_base_var_find("opal", "mca", "base", "param_file_prefix"); - mca_base_var_get_value(loc_id, &tmp_value2, NULL, NULL); - if( NULL != tmp_value2 && NULL != tmp_value2[0] ) { - /* Could also use the short version '-am' - * but being verbose has some value + if (0 != rc) { + /* + * Pass along the Aggregate MCA Parameter Sets */ - opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(argc, argv, "mca_base_param_file_prefix"); - opal_argv_append(argc, argv, tmp_value2[0]); - orte_show_help("help-plm-base.txt", "deprecated-amca", true); - } - - if ((NULL != tmp_value && NULL != tmp_value[0]) - || (NULL != tmp_value2 && NULL != tmp_value2[0])) { - /* Add the 'path' param */ + /* Add the 'prefix' param */ tmp_value = NULL; - loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path"); + + loc_id = mca_base_var_find("opal", "mca", "base", "envar_file_prefix"); if (loc_id < 0) { + rc = OPAL_ERR_NOT_FOUND; ORTE_ERROR_LOG(rc); return rc; } @@ -1598,39 +1581,76 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, return rc; } if( NULL != tmp_value && NULL != tmp_value[0] ) { - opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(argc, argv, "mca_base_param_file_path"); + /* Could also use the short version '-tune' + * but being verbose has some value + */ + opal_argv_append(argc, argv, "-mca"); + opal_argv_append(argc, argv, "mca_base_envar_file_prefix"); opal_argv_append(argc, argv, tmp_value[0]); } - /* Add the 'path' param */ - opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(argc, argv, "mca_base_param_file_path_force"); + tmp_value2 = NULL; + loc_id = mca_base_var_find("opal", "mca", "base", "param_file_prefix"); + mca_base_var_get_value(loc_id, &tmp_value2, NULL, NULL); + if( NULL != tmp_value2 && NULL != tmp_value2[0] ) { + /* Could also use the short version '-am' + * but being verbose has some value + */ + opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(argc, argv, "mca_base_param_file_prefix"); + opal_argv_append(argc, argv, tmp_value2[0]); + orte_show_help("help-plm-base.txt", "deprecated-amca", true); + } - tmp_value = NULL; - loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path_force"); - if (loc_id < 0) { - rc = OPAL_ERR_NOT_FOUND; - ORTE_ERROR_LOG(rc); - return rc; - } - rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL); - if (OPAL_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - if( NULL == tmp_value || NULL == tmp_value[0] ) { - /* Get the current working directory */ - tmp_force = (char *) malloc(sizeof(char) * OPAL_PATH_MAX); - if (NULL == getcwd(tmp_force, OPAL_PATH_MAX)) { - free(tmp_force); - tmp_force = strdup(""); + if ((NULL != tmp_value && NULL != tmp_value[0]) + || (NULL != tmp_value2 && NULL != tmp_value2[0])) { + /* Add the 'path' param */ + tmp_value = NULL; + loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path"); + if (loc_id < 0) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL); + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + if( NULL != tmp_value && NULL != tmp_value[0] ) { + opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(argc, argv, "mca_base_param_file_path"); + opal_argv_append(argc, argv, tmp_value[0]); } - opal_argv_append(argc, argv, tmp_force); - free(tmp_force); - } else { - opal_argv_append(argc, argv, tmp_value[0]); + /* Add the 'path' param */ + opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(argc, argv, "mca_base_param_file_path_force"); + + tmp_value = NULL; + loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path_force"); + if (loc_id < 0) { + rc = OPAL_ERR_NOT_FOUND; + ORTE_ERROR_LOG(rc); + return rc; + } + rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL); + if (OPAL_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + if( NULL == tmp_value || NULL == tmp_value[0] ) { + /* Get the current working directory */ + tmp_force = (char *) malloc(sizeof(char) * OPAL_PATH_MAX); + if (NULL == getcwd(tmp_force, OPAL_PATH_MAX)) { + free(tmp_force); + tmp_force = strdup(""); + } + + opal_argv_append(argc, argv, tmp_force); + free(tmp_force); + } else { + opal_argv_append(argc, argv, tmp_value[0]); + } } } diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index ac1f501c39..9164f5870f 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -800,15 +800,6 @@ static int remote_spawn(opal_buffer_t *launch) goto cleanup; } - /* extract and update the daemon map */ - if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* since we are tree-spawning, we need to update the routing plan */ - orte_routed.update_routing_plan(NULL); - /* get the updated routing list */ rtmod = orte_rml.get_routed(orte_coll_conduit); OBJ_CONSTRUCT(&coll, opal_list_t); @@ -1177,12 +1168,6 @@ static void launch_daemons(int fd, short args, void *cbdata) OBJ_RELEASE(orte_tree_launch_cmd); goto cleanup; } - /* construct a nodemap of all daemons we know about */ - if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(orte_tree_launch_cmd))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(orte_tree_launch_cmd); - goto cleanup; - } /* get the orted job data object */ if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 4bc44bf3b0..6fd1d7cec0 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -413,7 +413,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr * are getting for an initial map of a job, * then mark all nodes as unmapped */ - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } if (NULL == nd || NULL == nd->daemon || NULL == node->daemon || diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 02ef5b8e7d..836c55625e 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -447,7 +447,6 @@ int orte_util_nidmap_create(char **regex) asprintf(&tmp2, "%s@%s", nodenames, tmp); free(nodenames); free(tmp); - *regex = tmp2; return ORTE_SUCCESS; } @@ -760,9 +759,10 @@ int orte_util_nidmap_parse(char *regex) dvpids[n][strlen(dvpids[n])-2] = '\0'; // remove trailing paren ++ptr; rng->cnt = strtoul(ptr, NULL, 10); + } else { + rng->cnt = 1; } - /* convert the number - since it might be a range, - * save the remainder pointer */ + /* convert the number */ rng->vpid = strtoul(dvpids[n], NULL, 10); } opal_argv_free(dvpids); @@ -797,16 +797,17 @@ int orte_util_nidmap_parse(char *regex) nd->daemon = proc; } ++cnt; - if (cnt == rng->cnt) { + if (rng->cnt <= cnt) { rng = (orte_regex_range_t*)opal_list_get_next(&rng->super); if (NULL == rng) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } + cnt = 0; } } - /* unpdate num procs */ + /* update num procs */ if (orte_process_info.num_procs != daemons->num_procs) { orte_process_info.num_procs = daemons->num_procs; /* need to update the routing plan */