1
1

Add an MCA param "hnp_on_smgmt_node" that mpirun can use to tell the orteds to ignore mpirun's topology signature, because mpirun is executing on a system management node and therefore has a different topology than the compute nodes.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-01-16 19:32:01 -08:00
родитель 568b58af75
Коммит e9bc2934be
6 изменённых файлов: 25 добавлений и 4 удалений

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2009 Institut National de Recherche en Informatique
* et Automatique. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
@ -1341,6 +1341,12 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
opal_argv_append(argc, argv, "1");
}
if (orte_hnp_on_smgmt_node) {
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
opal_argv_append(argc, argv, "orte_hnp_on_smgmt_node");
opal_argv_append(argc, argv, "1");
}
if (orte_map_stddiag_to_stderr) {
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
opal_argv_append(argc, argv, "orte_map_stddiag_to_stderr");

Просмотреть файл

@ -16,7 +16,7 @@
* Copyright (c) 2009 Institut National de Recherche en Informatique
* et Automatique. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -129,6 +129,7 @@ static struct {
bool tree_spawn;
char *hnp_topo_sig;
bool test_suicide;
bool hnp_on_smgmt_node;
} orted_globals;
/*
@ -216,6 +217,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
&orted_globals.hnp_topo_sig, OPAL_CMD_LINE_TYPE_STRING,
"Topology signature of HNP" },
{ "orte_hnp_on_smgmt_node", '\0', NULL, "hnp-on-smgmt-node", 0,
&orted_globals.hnp_on_smgmt_node, OPAL_CMD_LINE_TYPE_BOOL,
"Mpirun is executing on a system mgmt node whose topology is different from the compute nodes [Default = false]" },
/* End of list */
{ NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
@ -767,7 +772,7 @@ int orte_daemon(int argc, char *argv[])
/* add the local topology, if different from the HNP's or user directed us to,
* but always if we are the first daemon to ensure we get a compute node */
if (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes ||
0 != strcmp(orte_topo_signature, orted_globals.hnp_topo_sig)) {
(!orted_globals.hnp_on_smgmt_node && 0 != strcmp(orte_topo_signature, orted_globals.hnp_topo_sig))) {
tflag = 1;
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &tflag, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(ret);

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*

Просмотреть файл

@ -92,6 +92,7 @@ int orted_debug_failure = -1;
int orted_debug_failure_delay = -1;
bool orte_hetero_apps = false;
bool orte_hetero_nodes = false;
bool orte_hnp_on_smgmt_node = false;
bool orte_never_launched = false;
bool orte_devel_level_output = false;
bool orte_display_topo_with_map = false;

Просмотреть файл

@ -475,6 +475,7 @@ ORTE_DECLSPEC extern int orted_debug_failure_delay;
/* homogeneity flags */
ORTE_DECLSPEC extern bool orte_hetero_apps;
ORTE_DECLSPEC extern bool orte_hetero_nodes;
ORTE_DECLSPEC extern bool orte_hnp_on_smgmt_node;
ORTE_DECLSPEC extern bool orte_never_launched;
ORTE_DECLSPEC extern bool orte_devel_level_output;

Просмотреть файл

@ -767,5 +767,13 @@ int orte_register_params(void)
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &orte_mgmt_transport);
orte_hnp_on_smgmt_node = false;
(void) mca_base_var_register ("orte", "orte", NULL, "hnp_on_smgmt_node",
"Mpirun is executing on a system mgmt node whose topology is different from the compute nodes [Default = false]",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_hnp_on_smgmt_node);
return ORTE_SUCCESS;
}