diff --git a/NEWS b/NEWS index ecc8fe322f..f420f39522 100644 --- a/NEWS +++ b/NEWS @@ -65,6 +65,11 @@ Master (not on release branches yet) via --enable-mpi-cxx. - Removed embedded VampirTrace. It is in maintenance mode since 2013. Please consider Score-P (score-p.org) as an external replacement. +- Add an MCA parameter ras_base_launch_orted_on_hn to allow for launching + MPI processes on the same node where mpirun is executing using a separate + orte daemon, rather than the mpirun process. This may be useful to set to + true when using SLURM, as it improves interoperability with SLURM's signal + propagation tools. By default it is set to false, except for Cray XC systems. 3.0.0 -- July, 2017 ------------------- diff --git a/orte/mca/ras/base/base.h b/orte/mca/ras/base/base.h index e766dc86ce..f9b1bc868b 100644 --- a/orte/mca/ras/base/base.h +++ b/orte/mca/ras/base/base.h @@ -51,6 +51,7 @@ typedef struct orte_ras_base_t { orte_ras_base_module_t *active_module; int total_slots_alloc; int multiplier; + bool launch_orted_on_hn; } orte_ras_base_t; ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base; diff --git a/orte/mca/ras/base/ras_base_frame.c b/orte/mca/ras/base/ras_base_frame.c index a7a0918c35..f8cb6cedbc 100644 --- a/orte/mca/ras/base/ras_base_frame.c +++ b/orte/mca/ras/base/ras_base_frame.c @@ -59,6 +59,31 @@ static int ras_register(mca_base_register_flag_t flags) NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.multiplier); +#if SLURM_CRAY_ENV + /* + * If we are in a Cray-SLURM environment, then we cannot + * launch procs local to the HNP. The problem + * is the MPI processes launched on the head node (where the + * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon + * (mpirun) which is not a child of a slurmd daemon. This + * means that any RDMA credentials obtained via the odls/alps + * local launcher are incorrect. Test for this condition.
If + * found, then take steps to ensure we launch a daemon on + * the same node as mpirun and that it gets used to fork + * local procs instead of mpirun so they get the proper + * credential */ + + orte_ras_base.launch_orted_on_hn = true; +#else + orte_ras_base.launch_orted_on_hn = false; +#endif + + mca_base_var_register("orte", "ras", "base", "launch_orted_on_hn", + "Launch an orte daemon on the head node", + MCA_BASE_VAR_TYPE_BOOL, + NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.launch_orted_on_hn); return ORTE_SUCCESS; } diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c index 5fd3b3dda2..c3340cb146 100644 --- a/orte/mca/ras/base/ras_base_node.c +++ b/orte/mca/ras/base/ras_base_node.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science @@ -78,33 +78,24 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) /* get the hnp node's info */ hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); -#if SLURM_CRAY_ENV - /* if we are in a Cray-SLURM environment, then we cannot - * launch procs local to the HNP. The problem - * is the MPI processes launched on the head node (where the - * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon - * (mpirun) which is not a child of a slurmd daemon. This - * means that any RDMA credentials obtained via the odls/alps - * local launcher are incorrect. Test for this condition. 
If - * found, then take steps to ensure we launch a daemon on - * the same node as mpirun and that it gets used to fork - * local procs instead of mpirun so they get the proper - * credential */ - if (NULL != hnp_node) { - OPAL_LIST_FOREACH(node, nodes, orte_node_t) { - if (orte_ifislocal(node->name)) { - orte_hnp_is_allocated = true; - break; + + if ((orte_ras_base.launch_orted_on_hn == true) && + (orte_managed_allocation)) { + if (NULL != hnp_node) { + OPAL_LIST_FOREACH(node, nodes, orte_node_t) { + if (orte_ifislocal(node->name)) { + orte_hnp_is_allocated = true; + break; + } + } + if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & + ORTE_MAPPING_NO_USE_LOCAL)) { + hnp_node->name = strdup("mpirun"); + skiphnp = true; + ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL); } } - if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) { - hnp_node->name = strdup("mpirun"); - skiphnp = true; - ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL); - } } -#endif - /* cycle through the list */ while (NULL != (item = opal_list_remove_first(nodes))) {