Allow daemons to obtain their name via PMI wherever PMI support is available, while still using the standard grpcomm and other capabilities. Remove the GNI code from the alps ess component, since that component should only be used for ALPS/CNOS installations.
This commit was SVN r25737.
Этот коммит содержится в:
родитель
6235a355de
Коммит
9d556e2f17
@ -26,7 +26,7 @@
|
||||
# than the CNOS component to ensure we don't get both
|
||||
# since the ALPS component will -only- build if specifically
|
||||
# ordered to do so - which means we don't want the CNOS one
|
||||
AC_DEFUN([MCA_orte_ess_alps_PRIORITY], [10])
|
||||
AC_DEFUN([MCA_orte_ess_alps_PRIORITY], [15])
|
||||
|
||||
# MCA_ess_alps_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
@ -53,10 +53,6 @@ AC_DEFUN([MCA_orte_ess_alps_CONFIG],[
|
||||
[orte_mca_ess_alps_happy="yes"],
|
||||
[orte_mca_ess_alps_happy="no"])
|
||||
|
||||
AC_DEFINE_UNQUOTED([ORTE_MCA_ESS_ALPS_HAVE_CNOS],
|
||||
[$orte_mca_ess_alps_have_cnos],
|
||||
[Whether we have CNOS support in alps ess or not])
|
||||
|
||||
AS_IF([test "$orte_mca_ess_alps_happy" = "yes" -a "$orte_without_full_support" = 0],
|
||||
[$1],
|
||||
[$2])
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -69,24 +69,10 @@ orte_ess_alps_component_open(void)
|
||||
|
||||
int orte_ess_alps_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
|
||||
/* we only build if CNOS support is available, so select us */
|
||||
*priority = 35;
|
||||
*module = (mca_base_module_t *)&orte_ess_alps_module;
|
||||
return ORTE_SUCCESS;
|
||||
#else
|
||||
/* if i'm a daemon, then only i can safely select this component if
|
||||
* PMI_GNI_LOC_ADDR exists */
|
||||
if (NULL != getenv("PMI_GNI_LOC_ADDR") &&
|
||||
ORTE_PROC_IS_DAEMON) {
|
||||
*priority = 35;
|
||||
*module = (mca_base_module_t *)&orte_ess_alps_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* can't be selected, so disqualify myself */
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
#endif /* ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 */
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -23,13 +23,11 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
|
||||
#if defined(HAVE_CNOS_MPI_OS_H)
|
||||
#include "cnos_mpi_os.h"
|
||||
#elif defined(HAVE_CATAMOUNT_CNOS_MPI_OS_H)
|
||||
#include "catamount/cnos_mpi_os.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/argv.h"
|
||||
@ -71,40 +69,6 @@ orte_ess_base_module_t orte_ess_alps_module = {
|
||||
/* Local variables */
|
||||
static orte_vpid_t starting_vpid = 0;
|
||||
|
||||
static int
|
||||
get_vpid(orte_vpid_t *outvp,
|
||||
orte_vpid_t start_vpid)
|
||||
{
|
||||
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
|
||||
*outvp = (orte_vpid_t)cnos_get_rank() + start_vpid;
|
||||
return ORTE_SUCCESS;
|
||||
#else
|
||||
/* Cray XE6 Notes:
|
||||
* using PMI_GNI_LOC_ADDR to set vpid.
|
||||
*/
|
||||
int rank = 0;
|
||||
char *env;
|
||||
|
||||
if (NULL == (env = getenv("PMI_GNI_LOC_ADDR"))) {
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output,
|
||||
"PMI_GNI_LOC_ADDR not found, cannot continue\n"));
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
errno = 0;
|
||||
rank = (int)strtol(env, (char **)NULL, 10);
|
||||
if (0 != errno) {
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output,
|
||||
"strtol error detected at %s:%d\n", __FILE__,
|
||||
__LINE__));
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
*outvp = (orte_vpid_t)(rank + (int)start_vpid);
|
||||
return ORTE_SUCCESS;
|
||||
#endif /* ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 */
|
||||
}
|
||||
|
||||
static int rte_init(void)
|
||||
{
|
||||
int ret, i;
|
||||
@ -228,7 +192,6 @@ static int alps_set_name(void)
|
||||
int rc;
|
||||
orte_jobid_t jobid;
|
||||
char *tmp;
|
||||
orte_vpid_t vpid;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:alps setting name"));
|
||||
@ -258,13 +221,8 @@ static int alps_set_name(void)
|
||||
}
|
||||
free(tmp);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = get_vpid(&vpid, starting_vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
ORTE_PROC_MY_NAME->vpid = vpid;
|
||||
ORTE_PROC_MY_NAME->vpid = (orte_vpid_t)cnos_get_rank() + starting_vpid;
|
||||
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID);
|
||||
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,
|
||||
orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All
|
||||
* rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -103,12 +103,10 @@ static bool pmi_startup(void)
|
||||
|
||||
static int pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* for now, only use PMI when direct launched */
|
||||
if (!ORTE_PROC_IS_HNP &&
|
||||
NULL == orte_process_info.my_hnp_uri &&
|
||||
pmi_startup()) {
|
||||
/* we are available anywhere PMI is available, but not for HNP itself */
|
||||
if (!ORTE_PROC_IS_HNP && pmi_startup()) {
|
||||
/* if PMI is available, use it */
|
||||
*priority = 100;
|
||||
*priority = 40;
|
||||
*module = (mca_base_module_t *)&orte_ess_pmi_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -96,6 +98,8 @@ static int rte_init(void)
|
||||
orte_jmap_t *jmap;
|
||||
orte_pmap_t *pmap;
|
||||
int *ranks;
|
||||
char *tmp;
|
||||
orte_jobid_t jobid;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -113,6 +117,44 @@ static int rte_init(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ORTE_PROC_IS_DAEMON) { /* I am a daemon, launched by mpirun */
|
||||
/* we had to be given a jobid */
|
||||
mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
|
||||
true, false, NULL, &tmp);
|
||||
if (NULL == tmp) {
|
||||
error = "missing jobid";
|
||||
ret = ORTE_ERR_FATAL;
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, tmp))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "convert jobid";
|
||||
goto error;
|
||||
}
|
||||
free(tmp);
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_orted_setup";
|
||||
goto error;
|
||||
}
|
||||
/* get our rank from PMI */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_rank");
|
||||
error = "could not get PMI rank";
|
||||
goto error;
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = i + 1; /* compensate for orterun */
|
||||
|
||||
/* get the number of procs from PMI */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
|
||||
error = "could not get PMI universe size";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.num_procs = i + 1; /* compensate for orterun */
|
||||
|
||||
} else { /* we are a direct-launched MPI process */
|
||||
/* get our PMI id length */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_id_length_max(&pmi_maxlen))) {
|
||||
error = "PMI_Get_id_length_max";
|
||||
@ -124,7 +166,6 @@ static int rte_init(void)
|
||||
error = "PMI_Get_kvs_domain_id";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* PMI is very nice to us - the domain id is an integer followed
|
||||
* by a '.', followed by essentially a stepid. The first integer
|
||||
* defines an overall job number. The second integer is the number of
|
||||
@ -144,6 +185,7 @@ static int rte_init(void)
|
||||
|
||||
/* now build the jobid */
|
||||
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
|
||||
|
||||
/* get our rank */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_rank");
|
||||
@ -151,8 +193,14 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = i;
|
||||
/* complete definition of process name */
|
||||
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
/* get the number of procs from PMI */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
|
||||
error = "could not get PMI universe size";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.num_procs = i;
|
||||
|
||||
/* setup transport keys in case the MPI layer needs them -
|
||||
* we can use the jobfam and stepid as unique keys
|
||||
@ -174,18 +222,6 @@ static int rte_init(void)
|
||||
free(cs_env);
|
||||
free(string_key);
|
||||
|
||||
/* get the number of procs */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
|
||||
error = "could not get PMI universe size";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.num_procs = i;
|
||||
/* set max procs */
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* get our app_context number */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_appnum(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_appnum");
|
||||
@ -265,6 +301,15 @@ static int rte_init(void)
|
||||
error = "orte_ess_base_app_setup";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
/* complete definition of process name */
|
||||
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
/* set max procs */
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* flag that we completed init */
|
||||
app_init_complete = true;
|
||||
@ -286,18 +331,24 @@ static int rte_finalize(void)
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
if (app_init_complete) {
|
||||
/* use the default procedure to finish */
|
||||
/* if I am a daemon, finalize using the default procedure */
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
} else {
|
||||
/* use the default app procedure to finish */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
}
|
||||
|
||||
/* remove the envars that we pushed into environ
|
||||
* so we leave that structure intact
|
||||
*/
|
||||
unsetenv("OMPI_MCA_grpcomm");
|
||||
unsetenv("OMPI_MCA_routed");
|
||||
unsetenv("OMPI_MCA_orte_precondition_transports");
|
||||
}
|
||||
}
|
||||
|
||||
/* deconstruct my nidmap and jobmap arrays - this
|
||||
* function protects itself from being called
|
||||
|
@ -72,7 +72,7 @@ int orte_ess_slurm_component_query(mca_base_module_t **module, int *priority)
|
||||
/* Are we running under a SLURM job? Were
|
||||
* we given a path back to the HNP? If the
|
||||
* answer to both is "yes", then we were launched
|
||||
* by mpirun in a slurm world
|
||||
* by mpirun in a slurm world, so make ourselves available
|
||||
*/
|
||||
|
||||
if (NULL != getenv("SLURM_JOBID") &&
|
||||
|
@ -73,7 +73,6 @@ static int rte_init(void)
|
||||
{
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
char **hosts = NULL;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -88,19 +87,11 @@ static int rte_init(void)
|
||||
* default procedure
|
||||
*/
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
if (NULL != orte_node_regex) {
|
||||
/* extract the nodes */
|
||||
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
|
||||
error = "orte_regex_extract_node_names";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_orted_setup";
|
||||
goto error;
|
||||
}
|
||||
opal_argv_free(hosts);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -116,22 +107,9 @@ static int rte_init(void)
|
||||
|
||||
}
|
||||
|
||||
/* otherwise, I must be an application process - use
|
||||
* the default procedure to finish my setup
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_app_setup";
|
||||
goto error;
|
||||
}
|
||||
/* setup the nidmap arrays */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
/* no other options are supported! */
|
||||
error = "ess_error";
|
||||
ret = ORTE_ERROR;
|
||||
|
||||
error:
|
||||
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
||||
|
@ -310,7 +310,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"alps",
|
||||
NULL,
|
||||
&proc_vpid_index,
|
||||
nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
|
@ -815,9 +815,11 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
}
|
||||
|
||||
/* tell the orted what ESS component to use */
|
||||
if (NULL != ess) {
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
opal_argv_append(argc, argv, "ess");
|
||||
opal_argv_append(argc, argv, ess);
|
||||
}
|
||||
|
||||
/* pass the daemon jobid */
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
|
@ -63,7 +63,6 @@
|
||||
#include "orte/types.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
@ -328,7 +327,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"slurm", &proc_vpid_index,
|
||||
NULL, &proc_vpid_index,
|
||||
nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user