Allow daemons to obtain their name via PMI wherever PMI support is available, while still using the standard grpcomm and other capabilities. Remove the GNI code from the alps ess component, since that component should only be used for alps/cnos installations.
This commit was SVN r25737.
This commit is contained in:
parent
6235a355de
Commit
9d556e2f17
@ -26,7 +26,7 @@
|
||||
# than the CNOS component to ensure we don't get both
|
||||
# since the ALPS component will -only- build if specifically
|
||||
# ordered to do so - which means we don't want the CNOS one
|
||||
AC_DEFUN([MCA_orte_ess_alps_PRIORITY], [10])
|
||||
AC_DEFUN([MCA_orte_ess_alps_PRIORITY], [15])
|
||||
|
||||
# MCA_ess_alps_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
@ -53,10 +53,6 @@ AC_DEFUN([MCA_orte_ess_alps_CONFIG],[
|
||||
[orte_mca_ess_alps_happy="yes"],
|
||||
[orte_mca_ess_alps_happy="no"])
|
||||
|
||||
AC_DEFINE_UNQUOTED([ORTE_MCA_ESS_ALPS_HAVE_CNOS],
|
||||
[$orte_mca_ess_alps_have_cnos],
|
||||
[Whether we have CNOS support in alps ess or not])
|
||||
|
||||
AS_IF([test "$orte_mca_ess_alps_happy" = "yes" -a "$orte_without_full_support" = 0],
|
||||
[$1],
|
||||
[$2])
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -69,24 +69,10 @@ orte_ess_alps_component_open(void)
|
||||
|
||||
int orte_ess_alps_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
|
||||
/* we only build if CNOS support is available, so select us */
|
||||
*priority = 35;
|
||||
*module = (mca_base_module_t *)&orte_ess_alps_module;
|
||||
return ORTE_SUCCESS;
|
||||
#else
|
||||
/* if i'm a daemon, then only i can safely select this component if
|
||||
* PMI_GNI_LOC_ADDR exists */
|
||||
if (NULL != getenv("PMI_GNI_LOC_ADDR") &&
|
||||
ORTE_PROC_IS_DAEMON) {
|
||||
*priority = 35;
|
||||
*module = (mca_base_module_t *)&orte_ess_alps_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* can't be selected, so disqualify myself */
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
#endif /* ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 */
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -23,12 +23,10 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
|
||||
# if defined(HAVE_CNOS_MPI_OS_H)
|
||||
# include "cnos_mpi_os.h"
|
||||
# elif defined(HAVE_CATAMOUNT_CNOS_MPI_OS_H)
|
||||
# include "catamount/cnos_mpi_os.h"
|
||||
# endif
|
||||
#if defined(HAVE_CNOS_MPI_OS_H)
|
||||
#include "cnos_mpi_os.h"
|
||||
#elif defined(HAVE_CATAMOUNT_CNOS_MPI_OS_H)
|
||||
#include "catamount/cnos_mpi_os.h"
|
||||
#endif
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
@ -71,40 +69,6 @@ orte_ess_base_module_t orte_ess_alps_module = {
|
||||
/* Local variables */
|
||||
static orte_vpid_t starting_vpid = 0;
|
||||
|
||||
static int
|
||||
get_vpid(orte_vpid_t *outvp,
|
||||
orte_vpid_t start_vpid)
|
||||
{
|
||||
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
|
||||
*outvp = (orte_vpid_t)cnos_get_rank() + start_vpid;
|
||||
return ORTE_SUCCESS;
|
||||
#else
|
||||
/* Cray XE6 Notes:
|
||||
* using PMI_GNI_LOC_ADDR to set vpid.
|
||||
*/
|
||||
int rank = 0;
|
||||
char *env;
|
||||
|
||||
if (NULL == (env = getenv("PMI_GNI_LOC_ADDR"))) {
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output,
|
||||
"PMI_GNI_LOC_ADDR not found, cannot continue\n"));
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
errno = 0;
|
||||
rank = (int)strtol(env, (char **)NULL, 10);
|
||||
if (0 != errno) {
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output,
|
||||
"strtol error detected at %s:%d\n", __FILE__,
|
||||
__LINE__));
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
*outvp = (orte_vpid_t)(rank + (int)start_vpid);
|
||||
return ORTE_SUCCESS;
|
||||
#endif /* ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 */
|
||||
}
|
||||
|
||||
static int rte_init(void)
|
||||
{
|
||||
int ret, i;
|
||||
@ -228,7 +192,6 @@ static int alps_set_name(void)
|
||||
int rc;
|
||||
orte_jobid_t jobid;
|
||||
char *tmp;
|
||||
orte_vpid_t vpid;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:alps setting name"));
|
||||
@ -258,13 +221,8 @@ static int alps_set_name(void)
|
||||
}
|
||||
free(tmp);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = get_vpid(&vpid, starting_vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
ORTE_PROC_MY_NAME->vpid = vpid;
|
||||
ORTE_PROC_MY_NAME->vpid = (orte_vpid_t)cnos_get_rank() + starting_vpid;
|
||||
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID);
|
||||
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,
|
||||
orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All
|
||||
* rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -103,12 +103,10 @@ static bool pmi_startup(void)
|
||||
|
||||
static int pmi_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* for now, only use PMI when direct launched */
|
||||
if (!ORTE_PROC_IS_HNP &&
|
||||
NULL == orte_process_info.my_hnp_uri &&
|
||||
pmi_startup()) {
|
||||
/* we are available anywhere PMI is available, but not for HNP itself */
|
||||
if (!ORTE_PROC_IS_HNP && pmi_startup()) {
|
||||
/* if PMI is available, use it */
|
||||
*priority = 100;
|
||||
*priority = 40;
|
||||
*module = (mca_base_module_t *)&orte_ess_pmi_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -96,6 +98,8 @@ static int rte_init(void)
|
||||
orte_jmap_t *jmap;
|
||||
orte_pmap_t *pmap;
|
||||
int *ranks;
|
||||
char *tmp;
|
||||
orte_jobid_t jobid;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -113,165 +117,206 @@ static int rte_init(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* get our PMI id length */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_id_length_max(&pmi_maxlen))) {
|
||||
error = "PMI_Get_id_length_max";
|
||||
goto error;
|
||||
}
|
||||
pmi_id = malloc(pmi_maxlen);
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_kvs_domain_id(pmi_id, pmi_maxlen))) {
|
||||
if (ORTE_PROC_IS_DAEMON) { /* I am a daemon, launched by mpirun */
|
||||
/* we had to be given a jobid */
|
||||
mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
|
||||
true, false, NULL, &tmp);
|
||||
if (NULL == tmp) {
|
||||
error = "missing jobid";
|
||||
ret = ORTE_ERR_FATAL;
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, tmp))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "convert jobid";
|
||||
goto error;
|
||||
}
|
||||
free(tmp);
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_orted_setup";
|
||||
goto error;
|
||||
}
|
||||
/* get our rank from PMI */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_rank");
|
||||
error = "could not get PMI rank";
|
||||
goto error;
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = i + 1; /* compensate for orterun */
|
||||
|
||||
/* get the number of procs from PMI */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
|
||||
error = "could not get PMI universe size";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.num_procs = i + 1; /* compensate for orterun */
|
||||
|
||||
} else { /* we are a direct-launched MPI process */
|
||||
/* get our PMI id length */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_id_length_max(&pmi_maxlen))) {
|
||||
error = "PMI_Get_id_length_max";
|
||||
goto error;
|
||||
}
|
||||
pmi_id = malloc(pmi_maxlen);
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_kvs_domain_id(pmi_id, pmi_maxlen))) {
|
||||
free(pmi_id);
|
||||
error = "PMI_Get_kvs_domain_id";
|
||||
goto error;
|
||||
}
|
||||
/* PMI is very nice to us - the domain id is an integer followed
|
||||
* by a '.', followed by essentially a stepid. The first integer
|
||||
* defines an overall job number. The second integer is the number of
|
||||
* individual jobs we have run within that allocation. So we translate
|
||||
* this as the overall job number equating to our job family, and
|
||||
* the individual number equating to our local jobid
|
||||
*/
|
||||
jobfam = strtol(pmi_id, &localj, 10);
|
||||
if (NULL == localj) {
|
||||
/* hmmm - no '.', so let's just use zero */
|
||||
stepid = 0;
|
||||
} else {
|
||||
localj++; /* step over the '.' */
|
||||
stepid = strtol(localj, NULL, 10) + 1; /* add one to avoid looking like a daemon */
|
||||
}
|
||||
free(pmi_id);
|
||||
error = "PMI_Get_kvs_domain_id";
|
||||
goto error;
|
||||
|
||||
/* now build the jobid */
|
||||
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
|
||||
|
||||
/* get our rank */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_rank");
|
||||
error = "could not get PMI rank";
|
||||
goto error;
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = i;
|
||||
|
||||
/* get the number of procs from PMI */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
|
||||
error = "could not get PMI universe size";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.num_procs = i;
|
||||
|
||||
/* setup transport keys in case the MPI layer needs them -
|
||||
* we can use the jobfam and stepid as unique keys
|
||||
* because they are unique values assigned by the RM
|
||||
*/
|
||||
unique_key[0] = (uint64_t)jobfam;
|
||||
unique_key[1] = (uint64_t)stepid;
|
||||
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
asprintf(&envar, "%s=%s", cs_env, string_key);
|
||||
putenv(envar);
|
||||
/* cannot free the envar as that messes up our environ */
|
||||
free(cs_env);
|
||||
free(string_key);
|
||||
|
||||
/* get our app_context number */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_appnum(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_appnum");
|
||||
error = "could not get PMI appnum";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.app_num = i;
|
||||
|
||||
/* setup the nidmap arrays - they will be filled by the modex */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* initialize our entry */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_setup_local_nidmap_entries";
|
||||
goto error;
|
||||
}
|
||||
/* correct the daemon entry on our nidmap object - note that
|
||||
* each proc's nidmap will be different, but the only thing that
|
||||
* matters here (since we are not routing messages) is that
|
||||
* we know which procs are on the same nodes
|
||||
*/
|
||||
nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, 0);
|
||||
nid->daemon = 0;
|
||||
/* setup my daemon's name - arbitrary, since we don't route
|
||||
* messages
|
||||
*/
|
||||
ORTE_PROC_MY_DAEMON->jobid = 0;
|
||||
ORTE_PROC_MY_DAEMON->vpid = 0;
|
||||
|
||||
/* get the job map for this job */
|
||||
jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, 0);
|
||||
/* update the num procs */
|
||||
jmap->num_procs = orte_process_info.num_procs;
|
||||
/* set the size of the pidmap storage so we minimize realloc's */
|
||||
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "could not set array size for pidmap";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* get my pidmap entry */
|
||||
pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, ORTE_PROC_MY_NAME->vpid);
|
||||
|
||||
/* get our local proc info to find our local rank */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_clique_size(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_clique_size");
|
||||
error = "could not get PMI clique size";
|
||||
goto error;
|
||||
}
|
||||
ranks = (int*)malloc(i * sizeof(int));
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_clique_ranks(ranks, i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_clique_ranks");
|
||||
error = "could not get clique ranks";
|
||||
goto error;
|
||||
}
|
||||
/* cycle thru the array until we find our rank */
|
||||
for (j=0; j < i; j++) {
|
||||
if (ranks[j] == (int)ORTE_PROC_MY_NAME->vpid) {
|
||||
pmap->local_rank = j;
|
||||
pmap->node_rank = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(ranks);
|
||||
|
||||
/* ensure we pick the correct critical components */
|
||||
putenv("OMPI_MCA_grpcomm=pmi");
|
||||
putenv("OMPI_MCA_routed=direct");
|
||||
|
||||
/* now use the default procedure to finish my setup */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_app_setup";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
/* PMI is very nice to us - the domain id is an integer followed
|
||||
* by a '.', followed by essentially a stepid. The first integer
|
||||
* defines an overall job number. The second integer is the number of
|
||||
* individual jobs we have run within that allocation. So we translate
|
||||
* this as the overall job number equating to our job family, and
|
||||
* the individual number equating to our local jobid
|
||||
*/
|
||||
jobfam = strtol(pmi_id, &localj, 10);
|
||||
if (NULL == localj) {
|
||||
/* hmmm - no '.', so let's just use zero */
|
||||
stepid = 0;
|
||||
} else {
|
||||
localj++; /* step over the '.' */
|
||||
stepid = strtol(localj, NULL, 10) + 1; /* add one to avoid looking like a daemon */
|
||||
}
|
||||
free(pmi_id);
|
||||
|
||||
/* now build the jobid */
|
||||
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
|
||||
/* get our rank */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_rank");
|
||||
error = "could not get PMI rank";
|
||||
goto error;
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = i;
|
||||
/* complete definition of process name */
|
||||
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
/* setup transport keys in case the MPI layer needs them -
|
||||
* we can use the jobfam and stepid as unique keys
|
||||
* because they are unique values assigned by the RM
|
||||
*/
|
||||
unique_key[0] = (uint64_t)jobfam;
|
||||
unique_key[1] = (uint64_t)stepid;
|
||||
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
asprintf(&envar, "%s=%s", cs_env, string_key);
|
||||
putenv(envar);
|
||||
/* cannot free the envar as that messes up our environ */
|
||||
free(cs_env);
|
||||
free(string_key);
|
||||
|
||||
/* get the number of procs */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
|
||||
error = "could not get PMI universe size";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.num_procs = i;
|
||||
/* set max procs */
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* get our app_context number */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_appnum(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_appnum");
|
||||
error = "could not get PMI appnum";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.app_num = i;
|
||||
|
||||
/* setup the nidmap arrays - they will be filled by the modex */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* initialize our entry */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_setup_local_nidmap_entries";
|
||||
goto error;
|
||||
}
|
||||
/* correct the daemon entry on our nidmap object - note that
|
||||
* each proc's nidmap will be different, but the only thing that
|
||||
* matters here (since we are not routing messages) is that
|
||||
* we know which procs are on the same nodes
|
||||
*/
|
||||
nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, 0);
|
||||
nid->daemon = 0;
|
||||
/* setup my daemon's name - arbitrary, since we don't route
|
||||
* messages
|
||||
*/
|
||||
ORTE_PROC_MY_DAEMON->jobid = 0;
|
||||
ORTE_PROC_MY_DAEMON->vpid = 0;
|
||||
|
||||
/* get the job map for this job */
|
||||
jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, 0);
|
||||
/* update the num procs */
|
||||
jmap->num_procs = orte_process_info.num_procs;
|
||||
/* set the size of the pidmap storage so we minimize realloc's */
|
||||
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "could not set array size for pidmap";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* get my pidmap entry */
|
||||
pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, ORTE_PROC_MY_NAME->vpid);
|
||||
|
||||
/* get our local proc info to find our local rank */
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_clique_size(&i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_clique_size");
|
||||
error = "could not get PMI clique size";
|
||||
goto error;
|
||||
}
|
||||
ranks = (int*)malloc(i * sizeof(int));
|
||||
if (PMI_SUCCESS != (ret = PMI_Get_clique_ranks(ranks, i))) {
|
||||
ORTE_PMI_ERROR(ret, "PMI_Get_clique_ranks");
|
||||
error = "could not get clique ranks";
|
||||
goto error;
|
||||
}
|
||||
/* cycle thru the array until we find our rank */
|
||||
for (j=0; j < i; j++) {
|
||||
if (ranks[j] == (int)ORTE_PROC_MY_NAME->vpid) {
|
||||
pmap->local_rank = j;
|
||||
pmap->node_rank = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(ranks);
|
||||
|
||||
/* ensure we pick the correct critical components */
|
||||
putenv("OMPI_MCA_grpcomm=pmi");
|
||||
putenv("OMPI_MCA_routed=direct");
|
||||
|
||||
/* now use the default procedure to finish my setup */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_app_setup";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* flag that we completed init */
|
||||
app_init_complete = true;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
error:
|
||||
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
@ -286,19 +331,25 @@ static int rte_finalize(void)
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
if (app_init_complete) {
|
||||
/* use the default procedure to finish */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
/* if I am a daemon, finalize using the default procedure */
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
} else {
|
||||
/* use the default app procedure to finish */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
/* remove the envars that we pushed into environ
|
||||
* so we leave that structure intact
|
||||
*/
|
||||
unsetenv("OMPI_MCA_grpcomm");
|
||||
unsetenv("OMPI_MCA_routed");
|
||||
unsetenv("OMPI_MCA_orte_precondition_transports");
|
||||
}
|
||||
}
|
||||
|
||||
/* remove the envars that we pushed into environ
|
||||
* so we leave that structure intact
|
||||
*/
|
||||
unsetenv("OMPI_MCA_grpcomm");
|
||||
unsetenv("OMPI_MCA_routed");
|
||||
unsetenv("OMPI_MCA_orte_precondition_transports");
|
||||
|
||||
/* deconstruct my nidmap and jobmap arrays - this
|
||||
* function protects itself from being called
|
||||
* before things were initialized
|
||||
|
@ -72,7 +72,7 @@ int orte_ess_slurm_component_query(mca_base_module_t **module, int *priority)
|
||||
/* Are we running under a SLURM job? Were
|
||||
* we given a path back to the HNP? If the
|
||||
* answer to both is "yes", then we were launched
|
||||
* by mpirun in a slurm world
|
||||
* by mpirun in a slurm world, so make ourselves available
|
||||
*/
|
||||
|
||||
if (NULL != getenv("SLURM_JOBID") &&
|
||||
|
@ -73,7 +73,6 @@ static int rte_init(void)
|
||||
{
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
char **hosts = NULL;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -88,19 +87,11 @@ static int rte_init(void)
|
||||
* default procedure
|
||||
*/
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
if (NULL != orte_node_regex) {
|
||||
/* extract the nodes */
|
||||
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
|
||||
error = "orte_regex_extract_node_names";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_orted_setup";
|
||||
goto error;
|
||||
}
|
||||
opal_argv_free(hosts);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -116,23 +107,10 @@ static int rte_init(void)
|
||||
|
||||
}
|
||||
|
||||
/* otherwise, I must be an application process - use
|
||||
* the default procedure to finish my setup
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_app_setup";
|
||||
goto error;
|
||||
}
|
||||
/* setup the nidmap arrays */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* no other options are supported! */
|
||||
error = "ess_error";
|
||||
ret = ORTE_ERROR;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
|
@ -310,7 +310,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"alps",
|
||||
NULL,
|
||||
&proc_vpid_index,
|
||||
nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
|
@ -815,9 +815,11 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
}
|
||||
|
||||
/* tell the orted what ESS component to use */
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
opal_argv_append(argc, argv, "ess");
|
||||
opal_argv_append(argc, argv, ess);
|
||||
if (NULL != ess) {
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
opal_argv_append(argc, argv, "ess");
|
||||
opal_argv_append(argc, argv, ess);
|
||||
}
|
||||
|
||||
/* pass the daemon jobid */
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
|
@ -63,7 +63,6 @@
|
||||
#include "orte/types.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
@ -328,7 +327,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"slurm", &proc_vpid_index,
|
||||
NULL, &proc_vpid_index,
|
||||
nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
|
||||
|
Loading…
x
Reference in new issue
Block a user