1
1

Add PMI support to the ess alps module. XT system folks: please yell at me if I missed something in the CNOS path.

This commit was SVN r25423.
Этот коммит содержится в:
Samuel Gutierrez 2011-11-03 04:04:32 +00:00
родитель 27b9bcfafd
Коммит 3fe7b3ee54

Просмотреть файл

@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -21,11 +23,15 @@
#include "orte_config.h"
#include "orte/constants.h"
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
# if defined(HAVE_CNOS_MPI_OS_H)
# include "cnos_mpi_os.h"
# elif defined(HAVE_CATAMOUNT_CNOS_MPI_OS_H)
# include "catamount/cnos_mpi_os.h"
# endif
#elif ORTE_MCA_ESS_ALPS_HAVE_PMI == 1
# include "pmi.h"
#endif
#include "orte/util/show_help.h"
#include "opal/util/argv.h"
@ -42,9 +48,10 @@
#include "orte/mca/ess/alps/ess_alps.h"
static int alps_set_name(void);
static int rte_init(void);
static int rte_finalize(void);
static int get_vpid(orte_vpid_t *outvp,
orte_vpid_t start_vpid);
orte_ess_base_module_t orte_ess_alps_module = {
rte_init,
@ -61,12 +68,41 @@ orte_ess_base_module_t orte_ess_alps_module = {
NULL /* ft_event */
};
/*
* Local variables
*/
static orte_node_rank_t my_node_rank=ORTE_NODE_RANK_INVALID;
/* Local variables */
static orte_vpid_t starting_vpid = 0;
/**
 * Compute this process's vpid as (launcher-provided rank + start_vpid).
 *
 * The rank is obtained from cnos_get_rank() when the component is built
 * with CNOS support; otherwise it is obtained through PMI, initializing
 * PMI first if PMI_Initialized() reports that it has not been set up yet.
 *
 * @param outvp       (OUT) receives the computed vpid; written only on success.
 * @param start_vpid  (IN)  offset added to the rank.
 *
 * @return ORTE_SUCCESS on success; ORTE_ERROR (after logging through
 *         ORTE_ERROR_LOG) if any PMI call fails.
 */
static int
get_vpid(orte_vpid_t *outvp,
         orte_vpid_t start_vpid)
{
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
    *outvp = (orte_vpid_t)cnos_get_rank() + start_vpid;
    return ORTE_SUCCESS;
#else /* using PMI */
    /* TODO SKG - PMI utility functions should be in a common area */
    int rank;
    PMI_BOOL pmi_initialized;

    if (PMI_SUCCESS != PMI_Initialized(&pmi_initialized)) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }

    /* only initialize PMI ourselves if nobody has done so already */
    if (PMI_FALSE == pmi_initialized) {
        int tmp; /* PMI_Init out-parameter; its value is not used here */

        if (PMI_SUCCESS != PMI_Init(&tmp)) {
            ORTE_ERROR_LOG(ORTE_ERROR);
            return ORTE_ERROR;
        }
    }

    if (PMI_SUCCESS != PMI_Get_rank(&rank)) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }

    /* Add in orte_vpid_t, exactly as the CNOS branch does.  Narrowing
     * start_vpid to int and adding in signed arithmetic could overflow
     * (undefined behavior) for large offsets. */
    *outvp = (orte_vpid_t)rank + start_vpid;
    return ORTE_SUCCESS;
#endif /* ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 */
}
static int rte_init(void)
{
int ret, i;
@ -88,7 +124,8 @@ static int rte_init(void)
if (ORTE_PROC_IS_DAEMON) {
if (NULL != orte_node_regex) {
/* extract the nodes */
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts)) ||
if (ORTE_SUCCESS != (ret =
orte_regex_extract_node_names(orte_node_regex, &hosts)) ||
NULL == hosts) {
error = "orte_regex_extract_node_names";
goto error;
@ -96,7 +133,8 @@ static int rte_init(void)
/* find our host in the list */
for (i=0; NULL != hosts[i]; i++) {
if (0 == strncmp(hosts[i], orte_process_info.nodename, strlen(hosts[i]))) {
if (0 == strncmp(hosts[i], orte_process_info.nodename,
strlen(hosts[i]))) {
/* correct our vpid */
ORTE_PROC_MY_NAME->vpid = starting_vpid + i;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
@ -106,7 +144,6 @@ static int rte_init(void)
}
}
}
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_orted_setup";
@ -115,7 +152,6 @@ static int rte_init(void)
opal_argv_free(hosts);
return ORTE_SUCCESS;
}
if (ORTE_PROC_IS_TOOL) {
/* otherwise, if I am a tool proc, use that procedure */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
@ -126,7 +162,6 @@ static int rte_init(void)
/* as a tool, I don't need a nidmap - so just return now */
return ORTE_SUCCESS;
}
/* otherwise, I must be an application process - use
* the default procedure to finish my setup
*/
@ -135,9 +170,9 @@ static int rte_init(void)
error = "orte_ess_base_app_setup";
goto error;
}
/* setup the nidmap arrays */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
if (ORTE_SUCCESS !=
(ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
ORTE_ERROR_LOG(ret);
error = "orte_util_nidmap_init";
goto error;
@ -149,7 +184,6 @@ error:
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
return ret;
}
@ -178,7 +212,6 @@ static int rte_finalize(void)
}
}
/* deconstruct my nidmap and jobmap arrays */
orte_util_nidmap_finalize();
@ -190,6 +223,7 @@ static int alps_set_name(void)
int rc;
orte_jobid_t jobid;
char *tmp;
orte_vpid_t vpid;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:alps setting name"));
@ -202,7 +236,7 @@ static int alps_set_name(void)
}
if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) {
ORTE_ERROR_LOG(rc);
return(rc);
return rc;
}
free(tmp);
@ -212,19 +246,27 @@ static int alps_set_name(void)
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid, tmp))) {
if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid,
tmp))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
free(tmp);
if (ORTE_SUCCESS != (rc = get_vpid(&vpid, starting_vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,
orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:alps set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
"ess:alps set name to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* get the num procs as provided in the cmd line param */
if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {