1
1

Add PMI support to the ess alps module. XT system guys: please yell at me if I missed something in CNOS.

This commit was SVN r25423.
Этот коммит содержится в:
Samuel Gutierrez 2011-11-03 04:04:32 +00:00
родитель 27b9bcfafd
Коммит 3fe7b3ee54

Просмотреть файл

@ -5,15 +5,17 @@
* Copyright (c) 2004-2011 The University of Tennessee and The University * Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
* *
* $HEADER$ * $HEADER$
* *
*/ */
@ -21,10 +23,14 @@
#include "orte_config.h" #include "orte_config.h"
#include "orte/constants.h" #include "orte/constants.h"
#if defined(HAVE_CNOS_MPI_OS_H) #if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
# include "cnos_mpi_os.h" # if defined(HAVE_CNOS_MPI_OS_H)
#elif defined(HAVE_CATAMOUNT_CNOS_MPI_OS_H) # include "cnos_mpi_os.h"
# include "catamount/cnos_mpi_os.h" # elif defined(HAVE_CATAMOUNT_CNOS_MPI_OS_H)
# include "catamount/cnos_mpi_os.h"
# endif
#elif ORTE_MCA_ESS_ALPS_HAVE_PMI == 1
# include "pmi.h"
#endif #endif
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
@ -42,9 +48,10 @@
#include "orte/mca/ess/alps/ess_alps.h" #include "orte/mca/ess/alps/ess_alps.h"
static int alps_set_name(void); static int alps_set_name(void);
static int rte_init(void); static int rte_init(void);
static int rte_finalize(void); static int rte_finalize(void);
static int get_vpid(orte_vpid_t *outvp,
orte_vpid_t start_vpid);
orte_ess_base_module_t orte_ess_alps_module = { orte_ess_base_module_t orte_ess_alps_module = {
rte_init, rte_init,
@ -61,11 +68,40 @@ orte_ess_base_module_t orte_ess_alps_module = {
NULL /* ft_event */ NULL /* ft_event */
}; };
/* /* Local variables */
* Local variables static orte_vpid_t starting_vpid = 0;
*/
static orte_node_rank_t my_node_rank=ORTE_NODE_RANK_INVALID; static int
static orte_vpid_t starting_vpid=0; get_vpid(orte_vpid_t *outvp,
orte_vpid_t start_vpid)
{
/*
 * get_vpid: compute this process's vpid as (launcher-provided rank +
 * start_vpid) and store it in *outvp.
 *
 * outvp      - output; written only on success.
 * start_vpid - offset added to the rank reported by CNOS or PMI.
 *
 * Returns ORTE_SUCCESS on success. In the PMI branch, returns ORTE_ERROR
 * (after ORTE_ERROR_LOG) if any PMI call does not return PMI_SUCCESS; in
 * that case *outvp is left untouched.
 */
#if ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1
/* CNOS path: no error checking is done on cnos_get_rank(); the addition is
 * performed in orte_vpid_t. */
*outvp = (orte_vpid_t)cnos_get_rank() + start_vpid;
return ORTE_SUCCESS;
#else /* using PMI */
/* NOTE(review): this branch is compiled whenever
 * ORTE_MCA_ESS_ALPS_HAVE_CNOS is not 1, i.e. it assumes PMI is available
 * in every non-CNOS build -- confirm the build system guarantees that. */
/* TODO SKG - PMI utility functions should be in a common area */
int rank;
PMI_BOOL pmi_initialized;
/* Ask PMI whether it has already been initialized. */
if (PMI_SUCCESS != PMI_Initialized(&pmi_initialized)) {
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* Initialize PMI here only if it has not been initialized yet. */
if (PMI_FALSE == pmi_initialized) {
/* Output argument of PMI_Init; its value is never read afterwards. */
int tmp;
if (PMI_SUCCESS != PMI_Init(&tmp)) {
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
}
if (PMI_SUCCESS != PMI_Get_rank(&rank)) {
ORTE_ERROR_LOG(ORTE_ERROR);
return ORTE_ERROR;
}
/* NOTE(review): unlike the CNOS branch, the sum is computed in int (after
 * casting start_vpid to int) and only then converted to orte_vpid_t;
 * presumably equivalent for the small values expected here, but the two
 * branches could differ for very large start_vpid -- confirm. */
*outvp = (orte_vpid_t)(rank + (int)start_vpid);
return ORTE_SUCCESS;
#endif /* ORTE_MCA_ESS_ALPS_HAVE_CNOS == 1 */
}
static int rte_init(void) static int rte_init(void)
{ {
@ -78,17 +114,18 @@ static int rte_init(void)
error = "orte_ess_base_std_prolog"; error = "orte_ess_base_std_prolog";
goto error; goto error;
} }
/* Start by getting a unique name */ /* Start by getting a unique name */
alps_set_name(); alps_set_name();
/* if I am a daemon, complete my setup using the /* if I am a daemon, complete my setup using the
* default procedure * default procedure
*/ */
if (ORTE_PROC_IS_DAEMON) { if (ORTE_PROC_IS_DAEMON) {
if (NULL != orte_node_regex) { if (NULL != orte_node_regex) {
/* extract the nodes */ /* extract the nodes */
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts)) || if (ORTE_SUCCESS != (ret =
orte_regex_extract_node_names(orte_node_regex, &hosts)) ||
NULL == hosts) { NULL == hosts) {
error = "orte_regex_extract_node_names"; error = "orte_regex_extract_node_names";
goto error; goto error;
@ -96,7 +133,8 @@ static int rte_init(void)
/* find our host in the list */ /* find our host in the list */
for (i=0; NULL != hosts[i]; i++) { for (i=0; NULL != hosts[i]; i++) {
if (0 == strncmp(hosts[i], orte_process_info.nodename, strlen(hosts[i]))) { if (0 == strncmp(hosts[i], orte_process_info.nodename,
strlen(hosts[i]))) {
/* correct our vpid */ /* correct our vpid */
ORTE_PROC_MY_NAME->vpid = starting_vpid + i; ORTE_PROC_MY_NAME->vpid = starting_vpid + i;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
@ -106,7 +144,6 @@ static int rte_init(void)
} }
} }
} }
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
error = "orte_ess_base_orted_setup"; error = "orte_ess_base_orted_setup";
@ -115,7 +152,6 @@ static int rte_init(void)
opal_argv_free(hosts); opal_argv_free(hosts);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
if (ORTE_PROC_IS_TOOL) { if (ORTE_PROC_IS_TOOL) {
/* otherwise, if I am a tool proc, use that procedure */ /* otherwise, if I am a tool proc, use that procedure */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
@ -126,7 +162,6 @@ static int rte_init(void)
/* as a tool, I don't need a nidmap - so just return now */ /* as a tool, I don't need a nidmap - so just return now */
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* otherwise, I must be an application process - use /* otherwise, I must be an application process - use
* the default procedure to finish my setup * the default procedure to finish my setup
*/ */
@ -135,28 +170,27 @@ static int rte_init(void)
error = "orte_ess_base_app_setup"; error = "orte_ess_base_app_setup";
goto error; goto error;
} }
/* setup the nidmap arrays */ /* setup the nidmap arrays */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { if (ORTE_SUCCESS !=
(ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
error = "orte_util_nidmap_init"; error = "orte_util_nidmap_init";
goto error; goto error;
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;
error: error:
orte_show_help("help-orte-runtime.txt", orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure", "orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret); true, error, ORTE_ERROR_NAME(ret), ret);
return ret; return ret;
} }
static int rte_finalize(void) static int rte_finalize(void)
{ {
int ret; int ret;
/* if I am a daemon, finalize using the default procedure */ /* if I am a daemon, finalize using the default procedure */
if (ORTE_PROC_IS_DAEMON) { if (ORTE_PROC_IS_DAEMON) {
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) { if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) {
@ -177,23 +211,23 @@ static int rte_finalize(void)
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
} }
} }
/* deconstruct my nidmap and jobmap arrays */ /* deconstruct my nidmap and jobmap arrays */
orte_util_nidmap_finalize(); orte_util_nidmap_finalize();
return ret; return ret;
} }
static int alps_set_name(void) static int alps_set_name(void)
{ {
int rc; int rc;
orte_jobid_t jobid; orte_jobid_t jobid;
char* tmp; char *tmp;
orte_vpid_t vpid;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:alps setting name")); "ess:alps setting name"));
mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid", mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
true, false, NULL, &tmp); true, false, NULL, &tmp);
if (NULL == tmp) { if (NULL == tmp) {
@ -202,30 +236,38 @@ static int alps_set_name(void)
} }
if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) { if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return(rc); return rc;
} }
free(tmp); free(tmp);
mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid", mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
true, false, NULL, &tmp); true, false, NULL, &tmp);
if (NULL == tmp) { if (NULL == tmp) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
} }
if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid, tmp))) { if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid,
tmp))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return(rc); return(rc);
} }
free(tmp); free(tmp);
if (ORTE_SUCCESS != (rc = get_vpid(&vpid, starting_vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid; ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,
orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:alps set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); "ess:alps set name to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* get the num procs as provided in the cmd line param */ /* get the num procs as provided in the cmd line param */
if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);