1
1

Bring alps back to full operation by correctly computing daemon names. Unfortunately, alps doesn't assign cnos ranks in node-based order - i.e., cnos rank=0 isn't necessarily on the first node of the execution - so correct the daemon's vpid based on its node's position in the node list when using static ports.

Add some debug output to nidmap.

Ensure that the HNP's node name is not included in the regex when launching via rshbase as that node is automatically included in the daemon map.

This commit was SVN r25063.
Этот коммит содержится в:
Ralph Castain 2011-08-18 14:59:18 +00:00
родитель a2a20c3766
Коммит e58623cd5b
3 изменённых файлов: 37 добавлений и 20 удалений

Просмотреть файл

@ -73,11 +73,11 @@ orte_ess_base_module_t orte_ess_alps_module = {
* Local variables
*/
static orte_node_rank_t my_node_rank=ORTE_NODE_RANK_INVALID;
static orte_vpid_t starting_vpid=0;
static int rte_init(void)
{
int ret;
int ret, i;
char *error = NULL;
char **hosts = NULL;
@ -96,11 +96,24 @@ static int rte_init(void)
if (ORTE_PROC_IS_DAEMON) {
if (NULL != orte_node_regex) {
/* extract the nodes */
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts)) ||
NULL == hosts) {
error = "orte_regex_extract_node_names";
goto error;
}
}
/* find our host in the list */
for (i=0; NULL != hosts[i]; i++) {
if (0 == strncmp(hosts[i], orte_process_info.nodename, strlen(hosts[i]))) {
/* correct our vpid */
ORTE_PROC_MY_NAME->vpid = starting_vpid + i;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:alps reset name to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
break;
}
}
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_orted_setup";
@ -319,7 +332,6 @@ static int alps_set_name(void)
{
int rc;
orte_jobid_t jobid;
orte_vpid_t starting_vpid;
char* tmp;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
@ -357,16 +369,11 @@ static int alps_set_name(void)
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:alps set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* get my node rank in case we are using static ports - this won't
* be present for daemons, so don't error out if we don't have it
*/
mca_base_param_reg_string_name("orte", "ess_node_rank", "Process node rank",
true, false, NULL, &tmp);
if (NULL != tmp) {
my_node_rank = strtol(tmp, NULL, 10);
/* get the num procs as provided in the cmd line param */
if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
ORTE_ERROR_LOG(rc);
return rc;
}
orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size();
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;

Просмотреть файл

@ -61,6 +61,7 @@
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/bit_ops.h"
#include "opal/util/if.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/show_help.h"
@ -328,13 +329,22 @@ static int spawn(orte_job_t *jdata)
* nodes so we can use a regex to pass connection info
*/
if (orte_static_ports) {
nodelist = NULL;
for (nnode=0; nnode < map->nodes->size; nnode++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
continue;
}
/* if this is me, then don't include it - I'm already present
* in the cmd line options
*/
if (0 == strcmp(node->name, orte_process_info.nodename) || opal_ifislocal(node->name)) {
continue;
}
opal_argv_append_nosize(&nodes, node->name);
}
nodelist = opal_argv_join(nodes, ',');
if (0 < opal_argv_count(nodes)) {
nodelist = opal_argv_join(nodes, ',');
}
opal_argv_free(nodes);
}

Просмотреть файл

@ -196,7 +196,8 @@ int orte_util_build_daemon_nidmap(char **nodes)
num_nodes = opal_argv_count(nodes);
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"orte:util:build:daemon:nidmap found %d nodes", num_nodes));
"%s orte:util:build:daemon:nidmap found %d nodes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes));
if (0 == num_nodes) {
/* nothing to do */
@ -240,11 +241,6 @@ int orte_util_build_daemon_nidmap(char **nodes)
}
addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]);
OPAL_OUTPUT_VERBOSE((3, orte_debug_output,
"%s orte:util:build:daemon:nidmap node %s daemon %d addr %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name, (int)node->daemon, addr));
/* since we are using static ports, all my fellow daemons will be on my
* port. Setup the contact info for each daemon in my hash tables. Note
* that this will -not- open a port to those daemons, but will only
@ -257,6 +253,10 @@ int orte_util_build_daemon_nidmap(char **nodes)
orte_util_convert_process_name_to_string(&proc_name, &proc);
asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s orte:util:build:daemon:nidmap node %s daemon %d addr %s uri %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name, (int)node->daemon, addr, uri));
opal_dss.pack(&buf, &uri, 1, OPAL_STRING);
free(proc_name);
free(uri);