1
1

For cases where the leading non-digit characters and zeroes must be stripped from a node name (e.g., "nid00012" becomes "12"), be sure to do it everywhere we access node names. Otherwise, modex methods such as PMI will fail to correctly identify procs on the same node.

This commit was SVN r27022.
Этот коммит содержится в:
Ralph Castain 2012-08-13 20:44:56 +00:00
родитель 360dcd5f50
Коммит b9b41d8662
7 изменённых файлов: 67 добавлений и 49 удалений

Просмотреть файл

@ -81,7 +81,7 @@ orte_tmpdir_base = /var/tmp
orte_allocation_required = 1 orte_allocation_required = 1
## Deal with the allocator ## Deal with the allocator
plm_base_strip_prefix_from_node_names = 1 orte_strip_prefix_from_node_names = 1
## MPI behavior ## MPI behavior
## Do NOT specify mpi_leave_pinned so system ## Do NOT specify mpi_leave_pinned so system

Просмотреть файл

@ -81,7 +81,7 @@ orte_tmpdir_base = /var/tmp
orte_allocation_required = 1 orte_allocation_required = 1
## Deal with the allocator ## Deal with the allocator
plm_base_strip_prefix_from_node_names = 1 orte_strip_prefix_from_node_names = 1
## MPI behavior ## MPI behavior
## Do NOT specify mpi_leave_pinned so system ## Do NOT specify mpi_leave_pinned so system

Просмотреть файл

@ -620,7 +620,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
nodename = ptr; nodename = ptr;
} }
} }
if (orte_plm_globals.strip_prefix_from_node_names) { if (orte_process_info.strip_prefix_from_node_names) {
/* remove all leading characters and zeroes */ /* remove all leading characters and zeroes */
ptr = nodename; ptr = nodename;
while (idx < (int)strlen(nodename) && while (idx < (int)strlen(nodename) &&

Просмотреть файл

@ -89,8 +89,6 @@ orte_plm_base_module_t orte_plm = {
*/ */
int orte_plm_base_open(void) int orte_plm_base_open(void)
{ {
int value;
/* Debugging / verbose output. Always have stream open, with /* Debugging / verbose output. Always have stream open, with
verbose set by the mca open system... */ verbose set by the mca open system... */
orte_plm_globals.output = opal_output_open(NULL); orte_plm_globals.output = opal_output_open(NULL);
@ -104,11 +102,6 @@ int orte_plm_base_open(void)
/* default to assigning daemons to nodes at launch */ /* default to assigning daemons to nodes at launch */
orte_plm_globals.daemon_nodes_assigned_at_launch = true; orte_plm_globals.daemon_nodes_assigned_at_launch = true;
mca_base_param_reg_int_name("plm", "base_strip_prefix_from_node_names",
"Whether to strip leading characters and zeroes from node names returned by daemons",
false, false, (int)false, &value);
orte_plm_globals.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(value);
/* Open up all the components that we can find */ /* Open up all the components that we can find */
if (ORTE_SUCCESS != if (ORTE_SUCCESS !=

Просмотреть файл

@ -58,8 +58,6 @@ typedef struct {
opal_buffer_t tree_spawn_cmd; opal_buffer_t tree_spawn_cmd;
/* daemon nodes assigned at launch */ /* daemon nodes assigned at launch */
bool daemon_nodes_assigned_at_launch; bool daemon_nodes_assigned_at_launch;
/* handle allocator-to-actual nodename matches */
bool strip_prefix_from_node_names;
} orte_plm_globals_t; } orte_plm_globals_t;
/** /**
* Global instance of PLM framework data * Global instance of PLM framework data

Просмотреть файл

@ -31,6 +31,7 @@
#ifdef HAVE_SYS_TYPES_H #ifdef HAVE_SYS_TYPES_H
#include <sys/types.h> #include <sys/types.h>
#endif #endif
#include <ctype.h>
#include "opal/mca/base/base.h" #include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
@ -76,7 +77,8 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .app_rank = */ -1, /* .app_rank = */ -1,
/* .peer_modex = */ -1, /* .peer_modex = */ -1,
/* .peer_init_barrier = */ -1, /* .peer_init_barrier = */ -1,
/* .peer_fini_barrier = */ -1 /* .peer_fini_barrier = */ -1,
/* .strip_prefix_from_node_names = */ false
}; };
static bool init=false; static bool init=false;
@ -84,7 +86,7 @@ static bool init=false;
int orte_proc_info(void) int orte_proc_info(void)
{ {
int tmp; int tmp, idx;
char *uri, *ptr; char *uri, *ptr;
char hostname[ORTE_MAX_HOSTNAME_SIZE]; char hostname[ORTE_MAX_HOSTNAME_SIZE];
@ -140,9 +142,33 @@ int orte_proc_info(void)
/* get the process id */ /* get the process id */
orte_process_info.pid = getpid(); orte_process_info.pid = getpid();
mca_base_param_reg_int_name("orte", "strip_prefix_from_node_names",
"Whether to strip leading characters and zeroes from node names returned by daemons",
false, false, (int)false, &tmp);
orte_process_info.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(tmp);
/* get the nodename */ /* get the nodename */
gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE); gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
/* we have to strip node names here, if user directs, to ensure that
* the names exchanged in the modex match the names found locally
*/
if (orte_process_info.strip_prefix_from_node_names) {
/* remove all leading characters and zeroes */
idx = 0;
while (idx < (int)strlen(hostname) &&
(hostname[idx] <= '0' || '9' < hostname[idx])) {
idx++;
}
if ((int)strlen(hostname) <= idx) {
/* there were no non-zero numbers in the name */
orte_process_info.nodename = strdup(hostname); orte_process_info.nodename = strdup(hostname);
} else {
orte_process_info.nodename = strdup(&hostname[idx]);
}
} else {
orte_process_info.nodename = strdup(hostname);
}
opal_output(0, "HOSTNAME: %s", orte_process_info.nodename);
/* get the number of nodes in the job */ /* get the number of nodes in the job */
mca_base_param_reg_int_name("orte", "num_nodes", mca_base_param_reg_int_name("orte", "num_nodes",

Просмотреть файл

@ -127,6 +127,7 @@ struct orte_proc_info_t {
orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */ orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */
orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */ orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */ orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
bool strip_prefix_from_node_names;
}; };
typedef struct orte_proc_info_t orte_proc_info_t; typedef struct orte_proc_info_t orte_proc_info_t;