For cases where the prefix of leading characters and zeroes must be stripped from a node name, be sure to strip it everywhere we access node names; otherwise, modex methods such as PMI will fail to correctly identify procs on the same node.
This commit was SVN r27022.
Этот коммит содержится в:
родитель
360dcd5f50
Коммит
b9b41d8662
@ -81,7 +81,7 @@ orte_tmpdir_base = /var/tmp
|
|||||||
orte_allocation_required = 1
|
orte_allocation_required = 1
|
||||||
|
|
||||||
## Deal with the allocator
|
## Deal with the allocator
|
||||||
plm_base_strip_prefix_from_node_names = 1
|
orte_strip_prefix_from_node_names = 1
|
||||||
|
|
||||||
## MPI behavior
|
## MPI behavior
|
||||||
## Do NOT specify mpi_leave_pinned so system
|
## Do NOT specify mpi_leave_pinned so system
|
||||||
|
@ -81,7 +81,7 @@ orte_tmpdir_base = /var/tmp
|
|||||||
orte_allocation_required = 1
|
orte_allocation_required = 1
|
||||||
|
|
||||||
## Deal with the allocator
|
## Deal with the allocator
|
||||||
plm_base_strip_prefix_from_node_names = 1
|
orte_strip_prefix_from_node_names = 1
|
||||||
|
|
||||||
## MPI behavior
|
## MPI behavior
|
||||||
## Do NOT specify mpi_leave_pinned so system
|
## Do NOT specify mpi_leave_pinned so system
|
||||||
|
@ -620,7 +620,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
nodename = ptr;
|
nodename = ptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (orte_plm_globals.strip_prefix_from_node_names) {
|
if (orte_process_info.strip_prefix_from_node_names) {
|
||||||
/* remove all leading characters and zeroes */
|
/* remove all leading characters and zeroes */
|
||||||
ptr = nodename;
|
ptr = nodename;
|
||||||
while (idx < (int)strlen(nodename) &&
|
while (idx < (int)strlen(nodename) &&
|
||||||
|
@ -89,8 +89,6 @@ orte_plm_base_module_t orte_plm = {
|
|||||||
*/
|
*/
|
||||||
int orte_plm_base_open(void)
|
int orte_plm_base_open(void)
|
||||||
{
|
{
|
||||||
int value;
|
|
||||||
|
|
||||||
/* Debugging / verbose output. Always have stream open, with
|
/* Debugging / verbose output. Always have stream open, with
|
||||||
verbose set by the mca open system... */
|
verbose set by the mca open system... */
|
||||||
orte_plm_globals.output = opal_output_open(NULL);
|
orte_plm_globals.output = opal_output_open(NULL);
|
||||||
@ -104,11 +102,6 @@ int orte_plm_base_open(void)
|
|||||||
/* default to assigning daemons to nodes at launch */
|
/* default to assigning daemons to nodes at launch */
|
||||||
orte_plm_globals.daemon_nodes_assigned_at_launch = true;
|
orte_plm_globals.daemon_nodes_assigned_at_launch = true;
|
||||||
|
|
||||||
mca_base_param_reg_int_name("plm", "base_strip_prefix_from_node_names",
|
|
||||||
"Whether to strip leading characters and zeroes from node names returned by daemons",
|
|
||||||
false, false, (int)false, &value);
|
|
||||||
orte_plm_globals.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(value);
|
|
||||||
|
|
||||||
/* Open up all the components that we can find */
|
/* Open up all the components that we can find */
|
||||||
|
|
||||||
if (ORTE_SUCCESS !=
|
if (ORTE_SUCCESS !=
|
||||||
|
@ -58,8 +58,6 @@ typedef struct {
|
|||||||
opal_buffer_t tree_spawn_cmd;
|
opal_buffer_t tree_spawn_cmd;
|
||||||
/* daemon nodes assigned at launch */
|
/* daemon nodes assigned at launch */
|
||||||
bool daemon_nodes_assigned_at_launch;
|
bool daemon_nodes_assigned_at_launch;
|
||||||
/* handle allocator-to-actual nodename matches */
|
|
||||||
bool strip_prefix_from_node_names;
|
|
||||||
} orte_plm_globals_t;
|
} orte_plm_globals_t;
|
||||||
/**
|
/**
|
||||||
* Global instance of PLM framework data
|
* Global instance of PLM framework data
|
||||||
|
@ -31,6 +31,7 @@
|
|||||||
#ifdef HAVE_SYS_TYPES_H
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#endif
|
#endif
|
||||||
|
#include <ctype.h>
|
||||||
|
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
@ -76,7 +77,8 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
|
|||||||
/* .app_rank = */ -1,
|
/* .app_rank = */ -1,
|
||||||
/* .peer_modex = */ -1,
|
/* .peer_modex = */ -1,
|
||||||
/* .peer_init_barrier = */ -1,
|
/* .peer_init_barrier = */ -1,
|
||||||
/* .peer_fini_barrier = */ -1
|
/* .peer_fini_barrier = */ -1,
|
||||||
|
/* .strip_prefix_from_node_names = */ false
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool init=false;
|
static bool init=false;
|
||||||
@ -84,7 +86,7 @@ static bool init=false;
|
|||||||
int orte_proc_info(void)
|
int orte_proc_info(void)
|
||||||
{
|
{
|
||||||
|
|
||||||
int tmp;
|
int tmp, idx;
|
||||||
char *uri, *ptr;
|
char *uri, *ptr;
|
||||||
char hostname[ORTE_MAX_HOSTNAME_SIZE];
|
char hostname[ORTE_MAX_HOSTNAME_SIZE];
|
||||||
|
|
||||||
@ -140,9 +142,33 @@ int orte_proc_info(void)
|
|||||||
/* get the process id */
|
/* get the process id */
|
||||||
orte_process_info.pid = getpid();
|
orte_process_info.pid = getpid();
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("orte", "strip_prefix_from_node_names",
|
||||||
|
"Whether to strip leading characters and zeroes from node names returned by daemons",
|
||||||
|
false, false, (int)false, &tmp);
|
||||||
|
orte_process_info.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(tmp);
|
||||||
|
|
||||||
/* get the nodename */
|
/* get the nodename */
|
||||||
gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
|
gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
|
||||||
|
/* we have to strip node names here, if user directs, to ensure that
|
||||||
|
* the names exchanged in the modex match the names found locally
|
||||||
|
*/
|
||||||
|
if (orte_process_info.strip_prefix_from_node_names) {
|
||||||
|
/* remove all leading characters and zeroes */
|
||||||
|
idx = 0;
|
||||||
|
while (idx < (int)strlen(hostname) &&
|
||||||
|
(hostname[idx] <= '0' || '9' < hostname[idx])) {
|
||||||
|
idx++;
|
||||||
|
}
|
||||||
|
if ((int)strlen(hostname) <= idx) {
|
||||||
|
/* there were no non-zero numbers in the name */
|
||||||
orte_process_info.nodename = strdup(hostname);
|
orte_process_info.nodename = strdup(hostname);
|
||||||
|
} else {
|
||||||
|
orte_process_info.nodename = strdup(&hostname[idx]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
orte_process_info.nodename = strdup(hostname);
|
||||||
|
}
|
||||||
|
opal_output(0, "HOSTNAME: %s", orte_process_info.nodename);
|
||||||
|
|
||||||
/* get the number of nodes in the job */
|
/* get the number of nodes in the job */
|
||||||
mca_base_param_reg_int_name("orte", "num_nodes",
|
mca_base_param_reg_int_name("orte", "num_nodes",
|
||||||
|
@ -127,6 +127,7 @@ struct orte_proc_info_t {
|
|||||||
orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */
|
orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */
|
||||||
orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
|
orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
|
||||||
orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
|
orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
|
||||||
|
bool strip_prefix_from_node_names;
|
||||||
};
|
};
|
||||||
typedef struct orte_proc_info_t orte_proc_info_t;
|
typedef struct orte_proc_info_t orte_proc_info_t;
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user