For cases where the alpha+non-zero prefix must be removed from a node name, be sure to do it everywhere we access node names - otherwise, modex methods such as pmi will fail to correctly identify procs on the same node
This commit was SVN r27022.
Этот коммит содержится в:
родитель
360dcd5f50
Коммит
b9b41d8662
@ -81,7 +81,7 @@ orte_tmpdir_base = /var/tmp
|
||||
orte_allocation_required = 1
|
||||
|
||||
## Deal with the allocator
|
||||
plm_base_strip_prefix_from_node_names = 1
|
||||
orte_strip_prefix_from_node_names = 1
|
||||
|
||||
## MPI behavior
|
||||
## Do NOT specify mpi_leave_pinned so system
|
||||
|
@ -81,7 +81,7 @@ orte_tmpdir_base = /var/tmp
|
||||
orte_allocation_required = 1
|
||||
|
||||
## Deal with the allocator
|
||||
plm_base_strip_prefix_from_node_names = 1
|
||||
orte_strip_prefix_from_node_names = 1
|
||||
|
||||
## MPI behavior
|
||||
## Do NOT specify mpi_leave_pinned so system
|
||||
|
@ -620,7 +620,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
||||
nodename = ptr;
|
||||
}
|
||||
}
|
||||
if (orte_plm_globals.strip_prefix_from_node_names) {
|
||||
if (orte_process_info.strip_prefix_from_node_names) {
|
||||
/* remove all leading characters and zeroes */
|
||||
ptr = nodename;
|
||||
while (idx < (int)strlen(nodename) &&
|
||||
|
@ -89,8 +89,6 @@ orte_plm_base_module_t orte_plm = {
|
||||
*/
|
||||
int orte_plm_base_open(void)
|
||||
{
|
||||
int value;
|
||||
|
||||
/* Debugging / verbose output. Always have stream open, with
|
||||
verbose set by the mca open system... */
|
||||
orte_plm_globals.output = opal_output_open(NULL);
|
||||
@ -104,11 +102,6 @@ int orte_plm_base_open(void)
|
||||
/* default to assigning daemons to nodes at launch */
|
||||
orte_plm_globals.daemon_nodes_assigned_at_launch = true;
|
||||
|
||||
mca_base_param_reg_int_name("plm", "base_strip_prefix_from_node_names",
|
||||
"Whether to strip leading characters and zeroes from node names returned by daemons",
|
||||
false, false, (int)false, &value);
|
||||
orte_plm_globals.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* Open up all the components that we can find */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
|
@ -58,8 +58,6 @@ typedef struct {
|
||||
opal_buffer_t tree_spawn_cmd;
|
||||
/* daemon nodes assigned at launch */
|
||||
bool daemon_nodes_assigned_at_launch;
|
||||
/* handle allocator-to-actual nodename matches */
|
||||
bool strip_prefix_from_node_names;
|
||||
} orte_plm_globals_t;
|
||||
/**
|
||||
* Global instance of PLM framework data
|
||||
|
@ -31,6 +31,7 @@
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
@ -41,42 +42,43 @@
|
||||
#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID}
|
||||
|
||||
ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
|
||||
/* .my_name = */ ORTE_NAME_INVALID,
|
||||
/* .my_daemon = */ ORTE_NAME_INVALID,
|
||||
/* .my_daemon_uri = */ NULL,
|
||||
/* .my_hnp = */ ORTE_NAME_INVALID,
|
||||
/* .my_hnp_uri = */ NULL,
|
||||
/* .my_parent = */ ORTE_NAME_INVALID,
|
||||
/* .hnp_pid = */ 0,
|
||||
/* .app_num = */ 0,
|
||||
/* .num_procs = */ 1,
|
||||
/* .max_procs = */ 1,
|
||||
/* .num_daemons = */ 1,
|
||||
/* .num_nodes = */ 1,
|
||||
/* .nodename = */ NULL,
|
||||
/* .pid = */ 0,
|
||||
/* .proc_type = */ ORTE_PROC_TYPE_NONE,
|
||||
/* .sync_buf = */ NULL,
|
||||
/* .my_port = */ 0,
|
||||
/* .num_restarts = */ 0,
|
||||
/* .my_node_rank = */ ORTE_NODE_RANK_INVALID,
|
||||
/* .my_local_rank = */ ORTE_LOCAL_RANK_INVALID,
|
||||
/* .num_local_peers = */ 0,
|
||||
/* .tmpdir_base = */ NULL,
|
||||
/* .top_session_dir = */ NULL,
|
||||
/* .job_session_dir = */ NULL,
|
||||
/* .proc_session_dir = */ NULL,
|
||||
/* .sock_stdin = */ NULL,
|
||||
/* .sock_stdout = */ NULL,
|
||||
/* .sock_stderr = */ NULL,
|
||||
/* .my_name = */ ORTE_NAME_INVALID,
|
||||
/* .my_daemon = */ ORTE_NAME_INVALID,
|
||||
/* .my_daemon_uri = */ NULL,
|
||||
/* .my_hnp = */ ORTE_NAME_INVALID,
|
||||
/* .my_hnp_uri = */ NULL,
|
||||
/* .my_parent = */ ORTE_NAME_INVALID,
|
||||
/* .hnp_pid = */ 0,
|
||||
/* .app_num = */ 0,
|
||||
/* .num_procs = */ 1,
|
||||
/* .max_procs = */ 1,
|
||||
/* .num_daemons = */ 1,
|
||||
/* .num_nodes = */ 1,
|
||||
/* .nodename = */ NULL,
|
||||
/* .pid = */ 0,
|
||||
/* .proc_type = */ ORTE_PROC_TYPE_NONE,
|
||||
/* .sync_buf = */ NULL,
|
||||
/* .my_port = */ 0,
|
||||
/* .num_restarts = */ 0,
|
||||
/* .my_node_rank = */ ORTE_NODE_RANK_INVALID,
|
||||
/* .my_local_rank = */ ORTE_LOCAL_RANK_INVALID,
|
||||
/* .num_local_peers = */ 0,
|
||||
/* .tmpdir_base = */ NULL,
|
||||
/* .top_session_dir = */ NULL,
|
||||
/* .job_session_dir = */ NULL,
|
||||
/* .proc_session_dir = */ NULL,
|
||||
/* .sock_stdin = */ NULL,
|
||||
/* .sock_stdout = */ NULL,
|
||||
/* .sock_stderr = */ NULL,
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* .bind_level = */ OPAL_HWLOC_NODE_LEVEL,
|
||||
/* .bind_idx = */ 0,
|
||||
/* .bind_level = */ OPAL_HWLOC_NODE_LEVEL,
|
||||
/* .bind_idx = */ 0,
|
||||
#endif
|
||||
/* .app_rank = */ -1,
|
||||
/* .peer_modex = */ -1,
|
||||
/* .peer_init_barrier = */ -1,
|
||||
/* .peer_fini_barrier = */ -1
|
||||
/* .app_rank = */ -1,
|
||||
/* .peer_modex = */ -1,
|
||||
/* .peer_init_barrier = */ -1,
|
||||
/* .peer_fini_barrier = */ -1,
|
||||
/* .strip_prefix_from_node_names = */ false
|
||||
};
|
||||
|
||||
static bool init=false;
|
||||
@ -84,7 +86,7 @@ static bool init=false;
|
||||
int orte_proc_info(void)
|
||||
{
|
||||
|
||||
int tmp;
|
||||
int tmp, idx;
|
||||
char *uri, *ptr;
|
||||
char hostname[ORTE_MAX_HOSTNAME_SIZE];
|
||||
|
||||
@ -140,10 +142,34 @@ int orte_proc_info(void)
|
||||
/* get the process id */
|
||||
orte_process_info.pid = getpid();
|
||||
|
||||
mca_base_param_reg_int_name("orte", "strip_prefix_from_node_names",
|
||||
"Whether to strip leading characters and zeroes from node names returned by daemons",
|
||||
false, false, (int)false, &tmp);
|
||||
orte_process_info.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
/* get the nodename */
|
||||
gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
|
||||
orte_process_info.nodename = strdup(hostname);
|
||||
|
||||
/* we have to strip node names here, if user directs, to ensure that
|
||||
* the names exchanged in the modex match the names found locally
|
||||
*/
|
||||
if (orte_process_info.strip_prefix_from_node_names) {
|
||||
/* remove all leading characters and zeroes */
|
||||
idx = 0;
|
||||
while (idx < (int)strlen(hostname) &&
|
||||
(hostname[idx] <= '0' || '9' < hostname[idx])) {
|
||||
idx++;
|
||||
}
|
||||
if ((int)strlen(hostname) <= idx) {
|
||||
/* there were no non-zero numbers in the name */
|
||||
orte_process_info.nodename = strdup(hostname);
|
||||
} else {
|
||||
orte_process_info.nodename = strdup(&hostname[idx]);
|
||||
}
|
||||
} else {
|
||||
orte_process_info.nodename = strdup(hostname);
|
||||
}
|
||||
opal_output(0, "HOSTNAME: %s", orte_process_info.nodename);
|
||||
|
||||
/* get the number of nodes in the job */
|
||||
mca_base_param_reg_int_name("orte", "num_nodes",
|
||||
"Number of nodes in the job",
|
||||
|
@ -127,6 +127,7 @@ struct orte_proc_info_t {
|
||||
orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */
|
||||
orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
|
||||
orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
|
||||
bool strip_prefix_from_node_names;
|
||||
};
|
||||
typedef struct orte_proc_info_t orte_proc_info_t;
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user