For cases where the prefix of leading non-digit characters and zeroes must be stripped from a node name, be sure to strip it everywhere we access node names; otherwise, modex methods such as PMI will fail to correctly identify procs on the same node.
This commit was SVN r27022.
Этот коммит содержится в:
родитель
360dcd5f50
Коммит
b9b41d8662
@ -81,7 +81,7 @@ orte_tmpdir_base = /var/tmp
|
|||||||
orte_allocation_required = 1
|
orte_allocation_required = 1
|
||||||
|
|
||||||
## Deal with the allocator
|
## Deal with the allocator
|
||||||
plm_base_strip_prefix_from_node_names = 1
|
orte_strip_prefix_from_node_names = 1
|
||||||
|
|
||||||
## MPI behavior
|
## MPI behavior
|
||||||
## Do NOT specify mpi_leave_pinned so system
|
## Do NOT specify mpi_leave_pinned so system
|
||||||
|
@ -81,7 +81,7 @@ orte_tmpdir_base = /var/tmp
|
|||||||
orte_allocation_required = 1
|
orte_allocation_required = 1
|
||||||
|
|
||||||
## Deal with the allocator
|
## Deal with the allocator
|
||||||
plm_base_strip_prefix_from_node_names = 1
|
orte_strip_prefix_from_node_names = 1
|
||||||
|
|
||||||
## MPI behavior
|
## MPI behavior
|
||||||
## Do NOT specify mpi_leave_pinned so system
|
## Do NOT specify mpi_leave_pinned so system
|
||||||
|
@ -620,7 +620,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
nodename = ptr;
|
nodename = ptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (orte_plm_globals.strip_prefix_from_node_names) {
|
if (orte_process_info.strip_prefix_from_node_names) {
|
||||||
/* remove all leading characters and zeroes */
|
/* remove all leading characters and zeroes */
|
||||||
ptr = nodename;
|
ptr = nodename;
|
||||||
while (idx < (int)strlen(nodename) &&
|
while (idx < (int)strlen(nodename) &&
|
||||||
|
@ -89,8 +89,6 @@ orte_plm_base_module_t orte_plm = {
|
|||||||
*/
|
*/
|
||||||
int orte_plm_base_open(void)
|
int orte_plm_base_open(void)
|
||||||
{
|
{
|
||||||
int value;
|
|
||||||
|
|
||||||
/* Debugging / verbose output. Always have stream open, with
|
/* Debugging / verbose output. Always have stream open, with
|
||||||
verbose set by the mca open system... */
|
verbose set by the mca open system... */
|
||||||
orte_plm_globals.output = opal_output_open(NULL);
|
orte_plm_globals.output = opal_output_open(NULL);
|
||||||
@ -104,11 +102,6 @@ int orte_plm_base_open(void)
|
|||||||
/* default to assigning daemons to nodes at launch */
|
/* default to assigning daemons to nodes at launch */
|
||||||
orte_plm_globals.daemon_nodes_assigned_at_launch = true;
|
orte_plm_globals.daemon_nodes_assigned_at_launch = true;
|
||||||
|
|
||||||
mca_base_param_reg_int_name("plm", "base_strip_prefix_from_node_names",
|
|
||||||
"Whether to strip leading characters and zeroes from node names returned by daemons",
|
|
||||||
false, false, (int)false, &value);
|
|
||||||
orte_plm_globals.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(value);
|
|
||||||
|
|
||||||
/* Open up all the components that we can find */
|
/* Open up all the components that we can find */
|
||||||
|
|
||||||
if (ORTE_SUCCESS !=
|
if (ORTE_SUCCESS !=
|
||||||
|
@ -58,8 +58,6 @@ typedef struct {
|
|||||||
opal_buffer_t tree_spawn_cmd;
|
opal_buffer_t tree_spawn_cmd;
|
||||||
/* daemon nodes assigned at launch */
|
/* daemon nodes assigned at launch */
|
||||||
bool daemon_nodes_assigned_at_launch;
|
bool daemon_nodes_assigned_at_launch;
|
||||||
/* handle allocator-to-actual nodename matches */
|
|
||||||
bool strip_prefix_from_node_names;
|
|
||||||
} orte_plm_globals_t;
|
} orte_plm_globals_t;
|
||||||
/**
|
/**
|
||||||
* Global instance of PLM framework data
|
* Global instance of PLM framework data
|
||||||
|
@ -31,6 +31,7 @@
|
|||||||
#ifdef HAVE_SYS_TYPES_H
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#endif
|
#endif
|
||||||
|
#include <ctype.h>
|
||||||
|
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
@ -41,42 +42,43 @@
|
|||||||
#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID}
|
#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID}
|
||||||
|
|
||||||
ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
|
ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
|
||||||
/* .my_name = */ ORTE_NAME_INVALID,
|
/* .my_name = */ ORTE_NAME_INVALID,
|
||||||
/* .my_daemon = */ ORTE_NAME_INVALID,
|
/* .my_daemon = */ ORTE_NAME_INVALID,
|
||||||
/* .my_daemon_uri = */ NULL,
|
/* .my_daemon_uri = */ NULL,
|
||||||
/* .my_hnp = */ ORTE_NAME_INVALID,
|
/* .my_hnp = */ ORTE_NAME_INVALID,
|
||||||
/* .my_hnp_uri = */ NULL,
|
/* .my_hnp_uri = */ NULL,
|
||||||
/* .my_parent = */ ORTE_NAME_INVALID,
|
/* .my_parent = */ ORTE_NAME_INVALID,
|
||||||
/* .hnp_pid = */ 0,
|
/* .hnp_pid = */ 0,
|
||||||
/* .app_num = */ 0,
|
/* .app_num = */ 0,
|
||||||
/* .num_procs = */ 1,
|
/* .num_procs = */ 1,
|
||||||
/* .max_procs = */ 1,
|
/* .max_procs = */ 1,
|
||||||
/* .num_daemons = */ 1,
|
/* .num_daemons = */ 1,
|
||||||
/* .num_nodes = */ 1,
|
/* .num_nodes = */ 1,
|
||||||
/* .nodename = */ NULL,
|
/* .nodename = */ NULL,
|
||||||
/* .pid = */ 0,
|
/* .pid = */ 0,
|
||||||
/* .proc_type = */ ORTE_PROC_TYPE_NONE,
|
/* .proc_type = */ ORTE_PROC_TYPE_NONE,
|
||||||
/* .sync_buf = */ NULL,
|
/* .sync_buf = */ NULL,
|
||||||
/* .my_port = */ 0,
|
/* .my_port = */ 0,
|
||||||
/* .num_restarts = */ 0,
|
/* .num_restarts = */ 0,
|
||||||
/* .my_node_rank = */ ORTE_NODE_RANK_INVALID,
|
/* .my_node_rank = */ ORTE_NODE_RANK_INVALID,
|
||||||
/* .my_local_rank = */ ORTE_LOCAL_RANK_INVALID,
|
/* .my_local_rank = */ ORTE_LOCAL_RANK_INVALID,
|
||||||
/* .num_local_peers = */ 0,
|
/* .num_local_peers = */ 0,
|
||||||
/* .tmpdir_base = */ NULL,
|
/* .tmpdir_base = */ NULL,
|
||||||
/* .top_session_dir = */ NULL,
|
/* .top_session_dir = */ NULL,
|
||||||
/* .job_session_dir = */ NULL,
|
/* .job_session_dir = */ NULL,
|
||||||
/* .proc_session_dir = */ NULL,
|
/* .proc_session_dir = */ NULL,
|
||||||
/* .sock_stdin = */ NULL,
|
/* .sock_stdin = */ NULL,
|
||||||
/* .sock_stdout = */ NULL,
|
/* .sock_stdout = */ NULL,
|
||||||
/* .sock_stderr = */ NULL,
|
/* .sock_stderr = */ NULL,
|
||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
/* .bind_level = */ OPAL_HWLOC_NODE_LEVEL,
|
/* .bind_level = */ OPAL_HWLOC_NODE_LEVEL,
|
||||||
/* .bind_idx = */ 0,
|
/* .bind_idx = */ 0,
|
||||||
#endif
|
#endif
|
||||||
/* .app_rank = */ -1,
|
/* .app_rank = */ -1,
|
||||||
/* .peer_modex = */ -1,
|
/* .peer_modex = */ -1,
|
||||||
/* .peer_init_barrier = */ -1,
|
/* .peer_init_barrier = */ -1,
|
||||||
/* .peer_fini_barrier = */ -1
|
/* .peer_fini_barrier = */ -1,
|
||||||
|
/* .strip_prefix_from_node_names = */ false
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool init=false;
|
static bool init=false;
|
||||||
@ -84,7 +86,7 @@ static bool init=false;
|
|||||||
int orte_proc_info(void)
|
int orte_proc_info(void)
|
||||||
{
|
{
|
||||||
|
|
||||||
int tmp;
|
int tmp, idx;
|
||||||
char *uri, *ptr;
|
char *uri, *ptr;
|
||||||
char hostname[ORTE_MAX_HOSTNAME_SIZE];
|
char hostname[ORTE_MAX_HOSTNAME_SIZE];
|
||||||
|
|
||||||
@ -140,10 +142,34 @@ int orte_proc_info(void)
|
|||||||
/* get the process id */
|
/* get the process id */
|
||||||
orte_process_info.pid = getpid();
|
orte_process_info.pid = getpid();
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("orte", "strip_prefix_from_node_names",
|
||||||
|
"Whether to strip leading characters and zeroes from node names returned by daemons",
|
||||||
|
false, false, (int)false, &tmp);
|
||||||
|
orte_process_info.strip_prefix_from_node_names = OPAL_INT_TO_BOOL(tmp);
|
||||||
|
|
||||||
/* get the nodename */
|
/* get the nodename */
|
||||||
gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
|
gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
|
||||||
orte_process_info.nodename = strdup(hostname);
|
/* we have to strip node names here, if user directs, to ensure that
|
||||||
|
* the names exchanged in the modex match the names found locally
|
||||||
|
*/
|
||||||
|
if (orte_process_info.strip_prefix_from_node_names) {
|
||||||
|
/* remove all leading characters and zeroes */
|
||||||
|
idx = 0;
|
||||||
|
while (idx < (int)strlen(hostname) &&
|
||||||
|
(hostname[idx] <= '0' || '9' < hostname[idx])) {
|
||||||
|
idx++;
|
||||||
|
}
|
||||||
|
if ((int)strlen(hostname) <= idx) {
|
||||||
|
/* there were no non-zero numbers in the name */
|
||||||
|
orte_process_info.nodename = strdup(hostname);
|
||||||
|
} else {
|
||||||
|
orte_process_info.nodename = strdup(&hostname[idx]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
orte_process_info.nodename = strdup(hostname);
|
||||||
|
}
|
||||||
|
opal_output(0, "HOSTNAME: %s", orte_process_info.nodename);
|
||||||
|
|
||||||
/* get the number of nodes in the job */
|
/* get the number of nodes in the job */
|
||||||
mca_base_param_reg_int_name("orte", "num_nodes",
|
mca_base_param_reg_int_name("orte", "num_nodes",
|
||||||
"Number of nodes in the job",
|
"Number of nodes in the job",
|
||||||
|
@ -127,6 +127,7 @@ struct orte_proc_info_t {
|
|||||||
orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */
|
orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */
|
||||||
orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
|
orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */
|
||||||
orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
|
orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */
|
||||||
|
bool strip_prefix_from_node_names;
|
||||||
};
|
};
|
||||||
typedef struct orte_proc_info_t orte_proc_info_t;
|
typedef struct orte_proc_info_t orte_proc_info_t;
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user