Bring in a forgotten speed improvement for the TM launcher that was developed during SNL Tbird testing last year: remove the redundant and slow calls to TM to resolve hostnames. Instead, read the host info from the PBS nodefile during the RAS, and then just reuse that info in the PLS rather than querying TM for it again.
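To make the RAS-side idea concrete, here is a minimal, self-contained C sketch (an illustration, not the ORTE code itself): locate the PBS-generated nodefile via $PBS_JOBID, read one hostname per line, and remember each line's index, which becomes that node's launch_id. The /var/torque/aux default mirrors the nodefile_dir MCA parameter registered in the diff below; everything else here is made up for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    char path[1024], line[512];
    const char *jobid = getenv("PBS_JOBID");   /* set by PBS for every job */

    if (NULL == jobid) {
        fprintf(stderr, "not running under a PBS job\n");
        return 1;
    }
    /* the nodefile is named after the job id, one hostname per line
       (one line per VCPU, so hostnames may repeat) */
    snprintf(path, sizeof(path), "/var/torque/aux/%s", jobid);
    FILE *fp = fopen(path, "r");
    if (NULL == fp) {
        perror(path);
        return 1;
    }
    int launch_id = 0;   /* index into the array of allocated nodes */
    while (NULL != fgets(line, sizeof(line), fp)) {
        line[strcspn(line, "\n")] = '\0';      /* strip trailing newline */
        printf("launch_id %d -> host %s\n", launch_id, line);
        ++launch_id;
    }
    fclose(fp);
    return 0;
}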
Adjust the RMAPS mapped_node object to propagate the launch_id info now included in the ras_node object. This supports the few systems that don't launch by nodename but instead want some id (typically an index into the array of allocated nodes). The value gets set for each node in the RAS; the RMAPS just propagates it for easy launch. This commit was SVN r13581.
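The PLS-side payoff in miniature: once the map carries launch_id, the launcher can hand it straight to tm_spawn() as the tm_node_id, exactly as the pls_tm hunk below does, and the whole hostname-to-node-id resolution machinery disappears. This is a hedged sketch assuming <tm.h> from PBS/Torque and a prior successful tm_init(); spawn_on_node is a hypothetical helper, not part of the commit.

#include <stdint.h>
#include <tm.h>    /* PBS/Torque task-management API */

/* Hypothetical helper: start one process on the node whose launch_id the
 * RAS recorded.  The id doubles as the tm_node_id, so the old
 * do_tm_resolve()/pls_tm_start_proc() pair collapses into one tm_spawn(). */
static int spawn_on_node(int32_t launch_id, int argc, char **argv, char **env,
                         tm_task_id *task_id, tm_event_t *event)
{
    int rc = tm_spawn(argc, argv, env, (tm_node_id)launch_id, task_id, event);
    return (TM_SUCCESS == rc) ? 0 : -1;
}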
This commit is contained in:
parent 64bf42fc0d
commit 5818a32245
@@ -89,20 +89,11 @@ static int pls_tm_finalize(void);
 static int pls_tm_connect(void);
 static int pls_tm_disconnect(void);
-static int pls_tm_query_hostnames(void);
-static int pls_tm_start_proc(char *nodename, int argc, char **argv,
-                             char **env, tm_task_id *task_id,
-                             tm_event_t *event);
 static int pls_tm_check_path(char *exe, char **env);
 
 /*
  * Local variables
  */
-/* Resolving TM hostname */
-static char **tm_hostnames = NULL;
-static tm_node_id *tm_node_ids = NULL;
-static int num_tm_hostnames = 0, num_node_ids = 0;
-
 
 
 /*

@@ -298,15 +289,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
     }
     connected = true;
-
-    /* Resolve the TM hostnames and TD node ID's (guarantee that we
-       don't mix any of these TM events in with the TM spawn events,
-       so that we can poll for each set of events without interference
-       from the other */
-    rc = pls_tm_query_hostnames();
-    if (ORTE_SUCCESS != rc) {
-        goto cleanup;
-    }
 
     /* Figure out the basenames for the libdir and bindir.  There is a
        lengthy comment about this in pls_rsh_module.c explaining all
       the rationale for how / why we're doing this. */

@@ -440,9 +422,11 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
             }
         }
 
-        rc = pls_tm_start_proc(node->nodename, argc, argv, env,
-                               tm_task_ids + launched,
-                               tm_events + launched);
+        rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
+        if (TM_SUCCESS != rc) {
+            return ORTE_ERROR;
+        }
+
         if (ORTE_SUCCESS != rc) {
             opal_output(0, "pls:tm: start_procs returned error %d", rc);
             goto cleanup;

@@ -689,17 +673,6 @@ static int pls_tm_finalize(void)
     if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {
         ORTE_ERROR_LOG(rc);
     }
-
-    if (NULL != tm_hostnames) {
-        opal_argv_free(tm_hostnames);
-        tm_hostnames = NULL;
-        num_tm_hostnames = 0;
-    }
-    if (NULL != tm_node_ids) {
-        free(tm_node_ids);
-        tm_node_ids = NULL;
-        num_node_ids = 0;
-    }
 
     return ORTE_SUCCESS;
 }

@@ -739,142 +712,6 @@ static int pls_tm_disconnect(void)
 }
 
 
-/*
- * For a given TM node ID, get the string hostname corresponding to
- * it.
- */
-static char *get_tm_hostname(tm_node_id node)
-{
-    char *hostname;
-    char buffer[256];
-    int ret, local_errno;
-    tm_event_t event;
-    char **argv;
-
-    /* Get the info string corresponding to this TM node ID */
-
-    ret = tm_rescinfo(node, buffer, sizeof(buffer) - 1, &event);
-    if (TM_SUCCESS != ret) {
-        opal_output(0, "tm_rescinfo returned %d\n", ret);
-        return NULL;
-    }
-
-    /* Now wait for that event to happen */
-
-    ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
-    if (TM_SUCCESS != ret) {
-        opal_output(0, "tm_poll returned %d\n", ret);
-        return NULL;
-    }
-
-    /* According to the TM man page, we get back a space-separated
-       string array.  The hostname is the second item.  Use a cheap
-       trick to get it. */
-
-    buffer[sizeof(buffer) - 1] = '\0';
-    argv = opal_argv_split(buffer, ' ');
-    if (NULL == argv) {
-        opal_output(0, "opal_argv_split failed\n");
-        return NULL;
-    }
-    hostname = strdup(argv[1]);
-    opal_argv_free(argv);
-
-    /* All done */
-
-    return hostname;
-}
-
-
-static int pls_tm_query_hostnames(void)
-{
-    char *h;
-    int i, ret;
-
-    /* Get the list of nodes allocated in this PBS job */
-
-    ret = tm_nodeinfo(&tm_node_ids, &num_node_ids);
-    if (TM_SUCCESS != ret) {
-        return ORTE_ERR_NOT_FOUND;
-    }
-
-    /* TM "nodes" may actually correspond to PBS "VCPUs", which means
-       there may be multiple "TM nodes" that correspond to the same
-       physical node.  This doesn't really affect what we're doing
-       here (we actually ignore the fact that they're duplicates --
-       slightly inefficient, but no big deal); just mentioned for
-       completeness... */
-
-    tm_hostnames = NULL;
-    num_tm_hostnames = 0;
-    for (i = 0; i < num_node_ids; ++i) {
-        h = get_tm_hostname(tm_node_ids[i]);
-        if (NULL == h) {
-            opal_output(0, "get_tm_hostname returned NULL");
-            return ORTE_ERROR;
-        }
-        opal_argv_append(&num_tm_hostnames, &tm_hostnames, h);
-        free(h);
-    }
-
-    /* All done */
-
-    return ORTE_SUCCESS;
-}
-
-static int do_tm_resolve(char *hostname, tm_node_id *tnodeid)
-{
-    int i, ret;
-
-    /* Have we already queried TM for all the node info? */
-    if (NULL == tm_hostnames) {
-        return ORTE_ERR_NOT_FOUND;
-    }
-
-    /* Find the TM ID of the hostname that we're looking for */
-    for (i = 0; i < num_tm_hostnames; ++i) {
-        if (0 == strcmp(hostname, tm_hostnames[i])) {
-            *tnodeid = tm_node_ids[i];
-            if (mca_pls_tm_component.debug) {
-                opal_output(0, "pls:tm:launch: resolved host %s to node ID %d",
-                            hostname, tm_node_ids[i]);
-            }
-            break;
-        }
-    }
-
-    /* All done */
-    if (i < num_tm_hostnames) {
-        ret = ORTE_SUCCESS;
-    } else {
-        ret = ORTE_ERR_NOT_FOUND;
-    }
-
-    return ret;
-}
-
-
-static int pls_tm_start_proc(char *nodename, int argc, char **argv, char **env,
-                             tm_task_id *task_id, tm_event_t *event)
-{
-    int ret;
-    tm_node_id node_id;
-
-    /* get the tm node id for this node */
-    ret = do_tm_resolve(nodename, &node_id);
-    if (ORTE_SUCCESS != ret) {
-        return ret;
-    }
-
-    ret = tm_spawn(argc, argv, env, node_id, task_id, event);
-    if (TM_SUCCESS != ret) {
-        return ORTE_ERROR;
-    }
-
-    return ORTE_SUCCESS;
-}
-
-
 static int pls_tm_check_path(char *exe, char **env)
 {
     static int size = 256;
@@ -45,6 +45,7 @@ int orte_ras_base_copy_node(orte_ras_node_t **dest, orte_ras_node_t *src, orte_d
 
     /* copy data into it */
     if (NULL != src->node_name) (*dest)->node_name = strdup(src->node_name);
+    (*dest)->launch_id = src->launch_id;
     if (NULL != src->node_arch) (*dest)->node_arch = strdup(src->node_arch);
     (*dest)->node_cellid = src->node_cellid;
     (*dest)->node_state = src->node_state;

@@ -51,6 +51,13 @@ int orte_ras_base_pack_node(orte_buffer_t *buffer, void *src,
         return rc;
     }
 
+    /* pack the launch id */
+    if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer,
+                        (void*)(&(nodes[i]->launch_id)), 1, ORTE_INT32))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
     /* pack the arch */
     if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer,
                         (void*)(&(nodes[i]->node_arch)), 1, ORTE_STRING))) {

@@ -48,8 +48,8 @@ int orte_ras_base_print_node(char **output, char *prefix, orte_ras_node_t *src,
         asprintf(&pfx2, "%s", prefix);
     }
 
-    asprintf(&tmp, "%sData for node: cellid: %lu\tName: %s",
-             pfx2, (unsigned long)src->node_cellid, src->node_name);
+    asprintf(&tmp, "%sData for node: cellid: %lu\tName: %s\tLaunch id: %ld",
+             pfx2, (unsigned long)src->node_cellid, src->node_name, (long)src->launch_id);
 
     asprintf(&tmp2, "%s\n%s\tArch: %s\tState: %lu", tmp, pfx2,
              src->node_arch, (unsigned long)src->node_state);

@@ -61,6 +61,14 @@ int orte_ras_base_unpack_node(orte_buffer_t *buffer, void *dest,
         return rc;
     }
 
+    /* unpack the launch id */
+    n = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,
+                        (&(nodes[i]->launch_id)), &n, ORTE_INT32))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
     /* unpack the arch */
     n = 1;
     if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,

@@ -33,6 +33,7 @@
 static void orte_ras_base_node_construct(orte_ras_node_t* node)
 {
     node->node_name = NULL;
+    node->launch_id = -1;
     node->node_arch = NULL;
     node->node_cellid = 0;
     node->node_state = ORTE_NODE_STATE_UNKNOWN;
@@ -71,9 +72,23 @@ OBJ_CLASS_INSTANCE(
 
 int orte_ras_base_node_query(opal_list_t* nodes)
 {
+    char* keys[] = {
+        ORTE_NODE_NAME_KEY,
+        ORTE_NODE_LAUNCH_ID_KEY,
+        ORTE_NODE_ARCH_KEY,
+        ORTE_NODE_STATE_KEY,
+        ORTE_NODE_SLOTS_KEY,
+        ORTE_NODE_SLOTS_IN_USE_KEY,
+        ORTE_NODE_SLOTS_ALLOC_KEY,
+        ORTE_NODE_SLOTS_MAX_KEY,
+        ORTE_NODE_USERNAME_KEY,
+        ORTE_CELLID_KEY,
+        NULL
+    };
     orte_std_cntr_t i, cnt, *sptr;
     orte_node_state_t *nsptr;
     orte_cellid_t *cptr;
+    int32_t *i32;
     orte_gpr_value_t** values;
     int rc;
 

@@ -82,7 +97,7 @@ int orte_ras_base_node_query(opal_list_t* nodes)
                             ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                             ORTE_NODE_SEGMENT,
                             NULL,
-                            NULL,
+                            keys,
                             &cnt,
                             &values);
     if(ORTE_SUCCESS != rc) {

@@ -108,6 +123,14 @@ int orte_ras_base_node_query(opal_list_t* nodes)
             }
             continue;
         }
+        if(strcmp(keyval->key, ORTE_NODE_LAUNCH_ID_KEY) == 0) {
+            if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&i32, keyval->value, ORTE_INT32))) {
+                ORTE_ERROR_LOG(rc);
+                continue;
+            }
+            node->launch_id = *i32;
+            continue;
+        }
         if(strcmp(keyval->key, ORTE_NODE_ARCH_KEY) == 0) {
             /* we use the dss.copy function here instead of strdup because that function
              * automatically protects us against a NULL (or zero-length) string

@@ -193,6 +216,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
 {
     char* keys[] = {
         ORTE_NODE_NAME_KEY,
+        ORTE_NODE_LAUNCH_ID_KEY,
         ORTE_NODE_ARCH_KEY,
         ORTE_NODE_STATE_KEY,
         ORTE_NODE_SLOTS_KEY,

@@ -209,6 +233,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
     orte_std_cntr_t *sptr;
     orte_node_state_t *nsptr;
     orte_cellid_t *cptr;
+    int32_t *i32;
     int rc, alloc_key_posn=5;
 
     if(ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_str, jobid))) {

@@ -266,6 +291,14 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
             }
             continue;
         }
+        if(strcmp(keyval->key, ORTE_NODE_LAUNCH_ID_KEY) == 0) {
+            if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&i32, keyval->value, ORTE_INT32))) {
+                ORTE_ERROR_LOG(rc);
+                continue;
+            }
+            node->launch_id = *i32;
+            continue;
+        }
         if(strcmp(keyval->key, ORTE_NODE_ARCH_KEY) == 0) {
             /* we use the dss.copy function here instead of strdup because that function
              * automatically protects us against a NULL (or zero-length) string

@@ -363,11 +396,25 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
 
 orte_ras_node_t* orte_ras_base_node_lookup(orte_cellid_t cellid, const char* node_name)
 {
+    char* keys[] = {
+        ORTE_NODE_NAME_KEY,
+        ORTE_NODE_LAUNCH_ID_KEY,
+        ORTE_NODE_ARCH_KEY,
+        ORTE_NODE_STATE_KEY,
+        ORTE_NODE_SLOTS_KEY,
+        ORTE_NODE_SLOTS_IN_USE_KEY,
+        ORTE_NODE_SLOTS_ALLOC_KEY,
+        ORTE_NODE_SLOTS_MAX_KEY,
+        ORTE_NODE_USERNAME_KEY,
+        ORTE_CELLID_KEY,
+        NULL
+    };
     orte_ras_node_t* node = NULL;
     orte_std_cntr_t i, cnt, num_tokens;
     orte_std_cntr_t *sptr;
     orte_cellid_t *cptr;
     orte_node_state_t *nsptr;
+    int32_t *i32;
     orte_gpr_value_t** values;
     char** tokens = NULL;
     int rc;

@@ -383,7 +430,7 @@ orte_ras_node_t* orte_ras_base_node_lookup(orte_cellid_t cellid, const char* node_name)
                             ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                             ORTE_NODE_SEGMENT,
                             tokens,
-                            NULL,
+                            keys,
                             &cnt,
                             &values);
     if(ORTE_SUCCESS != rc) {

@@ -409,6 +456,14 @@ orte_ras_node_t* orte_ras_base_node_lookup(orte_cellid_t cellid, const char* node_name)
             }
             continue;
         }
+        if(strcmp(keyval->key, ORTE_NODE_LAUNCH_ID_KEY) == 0) {
+            if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&i32, keyval->value, ORTE_INT32))) {
+                ORTE_ERROR_LOG(rc);
+                continue;
+            }
+            node->launch_id = *i32;
+            continue;
+        }
         if(strcmp(keyval->key, ORTE_NODE_ARCH_KEY) == 0) {
             /* we use the dss.copy function here instead of strdup because that function
              * automatically protects us against a NULL (or zero-length) string
@@ -500,6 +555,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
     orte_std_cntr_t num_values, i, j;
     char *keys[] = {
         ORTE_NODE_NAME_KEY,
+        ORTE_NODE_LAUNCH_ID_KEY,
         ORTE_NODE_ARCH_KEY,
         ORTE_NODE_STATE_KEY,
         ORTE_CELLID_KEY,

@@ -510,6 +566,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
     };
     orte_data_type_t types[] = {
         ORTE_STRING,
+        ORTE_INT32,
         ORTE_STRING,
         ORTE_NODE_STATE,
         ORTE_CELLID,

@@ -535,7 +592,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
     for (i=0; i < num_values; i++) {
         if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]),
                                     ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND,
-                                    ORTE_NODE_SEGMENT, 8, 0))) {
+                                    ORTE_NODE_SEGMENT, 9, 0))) {
             ORTE_ERROR_LOG(rc);
             for (j=0; j < i; j++) {
                 OBJ_RELEASE(values[j]);

@@ -556,6 +613,12 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
             goto cleanup;
         }
 
+        ++j;
+        if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[j]), keys[j], types[j], &(node->launch_id)))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+
         ++j;
         if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[j]), keys[j], types[j], node->node_arch))) {
             ORTE_ERROR_LOG(rc);
@@ -46,6 +46,8 @@ struct orte_ras_node_t {
     opal_list_item_t super;
     /** String node name */
     char *node_name;
+    /** Launch id - needed by some systems to launch a proc on this node */
+    int32_t launch_id;
     /** String of the architecture for the node.  This is permitted to
         be NULL if it is not known. */
     char *node_arch;
@@ -30,7 +30,13 @@
 extern "C" {
 #endif
 
-ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_tm_component;
+struct orte_ras_tm_component_t {
+    orte_ras_base_component_t super;
+    char *nodefile_dir;
+};
+typedef struct orte_ras_tm_component_t orte_ras_tm_component_t;
+
+ORTE_DECLSPEC extern orte_ras_tm_component_t mca_ras_tm_component;
 ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_tm_module;
 
 #if defined(c_plusplus) || defined(__cplusplus)

@@ -39,47 +39,56 @@ static int ras_tm_open(void);
 static orte_ras_base_module_t *ras_tm_init(int*);
 
 
-orte_ras_base_component_t mca_ras_tm_component = {
-    /* First, the mca_base_component_t struct containing meta
-       information about the component itself */
-
-    {
-        /* Indicate that we are a ras v1.3.0 component (which also
-           implies a specific MCA version) */
-
-        ORTE_RAS_BASE_VERSION_1_3_0,
-
-        /* Component name and version */
-
-        "tm",
-        ORTE_MAJOR_VERSION,
-        ORTE_MINOR_VERSION,
-        ORTE_RELEASE_VERSION,
-
-        /* Component open and close functions */
-
-        ras_tm_open,
-        NULL
-    },
-
-    /* Next the MCA v1.0.0 component meta data */
-    {
-        /* Whether the component is checkpointable or not */
-        false
-    },
-
-    ras_tm_init
-};
+orte_ras_tm_component_t mca_ras_tm_component = {
+    {
+        /* First, the mca_base_component_t struct containing meta
+           information about the component itself */
+        {
+            /* Indicate that we are a ras v1.3.0 component (which also
+               implies a specific MCA version) */
+
+            ORTE_RAS_BASE_VERSION_1_3_0,
+
+            /* Component name and version */
+
+            "tm",
+            ORTE_MAJOR_VERSION,
+            ORTE_MINOR_VERSION,
+            ORTE_RELEASE_VERSION,
+
+            /* Component open and close functions */
+
+            ras_tm_open,
+            NULL
+        },
+
+        /* Next the MCA v1.0.0 component meta data */
+        {
+            /* Whether the component is checkpointable or not */
+            false
+        },
+
+        ras_tm_init
+    }
+};
 
 
 static int ras_tm_open(void)
 {
+    mca_base_component_t *c = &mca_ras_tm_component.super.ras_version;
+
     param_priority =
-        mca_base_param_reg_int(&mca_ras_tm_component.ras_version,
+        mca_base_param_reg_int(c,
                                "priority",
                                "Priority of the tm ras component",
                                false, false, 100, NULL);
 
+    mca_base_param_reg_string(c, "nodefile_dir",
+                              "The directory where the PBS nodefile can be found",
+                              false, false, "/var/torque/aux",
+                              &mca_ras_tm_component.nodefile_dir);
+
     return ORTE_SUCCESS;
 }
@@ -27,10 +27,9 @@
 #include <sys/time.h>
 #endif  /* HAVE_SYS_TIME_H */
 
-#include "tm.h"
-
 #include "opal/util/argv.h"
 #include "opal/util/output.h"
+#include "opal/util/os_path.h"
 
 #include "orte/dss/dss.h"
 #include "orte/mca/rmgr/rmgr.h"

@@ -47,9 +46,10 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes);
 static int deallocate(orte_jobid_t jobid);
 static int finalize(void);
 
-static int discover(opal_list_t* nodelist);
-static int get_tm_hostname(tm_node_id node, char **hostname, char **arch);
+static int discover(opal_list_t* nodelist, char *pbs_jobid);
+static char *tm_getline(FILE *fp);
+
+#define TM_FILE_MAX_LINE_LENGTH 512
 
 /*
  * Global variable

@@ -70,28 +70,23 @@ orte_ras_base_module_t orte_ras_tm_module = {
  * requested number of nodes/process slots to the job.
  *
  */
+#include "orte/mca/gpr/gpr.h"
 static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
 {
     int ret;
     opal_list_t nodes;
     opal_list_item_t* item;
-    struct tm_roots root;
+    char *pbs_jobid;
 
-    /* Open up our connection to tm */
-
-    ret = tm_init(NULL, &root);
-    if (TM_SUCCESS != ret) {
-        opal_output(orte_ras_base.ras_output,
-                    "ras:tm:allocate: tm_init failed!");
-        return ORTE_ERR_RESOURCE_BUSY;
+    /* get our PBS jobid from the environment */
+    if (NULL == (pbs_jobid = getenv("PBS_JOBID"))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return ORTE_ERR_NOT_FOUND;
     }
 
 
     OBJ_CONSTRUCT(&nodes, opal_list_t);
-    if (ORTE_SUCCESS != (ret = discover(&nodes))) {
+    if (ORTE_SUCCESS != (ret = discover(&nodes, pbs_jobid))) {
         opal_output(orte_ras_base.ras_output,
                     "ras:tm:allocate: discover failed!");
-        tm_finalize();
         return ret;
     }
     ret = orte_ras_base_allocate_nodes(jobid, &nodes);

@@ -110,7 +105,6 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
         opal_output(orte_ras_base.ras_output,
                     "ras:tm:allocate: failure (base_allocate_nodes=%d)", ret);
     }
-    tm_finalize();
     return ret;
 }
 

@@ -145,14 +139,15 @@
  * - check for additional nodes that have already been allocated
  */
 
-static int discover(opal_list_t* nodelist)
+static int discover(opal_list_t* nodelist, char *pbs_jobid)
 {
-    int i, ret, num_node_ids;
+    int ret;
+    int32_t nodeid;
     orte_ras_node_t *node;
     opal_list_item_t* item;
     opal_list_t new_nodes;
-    tm_node_id *tm_node_ids;
-    char *hostname, *arch;
+    FILE *fp;
+    char *hostname, *filename;
     struct timeval start, stop;
 
     /* check for timing request - get start time if so */
@@ -170,30 +165,24 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
        slightly inefficient, but no big deal); just mentioned for
        completeness... */
 
-    ret = tm_nodeinfo(&tm_node_ids, &num_node_ids);
-    if (ret != TM_SUCCESS) {
-        opal_output(orte_ras_base.ras_output,
-                    "ras:tm:allocate:discover: tm_nodeinfo failed");
-        return ORTE_ERR_OUT_OF_RESOURCE;
+    /* setup the full path to the PBS file */
+    filename = opal_os_path(false, mca_ras_tm_component.nodefile_dir,
+                            pbs_jobid, NULL);
+    fp = fopen(filename, "r");
+    if (NULL == fp) {
+        ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
+        free(filename);
+        return ORTE_ERR_FILE_OPEN_FAILURE;
     }
 
-    /* check for timing request - get stop time and report elapsed time if so */
-    if (orte_ras_base.timing) {
-        gettimeofday(&stop, NULL);
-        opal_output(0, "ras_tm: time to do nodeinfo is %ld usec",
-                    (long int)((stop.tv_sec - start.tv_sec)*1000000 +
-                               (stop.tv_usec - start.tv_usec)));
-        gettimeofday(&start, NULL);
-    }
-
     /* Iterate through all the nodes and make an entry for each.  TM
        node ID's will never be duplicated, but they may end up
        resolving to the same hostname (i.e., vcpu's on a single
        host). */
 
     OBJ_CONSTRUCT(&new_nodes, opal_list_t);
-    for (i = 0; i < num_node_ids; ++i) {
-        get_tm_hostname(tm_node_ids[i], &hostname, &arch);
+    nodeid=0;
+    while (NULL != (hostname = tm_getline(fp))) {
         opal_output(orte_ras_base.ras_output,
                     "ras:tm:allocate:discover: got hostname %s", hostname);
 

@@ -223,8 +212,7 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
                         "ras:tm:allocate:discover: not found -- added to list");
             node = OBJ_NEW(orte_ras_node_t);
             node->node_name = hostname;
-            node->node_arch = arch;
             node->node_state = ORTE_NODE_STATE_UP;
+            node->launch_id = nodeid;
             node->node_cellid = 0;
             node->node_slots_inuse = 0;
             node->node_slots_max = 0;

@@ -232,23 +220,15 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
             opal_list_append(&new_nodes, &node->super);
         } else {
 
-            /* Yes, so we need to free the hostname that came back
-               from get_tm_hostname() */
-
+            /* Yes, so we need to free the hostname that came back */
             free(hostname);
         }
 
+        /* up the nodeid */
+        nodeid++;
     }
 
-    /* check for timing request - get stop time and report elapsed time if so */
-    if (orte_ras_base.timing) {
-        gettimeofday(&stop, NULL);
-        opal_output(0, "ras_tm: time to get hostnames is %ld usec",
-                    (long int)((stop.tv_sec - start.tv_sec)*1000000 +
-                               (stop.tv_usec - start.tv_usec)));
-        gettimeofday(&start, NULL);
-    }
-
-    /* Add these nodes to the registry, and return all the values */
+    /* Add these nodes to the registry */
 
     opal_output(orte_ras_base.ras_output,
                 "ras:tm:allocate:discover: done -- adding to registry");

@@ -271,55 +251,31 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
                     "ras:tm:allocate:discover: failed (rc=%d)", ret);
     }
     OBJ_DESTRUCT(&new_nodes);
 
     /* check for timing request - get stop time and report elapsed time if so */
     if (orte_ras_base.timing) {
         gettimeofday(&stop, NULL);
         opal_output(0, "ras_tm: time to allocate is %ld usec",
                     (long int)((stop.tv_sec - start.tv_sec)*1000000 +
                                (stop.tv_usec - start.tv_usec)));
         gettimeofday(&start, NULL);
     }
 
     return ret;
 }
 
 
-/*
- * For a given TM node ID, get the string hostname corresponding to
- * it.
- */
-static int get_tm_hostname(tm_node_id node, char **hostname, char **arch)
+static char *tm_getline(FILE *fp)
 {
-    int ret, local_errno;
-    tm_event_t event;
-    char buffer[256];
-    char **argv;
-
-    /* Get the info string corresponding to this TM node ID */
-
-    ret = tm_rescinfo(node, buffer, sizeof(buffer) - 1, &event);
-    if (TM_SUCCESS != ret) {
-        opal_output(orte_ras_base.ras_output,
-                    "ras:tm:hostname: tm_rescinfo failed");
-        return ORTE_ERROR;
+    char *ret, *buff;
+    char input[TM_FILE_MAX_LINE_LENGTH];
+
+    ret = fgets(input, TM_FILE_MAX_LINE_LENGTH, fp);
+    if (NULL != ret) {
+        input[strlen(input)-1] = '\0';  /* remove newline */
+        buff = strdup(input);
+        return buff;
     }
 
-    /* Now wait for that event to happen */
-
-    ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
-    if (TM_SUCCESS != ret) {
-        return ORTE_ERROR;
-    }
-
-    /* According to the TM man page, we get back a space-separated
-       string array.  The hostname is the second item.  Use a cheap
-       trick to get it. */
-
-    opal_output(orte_ras_base.ras_output,
-                "ras:tm:hostname: got back %s", buffer);
-    buffer[sizeof(buffer) - 1] = '\0';
-    argv = opal_argv_split(buffer, ' ');
-    if (NULL == argv) {
-        return ORTE_ERROR;
-    }
-    *hostname = strdup(argv[1]);
-    *arch = strdup(buffer);
-    opal_argv_free(argv);
-
-    /* All done */
-
-    opal_output(orte_ras_base.ras_output,
-                "ras:tm:hostname: got hostname %s", *hostname);
-    return ORTE_SUCCESS;
+    return NULL;
 }
@@ -149,6 +149,8 @@ int orte_rmaps_base_copy_mapped_node(orte_mapped_node_t **dest, orte_mapped_node
         (*dest)->nodename = strdup(src->nodename);
     }
 
+    (*dest)->launch_id = src->launch_id;
+
     if (NULL != src->username) {
         (*dest)->username = strdup(src->username);
     }

@@ -179,6 +179,12 @@ int orte_rmaps_base_pack_mapped_node(orte_buffer_t *buffer, void *src,
         return rc;
     }
 
+    /* pack the launch id */
+    if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(nodes[i]->launch_id), 1, ORTE_INT32))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
     /* pack the username */
     if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(nodes[i]->username), 1, ORTE_STRING))) {
         ORTE_ERROR_LOG(rc);

@@ -159,8 +159,8 @@ int orte_rmaps_base_print_mapped_node(char **output, char *prefix, orte_mapped_n
         asprintf(&pfx2, "%s", prefix);
     }
 
-    asprintf(&tmp, "%sMapped node:\n%s\tCell: %ld\tNodename: %s\tUsername: %s\n%s\tDaemon name:", pfx2, pfx2,
-             (long)src->cell, (NULL == src->nodename ? "NULL" : src->nodename),
+    asprintf(&tmp, "%sMapped node:\n%s\tCell: %ld\tNodename: %s\tLaunch id: %ld\tUsername: %s\n%s\tDaemon name:", pfx2, pfx2,
+             (long)src->cell, (NULL == src->nodename ? "NULL" : src->nodename), (long)src->launch_id,
              (NULL == src->username ? "NULL" : src->username), pfx2);
 
     asprintf(&pfx, "%s\t", pfx2);

@@ -222,6 +222,13 @@ int orte_rmaps_base_unpack_mapped_node(orte_buffer_t *buffer, void *dest,
         return rc;
     }
 
+    /* unpack the launch id */
+    n = 1;
+    if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &(nodes[i]->launch_id), &n, ORTE_INT32))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
     /* unpack the username */
     n = 1;
     if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,
@@ -51,6 +51,7 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
     bool *bptr, oversub=false;
     pid_t *pidptr;
     orte_process_name_t *pptr;
+    int32_t *i32, launch_id;
     char *segment;
     char *node_name=NULL;
     char *username=NULL;

@@ -65,6 +66,7 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
         ORTE_PROC_LOCAL_PID_KEY,
         ORTE_CELLID_KEY,
         ORTE_NODE_NAME_KEY,
+        ORTE_NODE_LAUNCH_ID_KEY,
         ORTE_NODE_USERNAME_KEY,
         ORTE_NODE_OVERSUBSCRIBED_KEY,
         ORTE_JOB_VPID_START_KEY,

@@ -124,6 +126,7 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
     for(v=0; v<num_values; v++) {
         value = values[v];
         node_name = NULL;
+        launch_id = -1;
 
         if (0 == strcmp(value->tokens[0], ORTE_JOB_GLOBALS)) {
             /* this came from the job_globals container, so look for the related values */

@@ -183,6 +186,14 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
                 proc->name = *pptr;
                 continue;
             }
+            if(strcmp(keyval->key, ORTE_NODE_LAUNCH_ID_KEY) == 0) {
+                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&i32, keyval->value, ORTE_INT32))) {
+                    ORTE_ERROR_LOG(rc);
+                    goto cleanup;
+                }
+                launch_id = *i32;
+                continue;
+            }
             if(strcmp(keyval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) {
                 if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) {
                     ORTE_ERROR_LOG(rc);

@@ -233,7 +244,7 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
             }
         }
         /* store this process in the map */
-        if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(mapping, cell, node_name, username, oversub, proc))) {
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(mapping, cell, node_name, launch_id, username, oversub, proc))) {
             ORTE_ERROR_LOG(rc);
             goto cleanup;
         }
@@ -382,7 +393,7 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
     for(i=0; i<num_procs; i++) {
         if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]),
                                     ORTE_GPR_OVERWRITE|ORTE_GPR_TOKENS_AND,
-                                    segment, 8, 0))) {
+                                    segment, 9, 0))) {
             ORTE_ERROR_LOG(rc);
             for(j=0; j<i; j++) {
                 OBJ_RELEASE(values[j]);

@@ -427,22 +438,27 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
         goto cleanup;
     }
 
-    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[4]), ORTE_NODE_USERNAME_KEY, ORTE_STRING, node->username))) {
+    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[4]), ORTE_NODE_LAUNCH_ID_KEY, ORTE_INT32, &(node->launch_id)))) {
         ORTE_ERROR_LOG(rc);
         goto cleanup;
     }
 
-    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[5]), ORTE_NODE_OVERSUBSCRIBED_KEY, ORTE_BOOL, &(node->oversubscribed)))) {
+    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[5]), ORTE_NODE_USERNAME_KEY, ORTE_STRING, node->username))) {
         ORTE_ERROR_LOG(rc);
         goto cleanup;
     }
 
-    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[6]), ORTE_PROC_APP_CONTEXT_KEY, ORTE_STD_CNTR, &(proc->app_idx)))) {
+    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[6]), ORTE_NODE_OVERSUBSCRIBED_KEY, ORTE_BOOL, &(node->oversubscribed)))) {
         ORTE_ERROR_LOG(rc);
         goto cleanup;
     }
 
+    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[7]), ORTE_PROC_APP_CONTEXT_KEY, ORTE_STD_CNTR, &(proc->app_idx)))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
-    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[7]), ORTE_PROC_STATE_KEY, ORTE_PROC_STATE, &proc_state))) {
+    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[8]), ORTE_PROC_STATE_KEY, ORTE_PROC_STATE, &proc_state))) {
         ORTE_ERROR_LOG(rc);
         goto cleanup;
     }
@@ -262,7 +262,7 @@ int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_list,
 }
 
 
-int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_cellid_t cell, char *nodename,
+int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_cellid_t cell, char *nodename, int32_t launch_id,
                                     char *username, bool oversubscribed, orte_mapped_proc_t *proc)
 {
     opal_list_item_t *item;

@@ -294,6 +294,7 @@ int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_cellid_t cell, cha
         if (NULL != username) {
             node->username = strdup(username);
         }
+        node->launch_id = launch_id;
         node->oversubscribed = oversubscribed;
         opal_list_append(&map->nodes, &node->super);
 

@@ -352,6 +353,7 @@ int orte_rmaps_base_claim_slot(orte_job_map_t *map,
     /* add the proc to the map */
     if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(map, current_node->node_cellid,
                                                               current_node->node_name,
+                                                              current_node->launch_id,
                                                               current_node->node_username,
                                                               oversub, proc))) {
         ORTE_ERROR_LOG(rc);

@@ -63,6 +63,7 @@ OBJ_CLASS_INSTANCE(orte_mapped_proc_t,
 static void orte_rmaps_mapped_node_construct(orte_mapped_node_t* node)
 {
     node->nodename = NULL;
+    node->launch_id = -1;
     node->username = NULL;
     node->daemon = NULL;
     node->oversubscribed = false;

@@ -158,7 +158,7 @@ void orte_rmaps_base_recv(int status, orte_process_name_t* sender,
  * procs. If not, then add new node entry and put this proc
  * on its list.
  */
-int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_cellid_t cell, char *nodename,
+int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_cellid_t cell, char *nodename, int32_t launch_id,
                                     char *username, bool oversubscribed, orte_mapped_proc_t *proc);
 
 ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list, orte_jobid_t jobid,
@@ -66,6 +66,7 @@ struct orte_mapped_node_t {
     opal_list_item_t super;
     orte_cellid_t cell;     /* cell where this node is located */
     char *nodename;         /* name of node */
+    int32_t launch_id;      /* launch id of node - needed by some systems */
     char *username;
     orte_process_name_t *daemon;  /* name of the daemon on this node
                                    * NULL => daemon not assigned yet
@@ -62,7 +62,10 @@
 #define ORTE_JOBGRP_KEY "orte-jobgrp"
 #define ORTE_JOBID_KEY "orte-jobid"
 #define ORTE_VPID_KEY "orte-vpid"
+
+/* NODE specific keys */
 #define ORTE_NODE_NAME_KEY "orte-node-name"
+#define ORTE_NODE_LAUNCH_ID_KEY "orte-node-launch-id"
 #define ORTE_NODE_ARCH_KEY "orte-node-arch"
 #define ORTE_NODE_STATE_KEY "orte-node-state"
 #define ORTE_NODE_SLOTS_KEY "orte-node-slots"

@@ -73,6 +76,8 @@
 #define ORTE_NODE_BOOTPROXY_KEY "orte-node-bootproxy"
 #define ORTE_NODE_USERNAME_KEY "orte-node-username"
 #define ORTE_NODE_OVERSUBSCRIBED_KEY "orte-node-oversubscribed"
+
+/* JOB specific keys */
 #define ORTE_JOB_APP_CONTEXT_KEY "orte-job-app-context"
 #define ORTE_JOB_SLOTS_KEY "orte-job-slots"   /**< number of procs in job */
 #define ORTE_JOB_VPID_START_KEY "orte-job-vpid-start"

@@ -82,6 +87,8 @@
 #define ORTE_JOB_IOF_KEY "orte-job-iof"
 #define ORTE_JOB_STATE_KEY "orte-job-state"
 #define ORTE_JOB_MAPPING_MODE_KEY "orte-job-mapping-mode"
+
+/* PROCESS specific keys */
 #define ORTE_PROC_NAME_KEY "orte-proc-name"
 #define ORTE_PROC_RANK_KEY "orte-proc-rank"
 #define ORTE_PROC_PID_KEY "orte-proc-pid"